Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple structures, 2S)

Test 1: uops

Code:

  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.016

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.016

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
660052939122012320171000810456628790020168056016401620004000200010000475544229842910429248310600020004000200080002904429115116100110001000020044420050022002424221274992926870308054420091308438138474528409163901315814793200040002928129213292942929829271
66004293292191191120100010104580288370001678360044004200040002000100004749832300729004291603106000200040002000800029025290441161001100010000200340200401220026462212983920868663045940199813133380812414528424163361333214680200040002917229284292762927929227
660042930421911912170000810457328733000168456016402020004000200010000475067231012910729229310600020004000200080002906229095116100110001000020022620040252000040201290792996849305964320073311738129474128423162331323714629200040002926429179291912927129265
6600429192219121111800008004635287922011683560044016200040002000100004743272300529007292583106000200040002000800029089291261161001100010000200244200401220024242112921904168413054936201403088381012373828404160731323414803200040002928729255293322927029284
6600429270218117101610002104726288260201689160164004200040002000100004756282304529150292983106000200040002000800029071291281161001100010000200446200400520024442212955926568383059740200293104381510393828378164611345514975200040002920629279292602922729189
6600429223218117111610002104560287210001681660044016200040002000100004767422302829133292203106000200040002000800029138291571161001100010000200246200400220024442212954922068363089641199873088381713444228418162961314814744200040002919829329292352926529172
660042928422012011191000800457728773012168206012401620004000200010000475005229732915729311310600020004000200080002911329106116100110001000020053620030042002444201292691946827306364020047311938128444428309162051316314890200040002919829252293182926629182
6600429240219113111310008004659288420011680560204016200040002000100174759232297729145293203106000200040002000800029120291291161001100010000200326200201420020402013111905368443085739201363082381315414128420165251327814989200040002923629280292462935229259
660042928522011300151000500458228811200168836016401220004000200010000476628229722913429232310600020004000200080002918929190116100110001000020023420030022002426221289091636870305783819993308538138444828368163331324814815200040002927029188292402916829310
660042917421912211161010800458028789002168246004401620004000200010000475643230122909529305310600020004000200080002917629221116100110001000020042020030122002040201292991736820303154420054308638146424328397162531333714860200040002921529192292692931329253

Test 2: throughput

Count: 8

Code:

  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4802058007059900110102800492012025480184100320076160000100320000160000500800042960000018004508006080064034248010020016000032000020016000064000080041800591180201100991001008000080000010016000035016003600361600396000151092171180057001001600003200001008006580065800658004280065
48020480041599100000028002621212025480184100320000160000100320000160000500800853697596418002208006480064034248010020016000032000020016000064000080064800411180201100991001008000080000010016000035016003200361600006132400510911711800610141001600003200001008006580065800658004280065
480204800646000000421008004921212025480184100320084160000100320000160000500804079960000018002208006480041034648010020016000032000020016000064000080064800601180201100991001008000080000010016000035016003200361600326132005109117128004100701600003200001008004280065800658006580065
48020480041600000042002800452120025480184100320084160000100320000160000500800377960000018004108006480060034648010020016000032000020016000064000080064800411180201100991001008000080000010016000000160000000160032600005109117118006100001600003200001008006580042800658006580065
48020480064600000001028004900120254801841003200841600001003200001600005008003771088000008002208004180041034248010020016000032000020016000064000080060800411180201100991001008000080000010016000035016003200016003201360051091171180061114001600003200001008006580042800428004280065
480204800415990001000280049212120254801841003200001600001003200001600005008003759600000080045080064800640346480100200160000320000200160000640000800648006011802011009910010080000800000100160000001600360036160032613200510911711800611101001600003200001008006580061800658006580061
48020480064600001000008004901212025480100100320084160000100320000160000500800377108799961800220800658015203464801002001600003200002001600006400008004180041118020110099100100800008000001001600003501600360036160000613240051091171180061014001600003200001008004380065800658006580065
480204800646000010420028002600002548010010032008416000010032000016000050080379896000001800410800648006403464801002001600003200002001600006400008006080041118020110099100100800008000001001600003501600361036160000613640051091172280038114001600003200001008004280065801478004580042
480204800645990010420008004920002548018410032000016000010032000016000050080118710880752180045080064800640323480100200160000320000200160000640000800608004111802011009910010080000800000100160000001600360036160039613640051091171180038010001600003200001008004280065800658006580061
480204800645991000420008002601200254801001003200001600001003200001600005008011876722740080045080064800410323480100200160000320000200160000640000800608006011802011009910010080000800000100160000001600360001600396032400510911711800610141001600003200001008006580065800658006580065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4800258006060001104700080045012025480010103200761600001032000016000050800374384000080022800608006003234800102016000032000020160000640000800608006011800211091010800008000001016000035160000013716003261323500501931733800571100160000320000108006180061800618006180042
48002580060599000038000800452120254800861032000016000010320000160000508004149600000800418004180060034248001020160000320000201600006400008006080060118002110910108000080000010160000351600000991600320100005019317328005711010160000320000108006180061800618006180061
48002480060599000001008004520122548001010320076160000103200001600005080010396000008004180041800600342480010201600003200002016000064000080060800601180021109101080000800000101600003516003201711600000100005019317238005712810160000320000108006180061800618006180061
4800248006059900003810080045212122548008610320076160000103200001600005080048496000008004180060800410342480010201600003200002016000064000080060800601180021109101080000800001101600003516003201216003261035005019317468017101010160000320000108006180042800618006180061
4800248007259900003810080045000254800101032007616000010320000160000508000009600000800228006080060034248001020160000320000201600006400008004180060118002110910108000080000010160000351600320269160032613235005019217338005711410160000320000108006180061800618006180042
4800248006059900003810080026012122548008610320076160000103200001600005080037796000008004180060800600342480010201600003200002016000064000080061800441180021109101080000800000101600000160000028116003200320005019317338005701010160000320000108004280042800618006180042
4800248006060000003800080045212122548008610320076160000103200001600005080043196000008004180060800600323480010201600003200002016000064000080060800411180021109101080000800000101600000160032052160032603235005019217338003801010160000320000108006180042800618006180061
480024800696000000380008004521202548008610320076160000103200001600005080048296000008004180060800600342480010201600003200002016000064000080060800601180021109101080000800000101600000160032029516003201323500501931733800571140160000320000108004280061800618004280061
480024800695990000380008004501212254800101032007616000010320000160000508003779600000800858004180060034248001020160000320000201600006400008004180060118002110910108000080000010160000016003205116000061323500501931733800571010160000320000108006180042800618004280061
4800248006760000003810080045012122548009410320076160000103200001600005080046438400008004180060800420323480010201600003200002016000064000080060800601180021109101080000800000101600003516000002831600326032350050193173380057100160000320000108004280061800428006180061