Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (single structure, B)

Test 1: uops

Code:

  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.004

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.004

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | df | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
650052852021211012000006010496328160010160535004400410004000100050004752201022984283002830631050001000400010008000283962842011610011000100001000210001010010120137489759704734370481920933243812134140227957147401294313290100040002833928357284042839028309
650042838821201001000004000502028110000161655004400410004000100050004759950022954284002837831050001000400010008000284302834011610011000100001000010000010000020136329866713533600421932232133811154238227926146901280613409100040002825228377283072847128422
650042846421311001000002000503228299000161255004400410004000100050004743200022968283932835531050001000400010008000281532827811610011000100001000210000010012030135919982713333101441932233233817124242227973145121261813635100040002841528368283112832528340
650042841021200001000000000506728150000160795004400410004000100050004754800022910282852838631050001000400010008000284212826111610011000100001000210000010012100136579894716532961421930532573811144545228006147991275813525100040002844028434283522838928473
6500428456213010010100040005011281101001629550044004100040001000500047587400229542818228449310500010004000100080002827528335116100110001000110002100001100101001367110017708032960461929932073820164444227930145511271213842100040002831128309282942841728462
6500428488213010010000040004828280241001607450044004100040001000500047575300229902822328367310500010004000100080002835928371116100110001000010002100000100021301375210024723033490441916532393814154435227832151021268313632100040002835328298285102841428317
6500428349212010010000040005018281360001606350044004100040001000500047558000229892823128349310500010004000100080002827428425116100110001000010002100000100021301371310020711133350471924632603818254337227928148851301613546100040002827828264283302838328402
650042847921301001000007010499628204100160615004400710004000100050014755640022981283312840931050001000400010008000282712832511610011000100001000310000010011130137129974717633160391924732273817144545228064147611261713352100040002837728384284462833028285
650042832021301001000004000486328050001161905000400010004004100050004743000022958282922839431050001000400010008000285112829411610011000100001000010010010002100136359889714533220421932733143815203842227994147481275613570100040002844128617282752837728235
6500428403214210010000040004958280260001595950004004100040001000500047442000229382844828447310500010004000100080002855128368116100110001000010002100000100010201367110067709133681461921432603818123842227848148011276313600100040002835528378283772843328399

Test 2: throughput

Count: 8

Code:

  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  ld4 { v0.b, v1.b, v2.b, v3.b }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40020516006111991110000310001160031166799612540011610032001680000100320000800005004000032271406801600420160046160046799553800434001002008000032000020080000640000160046160061118020110099100100800008000011008000882380028013088000061262472005110117111600430099280000320000100160062160062160047160062160062
4002041600461198110000032000116003100679946254001081003200088000010032000080000500400033227140680160042016006116006179955380028400100200800003200002008000064000016006116004611802011009910010080000800000100800088248000701268001801252461005110117111600431009280000320000100160062160062160062160062160062
400204160061119910100003100001600461607996125400116100320016800001003200008000050040000322716884016004201600611600617995538002840010020080000320000200800006400001600611600611180201100991001008000080000010080007724800270026800410026070005110117111600581099280000320000100160062160062160062160062160062
40020416004611991000000600001600311667994625400116100320016800001003200008000050040004122716884016004201600611600617995538004340010020080000320000200800006400001600611600611180201100991001008000080000010080007724800270026800206172460005110117111600580099080000320000100160047160062160047160062160062
40020416006111991100000701001600461667996125400116100320016800001003200008000050040000322716884016002701600611600617995538004340010020080000320000200800006400001600611600611180201100991001008000080000010080009624800270026800206172472005110117111600580099280000320000100160062160047160062160062160062
400204160061119910110003200011600461067996125400108100320016800001003200008000050040003322714068016002701600611600617995538004340010020080000320000200800006400001600461600611180201100991001008000080000110080009808002600258001861252460005110117111600430099280000320000100160062160062160062160047160062
4002041600611199100000031000016004610679946254001081003200168000010032000080000500400039227168840160027016006116009579955380043400100200800003200002008000064000016004616006111802011009910010080000800000100800067248002601268000061262470005110117111600580099280000320000100160062160062160111160062160047
400204160061119911000003201001600310667994625400116100320016800001003200008000050040003322716884116004201600611600617994038002840010020080000320000200800006400001600611600461180201100991001008000080000110080008724800260068000061262470005110117111600580009280000320000100160062160047160062160062160062
4002041600611199110000060101160046160799612540011610032001680000100320000800005004000272271688411600420160061160046799553800284001002008000032000020080000640000160046160061118020110099100100800008000011008000662480006017800000072470005110117111600580099280000320000100160062160047160047160062160062
4002041600611199101110067000116004616679961254001161003200168000010032000080000500400038227168841160027016006116006179955380023400100200800003200002008000064000016006116006111802011009910010080000800000100800077248002500268001861262470005110117111600580099280000320000100160062160047160062160062160062

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40002516005611990010003710160025160799592540001810320008800001032000080000504000182271604416003716005916005979934038004140001020800003200002080000640000160040160065118002110910108000080000110800000188003100298002461141800150204175516003700028000032000010160041160057160060160041160057
4000241600401199000001350016004410079940254000181032000880000103200008000050400000227160441600401600591600567995903800384000102080000320000208000064000016005916005611800211091010800008000001080000018800170008000001140000502051754160056001028000032000010160057160057160060160060160060
4000241600561199000000240016004410079959254000181032000080045103200008000050400042227160441600401600591600597993403800384000102080000320000208000064000016005916005611800211091010800008000001080007628800180017800180114220005020517541600530101028000032000010160057160041160060160041160060
40002416005911990000006101600441007995925400018103200088000010320000800005040000022716044160040160040160040799530380022400010208000032000020800006400001600591600561180021109101080000800000108000001880030013480023011800005020517561600560101028000032000010160041160060160041160060160060
40002416005911990000002410160044006799592540001810320008800001032000080000504000002271268816004016005916004079953038003840001020800003200002080000640000160056160040118002110910108000080000010800000080013202180018611322000502051745160037101328000032000010160066160066160066160047160066
4000241600401199000000000160025106799592540001010320008800001032000080000504000092271268816003716005916005979934038004740001020800003200002080000640000160059160056118002110910108000080000010800000080014001380014011722000502051754160056113028000032000010160066160066160066160066160047
40002416004611990111002400160025160799402540001810320000800001032000080000504000002271656016004016004016005979953038002240001020800003200002080000640000160059160056118002110910108000080000010800000188000000080000000220005020617561600561131008000032000010160041160060160060160057160041
4000241600561198000000751016004116079959254000101032000880000103200008000050400024227160441600401600591600597995303800224000102080000320000208000064000016004016005611800211091010800008000001080000018800170008001401000005020517551600530131028000032000010160060160060160041160060160060
4000241600591199000000000160044106799404440001010320008800001032000080000504000572271748016002116004016005979934038002840001020800003200002080000640360160059160111118002110910108000080000010800087278000901080018600220005020517551600560131008000032000010160060160060160060160041160060
400024160040119800001023101600441067995925400010103200088000010320000800005040001822716560160037160059160040799530380041400010208000032000020800006400001600591600561180021109101080000800000108000001880017001880000610220005040517561600531131328000032000010160057160060160057160041160060