Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (single structure, S)

Test 1: uops

Code:

  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.004

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.004

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6500529356220210121110200004623288830001700650044004100040001000500047513000230622911329305310500010004000100080002917229354116100110001000100041100020100020201284891786943302365020159319038185383628342164831397215076100040002933329150295372921929339
6500429255219111010000263520004531287860001703350044004100040001000500047558300229402911129223310500010004000100080002919429118116100110001000100003100000100020201317791846920306974220155311838176373928297163541401514928100040002931829205292252927129262
650042929121919001800014010044892883600016930500440041000400010005007475481002296229090293033105000100040001000800029159292091161001100010001000021000001000212012823919668453060436203773135382012394028391163781395615016100040002922929293292852919329315
6500429237218140021000000004582288110101690550044004100040001000500047548000229022906229238310500010004000100080002906129105116100110001000100002100000100020301280394976843319844020211312538159373828353161991387814811100040002925929335293122933129319
6500429301219130115000200004737287170011694650044004100040001000500047538110228892903829253310500010004000100080002911829150116100110001000100003100001100020201312291576858304594520195308338168363728304159941394314774100040002924329204291852922429254
6500429248220220010000300004504287880111695850084004100040001000500047585200229162908429285310500010004000100080002910329017116100110001000100002100001100220201296090746898305183320200308938157383428467162091387314923100040002926629174292192924429275
650042925221911011800020000452828837010170035004400410004000100050004763800022874290232921331050001000400010008000292062906211610011000100010000210000110001030128289100680930291035201043183381611383728368163491389015024100040002923829207292632930429232
6500429265220170110010300004596288730011698950084004100040041000500247524105228702903729253329500010004000100080002914029137116100110001000100003100101100022301288292006858309964220122308738219383828289162671381715041100040002930729200292582925129253
65004292072201701160020000046162876301016969500840081000400010005057475261002288629044292483105000100040001000800029125291371161001100010001000021000021000202213227925668333064636202143124381410343528323161151394715056100040002923429269294042928629266
650042933321913001311062010045532881301016901500440041000400010005000475960002288229041293143105000100040001000800029110292091161001100010001000021000021001203012935941968583072740201703192382010393828377163581405514775100040002925629376292472924129239

Test 2: throughput

Count: 8

Code:

  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  ld4 { v0.s, v1.s, v2.s, v3.s }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400205160061119811100000320000116003110679961254001161003200168000010032000080000500400008227154720160034016005316005379947038003540010020080000320000200800006400001600531600401180202100991001008000080000010080000015800110008001161015000051100117111600500066280000320000100160054160054160041160041160054
400204160053119900000000001000160025106799532540010810032000880000100320000800005004000072271547201600420160061160061799550380043400100200800003200002008000064000016006116004611802011009910010080000800000100800098080026104180018612624720051100117111600430009280000320000100160062160062160062160062160062
40020416006111991010000090010011600461067994625400116100320016800001003200008000050040003922716884016002701600611600467995503800434001002008000032000020080000640000160046160061118020110099100100800008000001008000782480008002680020612624700051100117111600581090280000320000100160062160062160047160062160062
40020416006111981000000060000116004610079946254001081003200168000010032000080000500400033227140680160027016006116006179955038004340010020080000320000200800006400001600611600611180201100991001008000080000010080008808002600298001801724610051100117111600580099080000320000100160047160062160062160062160047
4002041600611199111100007000001600311607994625400116100320016800001003200008000050040003022716884016004201601151600617995503800434001002008000032000020080000640000160061160061118020110099100100800008000001008000782380028007800200170600051100117111600580099280000320000100160062160062160062160062160047
400204160046119910100000320000116004616079961254001161003200088000010032000080000500400000227168840160042016006116006179955038004340010020080000320000200800006400001600461600611180201100991001008000080000010080008708002500258001961623710051100117111600580090280000320000100160062160062160062160062160062
400204160061119910100000701001160046166799612540011610032000880000100320000800005004000002271688401600420160061160046799550380043400100200800003200002008000064000016006116004611802011009910010080000800000100800087080029002580019612624700051100117111600580009280000320000100160062160062160062160062160062
40020416006111991101000032000011600460667994625400108100320016800001003200008000050040003322716884016004201600611600617994003800284001002008000032000020080000640000160061160061118020110099100100800008000001008000782380028002680019602523600051100117111600581099280000320000100160062160062160062160047160062
4002041600611199111010003201001160046160799612540011610032001680000100320000800005004000392271406801600420160046160061799550380043400100200800003200002008000064000016006116004611802011009910010080000800000100800077248002600268001861724711051100117111600581009080000320000100160062160047160062160062160062
40020416006111991111100060100016004616679961254001161003200168018010032000080000500400027227168841160042016004616006179955038002840010020080000320000200800006400001600611600461180201100991001008000080000010080008724800060026800000170610051100117111600580099280000320000100160062160062160047160062160062

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40002516005611990020000160044006799592540001810320008800001032000080000504000002271604401600400160040160059799530380022400010208000032000020800006400001600401600561180021109101080000800000108000000800001328001361022001050201517057160056171028000032000010160041160060160060160041160041
4000241600401199113300016002516679940254000181032000880045103200008000050400029227126881160037016005916005979953038004140001020800003200002080000640000160056160056118002110910108000080000010800000188001400800186014000005020517075160037013028000032000010160060160060160041160060160057
400024160040119900200001600251667994025400010103200088000010320000800005040002222716044116004001600401600597995303800414000102080000320000208000064000016004016004011800211091010800008000001080000022800180178000060022000050205170571600560101028000032000010160057160041160060160060160060
400024160040119800241001600441667995925400010103200088000010320000800005040000022716044116002101600591600597993403800224000102080000320000208000064000016005916005611800211091010800008000001080000008001400800006117000005020517075160056001028000032000010160041160060160060160060160057
400024160059119900240001600251667995925400018103200088000010320000800005040003722716044116004001600401600597995303800224000102080000320000208000064000016004016005611800211091010800008000011080000018800130080013011422000050208170571600561131328000032000010160041160041160057160060160060
400024160059119900200001600250667995925400018103200088000010320000800005040002922712688116004001600561600407995303800414000102080000320000208000064000016004016005611800211091010800008000001080000008001401780000601418000050207170671600560131328000032000010160060160041160041160060160041
400024160059119900231001600251607994025400018103200088000010320000800005040002422716044116002101600591600407995303800414000102080000320000208000064000016005616005611800211091010800008000001080000018800180188001461142200005020717077160037010028000032000010160057160060160041160060160041
400024160059119800010016002506079940254000181032000880000103200008000050400032227160441160021016005916004079934038004140001020800003200002080000640000160040160056118002110910108000080000010800482188001802080018011322000050205170861600560131308000032000010160060160057160041160057160041
40002416005911980031100160044066799084740011110320104800901032035280000504000262271604401600400160059160059799530380041400010208000032000020800006400001600401600561180021109101080000800000108000001880000224800006102200005041717067160056101028000032000010160060160060160060160060160041
400024160040119900320001600441007994025400103103200088000010320000800005040002922712688016004001600401600407995303800384000102080000320000208000064000016005916005611800211091010800008000001080000008001801780013611818000050205170751600561131008000032000010160060160041160041160060160057