Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (single structure, H)

Test 1: uops

Code:

  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.004

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.004

retire (01) | cycle (02) | 03 | 04 | 07 | 09 | 0a | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6500529423220341300012601046612885200017048500440041000400010005000475290102294829075292263105000100040001000800029138291501161001100010000100021000201000202013053921968583123571201923063381311625328351158851407615012100040002934629345293252928429312
650042923821943020001500045452881200017028500540041000400010005000475150152290029123293873105000100040001000800029194292021161001100010000100021001161000202012835915669523112453201193125381611506428335165121384115056100040002937729289293192926629348
65004293262194206000420104692288440001696750044004100040001000500047560015229402920429301310500010004000100080002930429165116100110001000010002100000100020201281790726853303055220206309138148484428374162541408615062100040002938429352293242924329371
650042924121933050003101046782883000016980500440061000400010005000475733052289429065292483105000100040001000800029140292381161001100010000100021001021002202012914917669323147249201443261381712514428505164481392214942100040002921729281292742931529230
6500429278219360500035001046222883600017022500440081000400010005000476103152291429210293093105000100040041000800029291292651161001100010000100031000061000203013005911569133133356201933037381013524728351163371387914811100040002933429359292182928729245
6500429275220260500033601045582890100016945500440041000400010005000475152152290529121292743105000100040001000800029174290741161001100010000100021001001001202012863920269083192553202063076381311465428356163181399514795100040002927229362292422925629361
65004292872192402000270104515287310011706450044004100040001000500047546305229482911729298310500010004000100080002923929181116100110001000010002100101100020301272991516837305325220181310138164525828500163711389215044100040002930329287292832925129199
650042941321933040003790104678287980011706850044004100040001000500047656210229262906729216310500010004000100080002915629140116100110001000110002100000100020201294990226867308755220166312038129565328476162621346514914100040002925029188293042934829289
650042930822005060002701046242897201016966500440041000400010005000475337102294829118292343105000100040001000800029108290971161001100010000100021000011000203012837918168763109352201193059381411625328468163781382014885100040002921929274293182931729398
6500429258220430600031201045692878601016967500440041000400010005000475603152289929171292593105000100040001000800029077293101161001100010000100021000021000213012797924769123079454201653128380914635128379162521396214920100040002927029259292272931429378

Test 2: throughput

Count: 8

Code:

  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 50 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4002051600531199200000200000001600251607995502540010810032000880000100320000800005004000002271586401600360160055160040799343800374001002008005032000020080000640000160053160053118020110099100100800008000001008000000800130038005260015000051100117111600520066080000320000100160041160056160041160054160056
4002041600531198000000980000016003810679955025400108100320008800001003200008000050040000922712688016002101600551600647994938002240010020080000320000200800006400001600401600531180201100991001008000080000010080000008001100080014601118000051100117111600371006280000320000100160054160041160054160056160041
40020416005311990000002201000160038066799530254001081003200448000010032000080000500400000227126880160034016005516004079949380037400100200800003200002008000064000016004016005311802011009910010080000800001100800000080011001480000611115000051100117111600520099280000320000100160041160056160054160041160041
400204160055119800000040601000160025060799400254001081003200088000010032000080000500400014227126880160021016005516005579949380037400100200800003200002008000064000016005516005311802011009910010080000800000100800000158001100148000061100000051100117111600521099280000320000100160056160056160056160041160056
40020416005511980010006900000160025106799550254001081003200088000010032017680000500400022227154720160036016004016005579934380022400100200800003200002008000064000016005516005311802011009910010080000800000100800000158001400080014011215000051100117111600370099080000320000100160056160056160056160056160056
40020416005511990000008601000160040100799400254001081003200088000010032000080000500400008227158640160036016004016005579934380022400100200800003200002008000064000016005516005311802011009910010080000800000100800000158001400080014002518000051100117111600521090280000320000100160041160041160041160056160054
40020416005511990000009500000160025166799550254001081003200088000010032000080000500400007227154720160021016005516005579947380035400100200800003200002008000064000016005316004011802011009910010080000800000100800000158000000148001061110000051100117111600501009280000320000100160054160056160054160041160054
40020416004011990000002001000160040066799550254001081003200008000010032000080000500400000227154720160021016004016005579934380037400100200800003202002008000064000016005316005311802011009910010080000800000100800000080010001180010601118000051100117111600501090280000320000100160041160054160041160054160056
400204160055119900000086010001600250667994002540010810032000080000100320000800005004000002271547201600360160055160040799493800354001002008000032000020080000640000160040160053118020110099100100800008000001008000001580000000800000000000051100117111600520006280000320000100160056160056160054160056160041
4002041600531199000000113010001600401667994002540010010032004080000100320000800005004000112271547201600360160053160053799473800374001002008000032000020080000640000160053160055118020110099100100800008000001008000001580014101780000601118000051100117111600521096280000320000100160054160041160054160054160056

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4000251600531199001000000000160038106799402540001810320008800001032000080000504000002271547200160083160040160053799340380035400010208000032000020800006400001600531600531180021109101080000800000108000772380007001780020616236205020501171116005819928000032000010160062160062160062160062160062
400024160061119910110000320011600461067994425400026103200168000010320000800005040000322716884151600421600611600467994003800284000102080000320000208000064000016006116006111800211091010800008000001080009923800270002580019616246005020001171116004300008000032000010160062160047160062160047160047
4000241600611199100000003200016003106679961254000181032001680000103200008000050400034227140681016002716006116004679955038004340001020800003200002080000640000160061160046118002110910108000080000010800077248002708030800206125247205045501171116005819928000032000010160047160062160062160062160047
400024160046119910110000320001600311607996125400026103200168000010320000800005040003822716888001600271600611600617994003800434000102080000320000208000064000016004616006111800211091010800008000001080008724800260016800186126237105020541171116005800928000032000010160062160062160047160047160062
40002416011011991110000061001600460667994625400026103200168000010320000800005040000022716884051600441600461600467995503800434000102080000320000208000064000016006116004611800211091010800008000001080007708000600178000001707205020001171116004319028000032000010160047160062160062160047160062
40002416006111991010000046101160046106799612540002610320016800001032000080000504000032271688415160042160046160061799550380043400010208000032000020800006400001600611600611180021109101080000800000108000770800250012680020002606005020041171116005800928000032000010160047160062160047160062160062
4000241600461198111000003200116004616679961254000181032001680000103200008000050400003227168841016004216004616006179955038002840001020800003200002080050640000160061160061118002110910108000080000010800088238000800125800196026246005020541171116005809928000032000010160062160062160047160062160062
4000241600611198111100007101160046166799612540001810320016800001032000080000504000442271406810160027160046160061799550380043400010208000032000020800006400001600461600611180021109101080000800000108000680800070105380020607246105020501171116005819928000032000010160062160062160047160047160062
400024160061119910000000310001600311667996125400018103200088000010320000800005040003922716884051600421600611600617994003800434000102080000320000208000064000016006116006111800211091010800008000001080007723800250002680018617237005020041171116004309928000032000010160047160047160047160062160062
400024160061119910100000321001600461607994625400111103200168000010320000800005040003922716884101600421600611600617995503800434000102080000320000208000064000016004616006111800211091010800008000011080008824800080102680019017237205020501171116005800908000032000010160062160062160062160062160062