Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 8B)

Test 1: uops

Code:

  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
64005286452130191401100601052232847202223043200020002000100003001623527963286133102000400020002834128369116100110001000020000420020000200444400136181028972433437861197663370382311544928064140611250413770200020002815428144282372819028224
64004281652131142000000600052082795902023446200020002000100002001630428209282913102000400020002838028494116100110001000020042020030022200220400135899595724834031147192193279381718394027870141031281913596200020002830128301282282832128388
6400428239213018141001030105066280960102336120002000200010000500162582789728467310200040002000284532814011610011000100002004362003131220000240013777980671683275647194903274381917434927977143131254614154200020002829928476285602836528569
640042831421402119101001701049412824002023305200020002000100009001624027989284953102000400020002825228110116100110001000120000420020000200004400139021025071443446549191313429381315535228102145481211414286200020002838628155284312832128779
640042854321401214100002010516228336000229682000200020001000020016246280002821731020004000200028409280501161001100010000200234200400022000204001388510400718233301049196043484381718515227735141971207313476200020002857228466282532842128500
64004281352110182500000600049622809102023405200020002000100005001624627902282163102000400020002812528040116100110001000020034420020224200042621137251024071853404105019208326838119434628063139831207913690200020002817728155284882817728433
64004284212121152001000401050162797400023042200020002000100009001625428059281683102000400020002840928511116100110001000020000420000002200042623139431020072203417952195303344381716424427833137481255013623200020002832228227282182813628483
640042829921111926000002000051452796700023000200020002000100005001626527985283323102000400020002829228136116100110001000020044420031112200222000135731044272363424945191403254381810444128016146661246113786200020002815228442284922811328194
6400428206211022230000012010500528312222233652000200020001000090016280278772835531020004000200028112282221161001100010000200430200300222000446211379610063716933171143192043369381313474128070141281215213964200020002846128134283472832128543
640042836121111815001203800051732817100222936200020002000100004001627527947284243102000400020002816128040116100110001000020044020050117200042422136411007470493320947193873298381813485027851146321249913146200020002826128513283052827528136

Test 2: throughput

Count: 8

Code:

  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020580067599001011003800018002521212025160100100160000100160000500800000080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600002716003203216004261240005109217228003716601600001600001008004180041800418004180041
320204800405990000000038000080025212120251601001001600001001600005008003750800150800408004032216010020032000020016000080040800401180201100991001008000080000110016000027160000032160032602435005109217228003710001600001600001008004180041800418004180041
32020480040600000000003801008002501212025160100100160000100160000500800377080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600000160032032160024602427005109217238003700621600001600001008004180041800418004180041
320204800405990000000000001800250121202516010010016000010016000050080022218001508004080040322160100200320000200160000800408004011802011009910010080000800000100160000271600320241600246103500510921722800370101001600001600001008004180041800418004180041
3202048004060000000000001018002521212025160100100160000100160000500800217080015080040800403221601742003200002001600008004080040118020110099100100800008000001001600000160000124160024613200051092172280037110621600001600001008004180041800418004180041
320204800405990000000038000080025012002516010010016000010016000050080024818001508004080040322160100200320592200160000800408004011802011009910010080000800000100160000271600320241600246124350051092172280037110001600001600001008004180041800418004180041
320204803836000000000038000180025200025160100100160000100160000500800216180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600000160000032160032612436005109220228003706621600001600001008004180041800418004180041
3202048004060200100000380000800252121202516010010016000010016000050080162008001508004080040322160100200320000200160000800408004011802011009910010080000800000100160000016000000160000612400051092173280037010621600001600001008004180041800418004180041
3202048004059910100000380101800250012025160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003200160024610350051092172280037010621600001600001008004180041800418004180041
320204801026000000000000101800252012025160100100160000100160000500802291180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600000160032232160024613235005109317228003700621600001600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32002580041600003810800252121225160010101600001016000050800377080015080040800403221600102032000020160000800408004011800211091010800008000001016000035016003261032160032610351050190217248003710102160000160000108004180041800418004180041
32002480040600003810800252121225160010101600001016000050800377080015080040800403221600102032000020160000800408004011800211091010800008000001016000035016003200321600006003500501902173480037110102160000160000108004180041800418004180041
32002480040600003800800252121225160010101600001016000050800374080015080040800403221600102032000020160000800408004911800211091010800008000001016000035016003200321600326103500501902173380037110100160000160000108004180041800418004180041
3200248004059900381080025212122516001010160000101600005080037408001508004080040322160010203200002016000080040800401180021109101080000800000101600003501600320001600326103500501902173280037110102160000160000108004180041800418004180041
32002480040599003810800252120251600101016000010160000508003770800150800408004032216001020320000201600008004080040518002110910108000080000010160000350160032003216003261323500501903173380037110102160000160000108004180041800418004180041
32002480040600000108002521212251600101016000010160000508000000800153800408004032216001020320000201600008004080051118002110910108000080000010160000350160032003216003261323500501943172380037110102160000160000108004180041800418004180041
32002480040599003810800252120251600101016000010160000508000000800150800408004032216001020320000201600008004080040118002110910108000080000010160000350160032003216003261323500501903173380037110102160000160000108004180041800418004180041
320024800405990038108002521212251600101016000010160000508003770800150800408004032216001020320000201600008004080040118002110910108000080000010160000350160032003216003261323501501903172380037110102160000160000108004180041800418004180041
32002480040600013810800252121225160010101600001016000050800376080015080040800403221600102032000020160000800408004011800211091010800008000001016000035016003200321600326132350050190417328003700100160000160000108004180041800418004180041
320024800406001138108002521212251600101016000010160000508010800800150800408004032216001020320000201600008004080040518002110910108000080000010160000350160032003216003261323500501903172380037110102160000160000108004180041800418004180041