Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 8B)

Test 1: uops

Code:

  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
64005286452130191401100601052232847202223043200020002000100003001623527963286133102000400020002834128369116100110001000020000420020000200444400136181028972433437861197663370382311544928064140611250413770200020002815428144282372819028224
64004281652131142000000600052082795902023446200020002000100002001630428209282913102000400020002838028494116100110001000020042020030022200220400135899595724834031147192193279381718394027870141031281913596200020002830128301282282832128388
6400428239213018141001030105066280960102336120002000200010000500162582789728467310200040002000284532814011610011000100002004362003131220000240013777980671683275647194903274381917434927977143131254614154200020002829928476285602836528569
640042831421402119101001701049412824002023305200020002000100009001624027989284953102000400020002825228110116100110001000120000420020000200004400139021025071443446549191313429381315535228102145481211414286200020002838628155284312832128779
640042854321401214100002010516228336000229682000200020001000020016246280002821731020004000200028409280501161001100010000200234200400022000204001388510400718233301049196043484381718515227735141971207313476200020002857228466282532842128500
64004281352110182500000600049622809102023405200020002000100005001624627902282163102000400020002812528040116100110001000020034420020224200042621137251024071853404105019208326838119434628063139831207913690200020002817728155284882817728433
64004284212121152001000401050162797400023042200020002000100009001625428059281683102000400020002840928511116100110001000020000420000002200042623139431020072203417952195303344381716424427833137481255013623200020002832228227282182813628483
640042829921111926000002000051452796700023000200020002000100005001626527985283323102000400020002829228136116100110001000020044420031112200222000135731044272363424945191403254381810444128016146661246113786200020002815228442284922811328194
6400428206211022230000012010500528312222233652000200020001000090016280278772835531020004000200028112282221161001100010000200430200300222000446211379610063716933171143192043369381313474128070141281215213964200020002846128134283472832128543
640042836121111815001203800051732817100222936200020002000100004001627527947284243102000400020002816128040116100110001000020044020050117200042422136411007470493320947193873298381813485027851146321249913146200020002826128513283052827528136

Test 2: throughput

Count: 8

Code:

  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32020580067599001011003800018002521212025160100100160000100160000500800000080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600002716003203216004261240005109217228003716601600001600001008004180041800418004180041
320204800405990000000038000080025212120251601001001600001001600005008003750800150800408004032216010020032000020016000080040800401180201100991001008000080000110016000027160000032160032602435005109217228003710001600001600001008004180041800418004180041
32020480040600000000003801008002501212025160100100160000100160000500800377080015080040800403221601002003200002001600008004080040118020110099100100800008000001001600000160032032160024602427005109217238003700621600001600001008004180041800418004180041
320204800405990000000000001800250121202516010010016000010016000050080022218001508004080040322160100200320000200160000800408004011802011009910010080000800000100160000271600320241600246103500510921722800370101001600001600001008004180041800418004180041
3202048004060000000000001018002521212025160100100160000100160000500800217080015080040800403221601742003200002001600008004080040118020110099100100800008000001001600000160000124160024613200051092172280037110621600001600001008004180041800418004180041
320204800405990000000038000080025012002516010010016000010016000050080024818001508004080040322160100200320592200160000800408004011802011009910010080000800000100160000271600320241600246124350051092172280037110001600001600001008004180041800418004180041
320204803836000000000038000180025200025160100100160000100160000500800216180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600000160000032160032612436005109220228003706621600001600001008004180041800418004180041
3202048004060200100000380000800252121202516010010016000010016000050080162008001508004080040322160100200320000200160000800408004011802011009910010080000800000100160000016000000160000612400051092173280037010621600001600001008004180041800418004180041
3202048004059910100000380101800250012025160100100160000100160000500800377180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600003516003200160024610350051092172280037010621600001600001008004180041800418004180041
320204801026000000000000101800252012025160100100160000100160000500802291180015080040800403221601002003200002001600008004080040118020110099100100800008000001001600000160032232160024613235005109317228003700621600001600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32002580041600003810800252121225160010101600001016000050800377080015080040800403221600102032000020160000800408004011800211091010800008000001016000035016003261032160032610351050190217248003710102160000160000108004180041800418004180041
32002480040600003810800252121225160010101600001016000050800377080015080040800403221600102032000020160000800408004011800211091010800008000001016000035016003200321600006003500501902173480037110102160000160000108004180041800418004180041
32002480040600003800800252121225160010101600001016000050800374080015080040800403221600102032000020160000800408004911800211091010800008000001016000035016003200321600326103500501902173380037110100160000160000108004180041800418004180041
3200248004059900381080025212122516001010160000101600005080037408001508004080040322160010203200002016000080040800401180021109101080000800000101600003501600320001600326103500501902173280037110102160000160000108004180041800418004180041
32002480040599003810800252120251600101016000010160000508003770800150800408004032216001020320000201600008004080040518002110910108000080000010160000350160032003216003261323500501903173380037110102160000160000108004180041800418004180041
32002480040600000108002521212251600101016000010160000508000000800153800408004032216001020320000201600008004080051118002110910108000080000010160000350160032003216003261323500501943172380037110102160000160000108004180041800418004180041
32002480040599003810800252120251600101016000010160000508000000800150800408004032216001020320000201600008004080040118002110910108000080000010160000350160032003216003261323500501903173380037110102160000160000108004180041800418004180041
320024800405990038108002521212251600101016000010160000508003770800150800408004032216001020320000201600008004080040118002110910108000080000010160000350160032003216003261323501501903172380037110102160000160000108004180041800418004180041
32002480040600013810800252121225160010101600001016000050800376080015080040800403221600102032000020160000800408004011800211091010800008000001016000035016003200321600326132350050190417328003700100160000160000108004180041800418004180041
320024800406001138108002521212251600101016000010160000508010800800150800408004032216001020320000201600008004080040518002110910108000080000010160000350160032003216003261323500501903172380037110102160000160000108004180041800418004180041