Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 8B)

Test 1: uops

Code:

  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
6300529265220124002700061452128877010241952000200020001000021616028685292923102000300020002913529018116100110001000020032520030002200060242113004922468573215127620613310338225555328408165011382615357200010002930229220291252923929345
630042923222011811151012145862884600124164200020002000100000161582863129174310200030002000291452912311610011000100002002362003000020004006001310591766859304065120536312438096494928344164811362615705200010002933329304293002928829261
630042920821901900160001614611288510202413320002000200010000416152286342913831020003000200029060291561161001100010000200006200000012200000040012977935268243053115620647311838176535428379165161372115664200010002936129268293082924929240
63004292602190210020000714593288420002420020002000200010000016151287002934431020003000200029078291071161001100010000200006200000002000402400129359044683930311053205753067381117444628368162611367815402200010002918829215293092921329156
630042922921902300220005145342881300024108200020002000100000161752858729275310200030002000290882905111610011000100002000062002000020004026001298790746830306295420734307538116505128363164691394315606200010002930529271292492928029312
630042929721902500170005146832886202224084200020002000100000161542863829287310200030002000287092916211610011000100002000042002000020024026001288790906843306735020628311838196474928372162421354215575200010002931129262292822921029289
630042922322001200160007044712884900224155200020002000100000161452864829303310200030002000291102913711610011000100002000042000000220004004001291290236905303574420615316038137534728393163801380615557200010002926929297291812924529212
63004292342200210018000514632288440202428520002000200010000716137286392934831020003000200029082290831161001100010000200004200000002000402400129589225683830441343206093030381013464128463162851375315612200010002922029289291762923629237
630042925421902600190000045172889002024167200020002000100000161842870129193310200030002000291572919311610011000100002000042002000020004006001294991166803307475020623307338089535028351162291371815643200010002922929266293022924429221
63004292542200200021000714561287770202417620002000200010000016154287372923331020003000200029154291091161001100010001200004200000002000402400128519256681030321048205413068380610524728337162201380515647200010002926929278293162923829235

Test 2: throughput

Count: 8

Code:

  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  ld1 { v0.8b, v1.8b, v2.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24020553415400011144001533792121281251601001001600001001600005002358180005336905339453374232973233521601002002400002001600005339453394118020110099100100800008000001001600004301600380381600386139445110116115339510107160000800001005339953399533995339953399
24020453374400011144101533832011925160100100160000100160000500233741700533690533985339823297323356160100200240000200160000533945339411802011009910010080000800000100160000430160000039160038613944511011611533951000160000800001005339953399533755339553395
240204533744000000440015338321119251601001001600001001600005002349885005336905339853374233213233561601002002400002001600005337453394118020110099100100800008000001001600004301600380391600006039445110116115339514140160000800001005337553375533995339953375
24020453398400001044001533792111925160100100160000100160000500234988510533690533945339423317323356160100200240000200160000533985339411802011009910010080000800000100160000430160038038160038613844511011611533920144160000800001005340653405534035340053395
2402045339840000004410153359212119251601001001600001001600005002337495105336905339453394233213233561601002002400002001600005339853374118020110099100100800008000001001600004301600380391600396138445110116115339514147160000800001005339953395533995337553399
24020453398400000144101533832110251601001001600001001600005002349885105337305339853398233213233321601002002400002001600005339853374118020110099100100800008000001001600004301600000381600386138445110116115339514147160000800001005339953399533955339553399
24020453398400000045000533592121025160100100160000100160000500234988510533720533985339823321323359160100200240000200160000533985339411802011009910010080000800000100160000001600380381600386139435110116115339810107160000800001005340253395533995339553399
240204533984000000450015338321119251601001001600001001600005002349885105337305339853398233213233561601002002400002001600005339853394118020110099100100800008000001001600004301600390411600386139445110116115339514140160000800001005339953399533995339953399
2402045339440000014400153383212019251601001001600001001600005002337495105337305339453394233213233561601002002400002001600005340253394118020110099100100800008000001001600004301600390381600386139445110116115339510107160000800001005339953399533755339553375
240204533984000000440015338321119251601001001600001001600005002345544105337305339853374233173233521601002002400002001600005339853374118020110099100100800008000001001600004301600380411600386139445110116115339114107160000800001005339553375533995339953375

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240025533903990001000000450001533791703012516001010160000101600005023261710533555339853380233253233601600102024000020160000533945338911800211091010800008000001016000003901600390001816005561544300150202416242253378006216000080000105339053381533905339953395
24002453380399000200110045000253379212012251600101016000010160000502352959153369533945339823339323374160010202400002016000053394533801180021109101080000800000101600000001600390004216001801354300050202416232453386006216000080000105339553390533975338153395
240024533984000001000100450002533651812121251600101016000010160000502358180053355533805339523339323378160010202400002016000053394533801180021109101080000800000101600000001600390001816003961180000502019162118533770100016000080000105339953381533955338353395
240024533984000002000000330001533791712121725160010101600001016000050235295915335553394533942333932337416001020240000201600005339453389118002110910108000080000010160000039016014801003516003501354300050202316241953391066316000080000105339553395533815339553399
24002453415400000200110018010253374181201251600101016000010160000502352959053369533985339423325323374160010202400002016000053394533801180021109101080000800000101613000433881615950041065516172561390020513633972523551591106016000080000105554355375545955539555541
2400245542841502010110139175796810255522201210713991617021316182011161602502342429153355533945339423334323369160010202411542016000055406551911418002110910108000080000010161692204031618380101136616146961354304052192912512395528921010416000080000105569655395538615539455250
240024558654200201300017162394105600155207215181372518162221111606501216302661241383305499155372568202543223425320162682202457722016384357410573782018002110910108000080000010162342006161612050021627416185961180000524726161324533950104216000080000105339653535552235547955506
2400245462442302001001004500005336521827125160010101600001016000050234177615335653381533812333532336916001020240000201600005338953389118002110910108000080000010160000058016005401035160018603539000502025161425534000104416000080000105539455686555525523956148
24002455836457040100010084079210153379250191725160010101600001016000050235295905337753402533802334732336016001020240000201600005338053402118002110910108000080000010160000043016003800018160038005544000502020162313533770010716000080000105339953399533955338153403
2400245340239900000001004400015337921112516001010160000101600005023374950533775338053380233253233781600102024000020160000533805339411800211091010800008000001016000004401600380003916003861394400050201716231953395000016000080000105339953399533815340353381