Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 1D)

Test 1: uops

Code:

  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
63005286282141218014000529728309220229422000200020001000010001618927936281473102000300020002795428046116100110001000020004200014200020414121104827299357064319588340738097403427866152351262713856200010002817428007282002813428080
63004281762111415010000519227893222234792000200020001000071016143280012799731020003000200028001285081161001100010000200042004012200040413996985272433495335198563282380710383428033148011213812992200010002800428071279882810728348
63004282152111510010100527428057222233602000200020001000020016212278932810831020003000200028069280881161001100010000200042000042003204139351050871053542739194693408381710373627770139751205713188200010002809028078281802824128129
63004285122141411011010053112800420223234200020002000100001181618027828281073102000300020002840327896116100110001000020004200004200424414042105447048350163220014348038188353527886140471203413360200010002814228303281042823628068
63004280712111212014000529727833220231632000200020001000000816187280722853631020003000200028036285191161001100010000200042002042000200137981024172373469540194343501381114323527862136471276313507200010002809028113280562821928161
630042856321381001600052252822722023450200020002000100001010161602809228155310200030002000280662814511610011000100002000420040138200022014186997670563427533194383377381012433627724138111199313277200010002817328055281652814428139
63004279972101511016000526827962002229832000200020001000030016405279132808931020003000200028011282481161001100010000200042004042004244139671028972603519444198933389381612373527848139621194213321200010002806028338283502836528118
630042815721016110166000527627874202234002000200020001000000016161279572804431020003000200027979280611161001100010000200042002002004244139961035073363557739194203508381514343527818149741235113163200010002816027957280532856528055
630042846121191001010051502790700223023200020002000100002081617027916281573102000300020002800427952116100110001000020000200204200244413465984573503297637192993343381013353827728139921211113083200010002807628019284622800627962
630042840921161001600052342786822222918200020002000100003181618427965285023102000300020002801128189116100110001000020000200404200400614026980572753315435198643438381912323627826153091228413266200010002798828333286032805628431

Test 2: throughput

Count: 8

Code:

  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24020553391400110100021102533662018025160100100160000100160000500233662515337453381533992332232335716010020024000020016000053398533981180201100991001008000080000010016001919001600571005916003861564219251101161153395992160000800001005339953400534005338253489
24020453399400110010021001533830181822251601001001600001001600005002326082153356533995339823322323356160100200240000200160000533995339911802011009910010080000800000100160021194201600191005916000061564219051101161153396992160000800001005338253382533825340053400
24020453399400110000065102533732181816251601001001600001001600005002343231153374533995339823321323357160100200240000200160000533985339911802011009910010080000800000100160019204201600580015916003861574219051101161153378090160000800001005340053382534005339953399
240204533994001101100210005336621818452516010010016000010016000050023360891533835339953400233213233571601002002400002001600005340053399118020110099100100800008000001001600191900160057100621600380057019251101161153396902160000800001005338253382534005338253399
240204533813991000000210035338321800251601001001600001001600005002334036153374533815339823321323357160100200240000200160000533985339911802011009910010080000800000100160019194201600582122116003861574219051101161153396902160000800001005340053382533995338253400
240204533994001000000210035338310180251601001001600001001600005002334036153374533995338123322323356160100200240000200160000533995339911802011009910010080000800000100160020204201600571012116000060194219051101161153395902160000800001005339953399533825340053399
240204533814001100100650035338421801525160100100160000100160000500233662515338153399533812332132335716010020024000020016000053381533981180201100991001008000080000010016002020420160019101231600006157019151101161153396092160000800001005340053382533995338253382
2402045338139911011006500353366218181625160100100160000100160000500235338915339753381533992330432333916010020024000020016000053381533991180201100991001008000080000010016002119001600570005916000060574219151101161153395992160000800001005338253400533875340053399
2402045339940011000006500353383201815251602301001600001001600005002343231153374533985338123304323339160100200240000200160000533995339811802011009910010080000800000100160020204201600190005916003861574219151101161153395992160000800001005340753400533865340153404
2402045338940010011006600353383018181625160100100160000100160000500235421215337353399533982332132335716010020024000020016000053398533981180201100991001008000080000110016002019001600571005916000061193619151101161153395990160000800001005357053382534005338253382

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6674

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2400255339740000100000066010025338727012516001010160000101600005023473870153356053402534022334832338216001020240000201600005340253403118002110910108000080000010160019194301600581016116004061580190502017167135339101010416000080000105339553395533755355453395
2400245339440000000000000000153379201216251600101016000010160000502358180015336905339453394233193233741600102024000020160000533945339411800211091010800008000001016000004301600390003916000061390005020816713533910100416000080000105339553395533955339553395
2400245339440000000000045010005337921201625160010101600001016000050235295901533780534035340323348323382160010202400002016000053402533811180021109101080000800000101600192043016001910163160040615943190523611133913552450013516000080000105555055407554055537955395
2400245539841510000001313187812321002553873779994001617011116182012162314502409247015522105554255716233393233741600102024000020160000533945339411800211091010800008000001016169224340316108100090781617296158431935151161386135526911313516000080000105556653714552635551654379
240024558594171200001161122741408100454768270837518162090121622101216284862243041601563500575255759825414214251101649452024662920164426566245633027180021109101080000800000101628852243691162919000228731623796158019054011326014185625611313016000080000105678456488578545631755686
24002455857444100100000660100353388370882516001010160000101600005023391400153369053394533942335232337416001020240000201600005340253375118002110910108000080000010160000043016003900042160000613943005020716685337101010416000080000105339553395533965339553395
24002453403413010001117448000005337901201625160010121600001016000050235337711533660533905338223328182357716001020240000201600005337453389118002110910108000080000010160000039016003500001600356135390050205169553386066216000080000105337553390533905337553390
2400245338940000000000041000115337431818122516001010160000101600005023415104153364053389533892333432336916001020240000201600005338953374118002110910108000080000010160000039016003500035160035013500050209167953386066216000080000105339053390533905337553375
24002453389400000001000410001153359218002516001010160000101601965023417764153364053389533892333432336916001020240000201600005338953389118002110910108000080000010160000039016003500035160000010390050207166753386066216000080000105337553393533755339053390
24002453389400000001010000011533782180152516001010160000101600005023417764153515053374533742333432336916001020240000201600005338953374118002110910108000080000010160000000160035000351600356136000502071681353386066216000080000105337553390533755339053390