Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, three registers, 1D)

Test 1: uops

Code:

  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 07 | 0a | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
63005286282141218014000529728309220229422000200020001000010001618927936281473102000300020002795428046116100110001000020004200014200020414121104827299357064319588340738097403427866152351262713856200010002817428007282002813428080
63004281762111415010000519227893222234792000200020001000071016143280012799731020003000200028001285081161001100010000200042004012200040413996985272433495335198563282380710383428033148011213812992200010002800428071279882810728348
63004282152111510010100527428057222233602000200020001000020016212278932810831020003000200028069280881161001100010000200042000042003204139351050871053542739194693408381710373627770139751205713188200010002809028078281802824128129
63004285122141411011010053112800420223234200020002000100001181618027828281073102000300020002840327896116100110001000020004200004200424414042105447048350163220014348038188353527886140471203413360200010002814228303281042823628068
63004280712111212014000529727833220231632000200020001000000816187280722853631020003000200028036285191161001100010000200042002042000200137981024172373469540194343501381114323527862136471276313507200010002809028113280562821928161
630042856321381001600052252822722023450200020002000100001010161602809228155310200030002000280662814511610011000100002000420040138200022014186997670563427533194383377381012433627724138111199313277200010002817328055281652814428139
63004279972101511016000526827962002229832000200020001000030016405279132808931020003000200028011282481161001100010000200042004042004244139671028972603519444198933389381612373527848139621194213321200010002806028338283502836528118
630042815721016110166000527627874202234002000200020001000000016161279572804431020003000200027979280611161001100010000200042002002004244139961035073363557739194203508381514343527818149741235113163200010002816027957280532856528055
630042846121191001010051502790700223023200020002000100002081617027916281573102000300020002800427952116100110001000020000200204200244413465984573503297637192993343381013353827728139921211113083200010002807628019284622800627962
630042840921161001600052342786822222918200020002000100003181618427965285023102000300020002801128189116100110001000020000200404200400614026980572753315435198643438381912323627826153091228413266200010002798828333286032805628431

Test 2: throughput

Count: 8

Code:

  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
24020553391400110100021102533662018025160100100160000100160000500233662515337453381533992332232335716010020024000020016000053398533981180201100991001008000080000010016001919001600571005916003861564219251101161153395992160000800001005339953400534005338253489
24020453399400110010021001533830181822251601001001600001001600005002326082153356533995339823322323356160100200240000200160000533995339911802011009910010080000800000100160021194201600191005916000061564219051101161153396992160000800001005338253382533825340053400
24020453399400110000065102533732181816251601001001600001001600005002343231153374533995339823321323357160100200240000200160000533985339911802011009910010080000800000100160019204201600580015916003861574219051101161153378090160000800001005340053382534005339953399
240204533994001101100210005336621818452516010010016000010016000050023360891533835339953400233213233571601002002400002001600005340053399118020110099100100800008000001001600191900160057100621600380057019251101161153396902160000800001005338253382534005338253399
240204533813991000000210035338321800251601001001600001001600005002334036153374533815339823321323357160100200240000200160000533985339911802011009910010080000800000100160019194201600582122116003861574219051101161153396902160000800001005340053382533995338253400
240204533994001000000210035338310180251601001001600001001600005002334036153374533995338123322323356160100200240000200160000533995339911802011009910010080000800000100160020204201600571012116000060194219051101161153395902160000800001005339953399533825340053399
240204533814001100100650035338421801525160100100160000100160000500233662515338153399533812332132335716010020024000020016000053381533981180201100991001008000080000010016002020420160019101231600006157019151101161153396092160000800001005340053382533995338253382
2402045338139911011006500353366218181625160100100160000100160000500235338915339753381533992330432333916010020024000020016000053381533991180201100991001008000080000010016002119001600570005916000060574219151101161153395992160000800001005338253400533875340053399
2402045339940011000006500353383201815251602301001600001001600005002343231153374533985338123304323339160100200240000200160000533995339811802011009910010080000800000100160020204201600190005916003861574219151101161153395992160000800001005340753400533865340153404
2402045338940010011006600353383018181625160100100160000100160000500235421215337353399533982332132335716010020024000020016000053398533981180201100991001008000080000110016002019001600571005916000061193619151101161153395990160000800001005357053382534005338253382

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6674

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2400255339740000100000066010025338727012516001010160000101600005023473870153356053402534022334832338216001020240000201600005340253403118002110910108000080000010160019194301600581016116004061580190502017167135339101010416000080000105339553395533755355453395
2400245339440000000000000000153379201216251600101016000010160000502358180015336905339453394233193233741600102024000020160000533945339411800211091010800008000001016000004301600390003916000061390005020816713533910100416000080000105339553395533955339553395
2400245339440000000000045010005337921201625160010101600001016000050235295901533780534035340323348323382160010202400002016000053402533811180021109101080000800000101600192043016001910163160040615943190523611133913552450013516000080000105555055407554055537955395
2400245539841510000001313187812321002553873779994001617011116182012162314502409247015522105554255716233393233741600102024000020160000533945339411800211091010800008000001016169224340316108100090781617296158431935151161386135526911313516000080000105556653714552635551654379
240024558594171200001161122741408100454768270837518162090121622101216284862243041601563500575255759825414214251101649452024662920164426566245633027180021109101080000800000101628852243691162919000228731623796158019054011326014185625611313016000080000105678456488578545631755686
24002455857444100100000660100353388370882516001010160000101600005023391400153369053394533942335232337416001020240000201600005340253375118002110910108000080000010160000043016003900042160000613943005020716685337101010416000080000105339553395533965339553395
24002453403413010001117448000005337901201625160010121600001016000050235337711533660533905338223328182357716001020240000201600005337453389118002110910108000080000010160000039016003500001600356135390050205169553386066216000080000105337553390533905337553390
2400245338940000000000041000115337431818122516001010160000101600005023415104153364053389533892333432336916001020240000201600005338953374118002110910108000080000010160000039016003500035160035013500050209167953386066216000080000105339053390533905337553375
24002453389400000001000410001153359218002516001010160000101601965023417764153364053389533892333432336916001020240000201600005338953389118002110910108000080000010160000039016003500035160000010390050207166753386066216000080000105337553393533755339053390
24002453389400000001010000011533782180152516001010160000101600005023417764153515053374533742333432336916001020240000201600005338953374118002110910108000080000010160000000160035000351600356136000502071681353386066216000080000105337553390533755339053390