Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, three registers, 4H)

Test 1: uops

Code:

  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
630052926921901709000714589288760002407120002000200010005316142285652925131020003000200029060291111161001100010000200004200003262000406001283491816833305954320614303838165263328389164261378115528200010002920129181292332920929328
63004292522190908000714742288870202413620002000200010000416162285132927731020003000200029107291261161001100010000200004200006002000404001288291146827310952920603306638129292828417163701370715625200010002925429288293222919629294
63004292972190120110000144972884000124145200020002000100005161522860029303310200030002000291402910711610011000100002000062000000220004020012854920768633028529205633083381612313128418163461384015598200010002929929290292892913929254
6300429267220011011000514609287950002422320002000200010000216156286672927131020003000200029150291491161001100010000200004200000002000204001281792616884299973720595311738158283428456161211372115585200010002921029234292422928929269
6300429306220013011000714630288250002416720002000200010000116133286362934031020003000200029155290961161001100010000200002200000022000404001295991136813307113320555308438167313028392165151381715464200010002925129295292432921729283
6300429183219010011000514568288450012423420002000200010000016141286332925131020003000200029078290561161001100010000200004200000002000402001283393186820305343220582305738158302928486164951362515856200010002927429212293252927429244
63004292332190906000514579288300002421220002000200010000616134286322928831020003000200029156291711161001100010000200004200201002004402001294491736865306172920573305638139353428365164591373215747200010002924129313293152923629256
6300429281218015011000514639288460002415320002000200010000016156285302917131020003000200029165291201161001100010000200004200000002000402001286090636818305533420640302938118282128442162751355615626200010002926829272292482925629258
63004293002200120160015145322890501024155200020002000100007161542859929229310200030002000290822907911610011000100002000022000010220004020012824899968683016231206123054381610272528414165561367715534200010002922529217292842922929272
630042917522009010000514613287880002414120002000200010000316141285982917331020003000200029144291321161001100010000200006200000002000404001277990896848302753620615317938168262728390164441369315671200010002926029235292412923329342

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
24020553400400100114501353384218182325160100100160000100160000500234554415337853389533922331932335916010020024000020016000053399534021180201100991001008000080000110016000019420160057101631600386035390005110116115339614192160000800001005340053399536465343353413
2402045339940010100210025336621818832516010010016000010016000050023366250533735338153399233213233571601002002400002001600005339853399118020110099100100800008000001001600201900160019000591600380057421910511011611533950090160000800001005338353399536515340653400
24020453399400100002201253383201814025160100100160000100160000500233662505337353399533992332132335616010020024000020016000053381533991180201100991001008000080000010016001919420160058001211600386157421910511011611533780992160000800001005339953399537395339853382
2402045339840011000210025338321818382516010010016000010016000050023373150533565339853399233043233571601002002400002001600005339953398118020110099100100800008000001001600201900160057001621600380157421910511011611533960992160000800001005340053400533825338253413
2402045339940010000650005338421818525160100100160000100160000500235873905335653399534012332232335616010020024000020016000053381533831180201100991001008000080000010016002119420160057202211600386057421900511011611533960992160000800001005340053400536545341453400
2402045339940011100660035338421818352516010010016000010016000050023342010533735339953399233223233391601002002400002001600005339953381118020110099100100800008000001001600191900160057100591600386057421910511011611533950092160000800001005339953400536135341253486
2402045339840011100210025338310181825160100100160000100160000500233608905337353381533982330432333916010020024000020016000053398533981180201100991001008000080000010016001919420160057010211600386119421910511011611533950092160000800001005338253399536545341053399
240204533994001110065003533842180222516010010016000010016000050023471940533745339953399233223233561601002002400002001600005339953399118020110099100100800008000001001600191900160056111211600386157421910511011611533950092160000800001005339953400536435340953400
240204533814001000065003533842018912516010010016000010016000050023342010533735339953399233223233571601002002400002001600005339953381118020110099100100800008000001001600191900160057102591600386157421900511011611533960992160000800001005340053399536565339453399
240204533814001110065003533662181813025160100100160000100160000500233420105337353381533992332132335716010020024000020016000053398533811180201100991001008000080000010016001919420160057000591600386157421900511011611533750662160000800001005337553400536235342553400

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6674

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | icache miss (d3) | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240025533944000100004501015337921212232516001010160000101600005023581800053369533955339423339323374160010202400002016000053394533941180021109101080000800001101600000430160039003916003961043050200121600109534230100416000080000105340753404533955339553395
2400245339440000000045000153379001216251600101016000010160000502345544005336953394533942331932337416001020240000201600005337453374118002110910108000080000110160000043331600390039160039613943050200816001085339101010416000080000105400853410533955339953395
2400245339440000000045010153379212121625160010101600001016000050234554400533495339453394233393233741600102024000020160000533945339411800211091010800008000011016000004301600390039160039613943050200101600118533910100416000080000105340653408533955339553399
24002453394400000000450101533792121216251600101016000010160000502358180005336953394533942333932337416001020240000201600005339453394118002110910108000080000110160000000160039000160000603943050200916008105339101010416000080000105340553388533955337553411
2400245339439900000045010153359212122325160010101600001016000050235295900533695339453394233393233541600102024000020160000533945339411800211091010800008000011016169204337716173310979516172901394305235016133001814544522100416000080000105553155405554485566355346
2400245460241400001314185112321015552220128474031618321116065010160000502337085005334953394533942333932337416001020240000201625005522855347141800211091010800008000001016169404340016108110104081615990104305249115142001717553001010416000080000105556055513554055552655506
24002455859419000017172292149600256222201212855211622241316234011163062562428734005659056944570722528623825432163574202449052016346556142561692018002110910108000080000110161690043016003900016003960394305036010520014145461011010416000080000105339953398534375341453395
24002455350443000017152160140800053379212121625160010101614301316000050235295900533705339553395233193233541600102024000020160000533745339411800211091010800008000011016000004301600391031600396139430502009160010105339101010416000080000105586755847556455542156169
24002456217433000019184500015338021212025160010101600001016019550233732700533495339453394233393233741600102024000020160000533945337411800211091010800008000011016000004301600000039160039603943050200916011085339101010416000080000105339553395533955339553395
2400245339439900000000101533590120162516001010160000101600005023529590053369533945339423339323374160010202400002016000053374533941180021109101080000800000101600000430160039004216003901394305020012160011115338401010716000080000105339753395533955339553395