Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, S)

Test 1: uops

Code:

  ldr s0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 1e | 20 | 22 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | aa | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100510408010056121101025111671425200010001000100010005074645824110151040104082438982000100010001040104011100110001000010160062102600130831102340628480073216111037100027271000100010411044104110411041
1004104070000512003010250147152520001000100010001000507624582411015104010408243898200010001000104010401110011000100001000003710250000060104122424480073116111037100027281000100010411041104110411041
1004104380000431301010257135162520001000100010001000507624582411015104010408243898200010001000104010401110011000100001020120461029201601829103727516320073116111037100028231000100010411041104110411041
10041040810004420010102571512125200010001000100010005077045824110151040104082438982000100010001040104011100110001000010070037103500001225103921318320073116111037100027341000100010411041104110411041
100410458010056100401025171121125200010001000100010005077845824110151040104082438982000100010001040104011100110001000010190069103122150022103129220560073116111037100030321000100010411041104110411041
100410558000079001010430111222520001000100010001000507464582411015104010408243897200010001000104010401110011000100001000006110340000027102336616640073117111037100035351000100010411041104110411041
100410518000061001010257144182520001000100010001000507064582211016104010408243898200010001000104010401110011000100001000004210280000035102728525240973116111037100020301000100010411041104110411041
1004104080000631215010250244152520001000100010001000507464582811015104010408243898200010001000104010401110011000100001000026710341320321238103528622560073116111037100020421000100010411041104110411041
100410407000065241101025727419252000100010001000100050762458241101510401040824389820001000100010401040111001100010000103400451026401202224104521621320073116111037100027351000100010411041104110411041
10041044700005513012010250131112520001000100010001000506984582411015104010408243898200010001000104010401110011000100001000005810200146014461043557174002373116111037100025331000100010411041104110411057

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr s0, [x6], #8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1914

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020912201891310001000596083910079201481217567923712145525704965035810127100004010010000100001083586460405046467911121913121733121766115201311555760100302001000010000611941000010168121733121764215020110099100401001000010000010010915111752810711208139548645109611393134101203210176111215035034010049168561000050100121783121656121762121644122255
50204121680912100001005761768261007522108121707809211213952570543503881013510002401001000010000107771446065744653825112217812174412183411507331154266010030200100001000060200100001000012189812184911502011009910040100100001000001001089621285521071721810930843410947132111212040321017611121738502908589327941000050100121705121783121553121689121630
50204121782914100000005060852100736111612180381112121433257050550336101441000040100100001000010767424605004465603611217291217751217691151313115369601003020010000100006020010000100001217281216021150201100991004010010000100000100109331116551107172061194484331091212531191030321011711121594502629148668291000050100121786121813121639121879121840
50204121788911210000005310860100752216812181682022121473257050850370101281000040100100001000010771804605653465527811218051220511219531149733115381601003020010000100006020010000100001218731217271150201100991004010010000100000100109343115532107302091092782411097612641362031321017611121496502819429187241000050100121852122160121603121745121696
502041220249131000000054608291005521108121653810321213762570478503681012310000401001000010000107651745978754650869112171212168612158211491531152906010030200100001000060200100001000012167312179211502011009910040100100001000011001095631155221073622013924683010905133411810703210176111214845025296210569651000050100121906121810121809121970121625
5020412187891320000000546084210073611081219298293312135425704785035010130100004010010000100001086161460155046481831121772121860121815114889311551460100302001000010000602001000010000121802121783115020110099100401001000010000010010948111556510707209994632271089913421212050321017611121641502389048567731000050100121704121790121882121757121767
5020412174991111000000486085710073621041217608114112125025704905037610142100004010010000100001077030460006046463411121692121824121758114940311552460100302001000010000602001000010000121687121727115020110099100401001000010000010010919199539107302161994186132109421265124205960321017611121605502589229987871000050100121727121839121773121793121744
502041217629131001000050308361007202136121865802131214602570484503581014010000401001063511024114900146520154703168112370812416012430411613121111696166095335421113911149671281117611154124796124337301502011009910040100100001000001001097811215441078121911906468416310937124212213303495430723123513505449809028151000050100124041124654124892124220124999
5020412442093500000033316020817100568110012430581031121435257047850364101291000040100100001000010776844604604465045011219271217001216341148223115369601003020010000100006020010000100001216361217711150201100991004010010000100000100109004115529107262311212407443109741202127235032101761212158950264111911649221000050100121898121936121851121987121996
50204121951913100000003920810010744210012192984022121688257051450370101311000040100100001000010791474608233465557611218301218561218081152173115527601003020010000100006020010000100001221171219741150201100991004010010000100000100109593121586107392031398272161095212341412614032101761112181550262111710409071000050100121881122026121841121978122011

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1893

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | c5 | branch mispredict (cb) | cf | icache miss (d3) | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500291220269155000010415083717603120122050763411214382570379502901012110000400101000010000107982446206854653114012177801217001217281151600311559660010300201000010000600201000010000121992122058115002110910400101000010000010108832111535106961991187944301087512451362350031400982057121717502601128107310051000050010121832121847121836121828121778
50024122054913210000039308081712225212196183240121698257040050272101351000040010100001000010789694602654466147601217820121877121930115300031155836001030020100001000060020100001000012198712196311500211091040010100001000001010880129853510722209995468341090411781314330131400782067121674502549829889551000050010121980121824121772121818122019
5002412182891444000004510804178441081218017874012169525704215026210133100004001010000100001078267460841246605750121829012192112180311524003115512600103002010000100006002010000100001217411219751150021109104001010000100000101090881165291072020399107833109311238122443003140068207612162950274115010268651000050010121808121900121758121899121933
50024122057914300000048008051720312012184579731121340257041850272101321000040010100001000010784294600013465915401218960121852122103115254031154266001030020100001000060020100001000012191812179411500211091040010100001000001010932310953310703208109134631109591177128309003140068206101216875029298011149291000050010121967121685122064122086121973
500241220179143003000455079117524140121877807511215772570412502821014310000400101000010000107914046159184658089012181801217561219211152180311565360010300201000010000600201000010000121761121840115002110910400101000010000010109183108538107082091597850371092112151203930031400108201010121654502449729958761000050010121846121900121964121868121999
5002412202791250000104740778171241081217358085012136345704365027810139100004001010000100001079428460947646575520122049012193912187611532303115449600103002010000100006002010000100001219991219361150021109104001010000100000101089641205301073521279144829109191267130498003140068209712162150282105911339811000050010121833121835121989121795122092
5002412179991330000004430821170439212191278550121554257038850270101241000040010100001000010772594604969465488801217450121808121799115009031154526001030020100001000060020100001000012204612186511500211091040010100001000001010878610152710691194138996431109211166134303003140088208912158550246106810989581000050010121742121887121960121938122122
50024121957912400000050708001672410412191077750121650257041850252101331000040010100001000010792394606699466577501218760121810121971115540031154816001030020100001000060020100001000012182212188411500211091040010100001000001010901410153410712193119147230108931358129401200314007820761215275025010519929341000050010121872121872121815121919121800
50024121921912400400042208301720313612194976830121691257043050300101301000040010100001000010778444605650465857901217870121914121652115073031154006001030020100481000060020100001000012193212194711500211091040010100001000001010875411152210694211118954624109221217123331100314001182010712169350256101411159661000050010121693121854121861122028122061
5002412185391330000004640822171231001218747914012156825703825028410140100004001010000100001078159460801646600840121719012187812211911527303115570600103002010000100006002010000100001218121217701150021109104001010000100000101087931155251069121312903683210876123711430300314007820101012154650260107410219251000050010121907122055121707122039121948

Test 3: throughput

Count: 8

Code:

  ldr s0, [x6], #8
  ldr s0, [x7], #8
  ldr s0, [x8], #8
  ldr s0, [x9], #8
  ldr s0, [x10], #8
  ldr s0, [x11], #8
  ldr s0, [x12], #8
  ldr s0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3683

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0f | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80209301742211101069177761760115116295118206591923192322212516015880152800008010080000400849130136814629355295722943519066319370160100200800002008000029191294261180201100992710010080000800001100808891734957840854396611687242581986350803133541451921703051101171129352258005952650612080000801002944429667294292946829474
8020429477221100007329815178498152295697926831945202821092516014980146800008010080000400759130148115029646294672960219326319350160100200800002008000029474295581180201100994610010080000800001100809121839359020856716041687436542486005703127505452981803051101171129579208004658359011880000801002947129506293922937529612
802042932422200000725576817129814429466777545186419142077251601528015680000801008000040084113002361542947129301292041912731941416010020080000200800002930029351118020110099261001008000080000110080896039253280862036961887786559786128716116555646030010051101171129515248003869158112480000801002983129447295102940729448
80204294062200000069107781656118132294007725602075196622302516013780152800008010080000400806129590615229728293532924619495319377160100200800002008000029461294621180201100991810010080000800001100808780366535908587264614903645420862657941225600508900305110117112937621800525064579680000801002939529478293462927329277
80204293982210000070877961680991002947274462218841950217625160153801558000080100800004007501306662161295852954529569193483196051601002008000020080000294722925411802011009930100100800008000011008088003875541085368655148503046978643875013751995273093051101161129471178004456348111080000801002958029750295912935229662
8020429501222000107012835167210311629291779735195621432187165160140801458000080100800004007771289060152292402974929559194203194651601002008000020080000294962929011802011009921100100800008000011008091403375701085870598148945656108658774712348464649033051101161129337318004954860613280000801002939929595296632959729520
80204294972220000076688461712931122951378169518921855205025160154801418000080100800004008091287518143293682933129297191583194591601002008000020080000292332933911802011009924100100800008000011008085603895913085623653159123058228648676912654975086006051101161129469248005351950511080000801002950529285293202930829321
802042947322100000726282517441212482940081385019121911230225160153801548000080100800004007571301731148296632926429388195583192371601002008000020080000294952957911802011009926100100800008000011008087404155318085747641158686651008610573012351655025003051101161129403278004755658011380000801002934929467294882950729356
802042946622100000764680816161121442951075079618122112215925160152801518000080100800004007841301274140295932931929509194133194031601002008000020080000292722935811802011009932100100800008000011008090003715186085610707119132856948626778011456025391003051101171129280308005355553412080000801002957829718293112962729487
802042932922100000702378717121011522944879552319021856229225160155801448000080100800004008141302743149295982966229262192173195981601002008000020080000292502955611802011009944100100800008000011008091403455890085191681189061005811861387421305393522600005110117112941621800526285559180000801002948729403294842956229419

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3641

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800292941921820001106232082110720971682900378542019512019199725160050800638000080010800004002851283383038290632919729077191933190861600102080000208000029027288961180021109241010800008000011080887043954050848517631089742490985627614142546455670035020916542926528800583833765980000800102917329209292022925429373
8002429331218000000070250788107201211082905081247519341956202725160059800608000080010800004002811281646043290652903829256191293190111600102080000208000029098290541180021109261010800008000011080905043645620845637781291580467385430780126406444330035020316242892331800483543116980000800102910929056292242914229208
800242914821900010006555078810752901162907479338318462024181425160050800588000080010800004002631286383047291142908829109191553189601600102080000208000029025292131180021109191010800008000011080897043444410844477321190280472585476763117475346190035020416442901841800394112736980000800102918129015291712916029017
8002429017216110000061310799107201189629129791382213620631746251600488005780000800108000040033812785240302896029115289381903431904516001020800002080000290032909311800211091610108000080000110808901842351590847077328904444350853847651355054499218035020416452903727800304143555380000800102919129173292422918828949
8002429230218000000063130816107041091442920479438919672002188225160364800528000080010800004002691274437045291222907229261192063190471600102080000208000029147289371180021109171010800008000011080912043348370845387696881122461685032762117487643610035020416432897819800403233585480000800102905729061291292897429150
800242912521710000006672082410720108922893780638018092160175025160054800488000080010800004002551285421139291552912229025190763192761600102080000208000029170290711180021109151010800008000011080918047550780849597648909324950854317491274292481513395020216242909438800363133496580000800102910428932292582897028896
800242885221700000006749080710728103112289557813691890214017892516006080054800008001080000400228127909313529106290852913319350319057160010208000020800002910129142118002110925101080000800001108091204365243084816775689736469785743712127483248670035020316462905526800454033536480000800102919629234291292887629157
800242920121700000005978081810712101144290117943891889191118902516005380053800008001080000400318128035103629281291692898019130319018160010208000020800002909929199118002110920101080000800001108091818428482608468278210936324484861827401214609495317045020416442909927800353763735380000800102921029144291332919629238
80024291042180000000638908041074496112289417954201659180217812516004880050800008001080000400293128569713928851291832912419203319096160010208000020800002912129135118002110925101080000800001108091104615536084986748790348445985857769122420251100035020216662895032800363853635880000800102910929092291172917329135
800242915421800000006221080510768951122904877040921022067199125160054800528000080194800004002771282842143289852931329089190743191291600102080000208000029189291641180021109211010800008000011080882048950690847547388890384706852976741234628545000155020616662898131800503503546080000800102900929056290172928429162