Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, S)

Test 1: uops

Code:

  ldr s0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 20 | 22 | 29 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | aa | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10051052700001193010040102516311152325200010001000100010005073045824110151040104082439132000100010001040104011100110001000102602751026141120103010385192379000732162210371000332101000100010411041104110411041
10041040800010072361005010256579124520001000100010001000492104582401015104010408243898200010001000104010401110011000100010250251102851260221310234031563000732162210381000213301000100010411041104110411041
100410408010001572610020102565141419252000100010001000100050754458241101510401040824389820001000100010401040111001100010001043724810409112003310283041739740732162210371000242501000100010411041104110411041
100410407111100716100220102593129182520001000100010001000507544582411015104010408243898200010001000104010401110011000100010291216810477110002910203042571000732162210371000212801000100010411041104110411041
100410408000000526100301025037111925200010001000100010005074645824110151040104082438982000100010001055104011100110001000100000551035910002210193321847000732162210371000242201000100010541056104110411041
1004104071001005801003010258411522252000100010001000100050738458251101510401040824391020001000100010401040111001100010001017003110333113062410152941740000732162210371000211901000100010411041104110411041
1004104070000005201002010402048521252000100010001000100050738458240101510401040824389820001000100010401040111001100010001026190551037911002210233432539710732162210381000242901000100010411041104110411041
100410408110100582610040102510556192520001000100010001000507144582411021104010558243910200010001000104010401110011000100010227248102921180122110303652947703732162210371000242201000100010411041104110411041
1004104081011005521100301025011072325200010001000100010005074645825110151040104082439132000100010001040104011100110001000100680471034001002710202832347610732162210371000242201000100010411041104110411041
100410408111101731510030102511291013252000100010001000100050706458240101510401040824389820001000100010401040111001100010001030838010444117082910404142655600732162210371000292701000100010411041104110411041

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr s0, [x6, #8]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1888

retire (01) | cycle (02) | 03 | 04 | 05 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9e | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502091218839120000000060308421007842108121689792211213652570469503501011710000401001000010000107684146090724642614012163312168812163211496531153756010030200100001000060200100001000012182412169211502011009910040100100001000000100109040132553106812171391172291091513721290350321377698121362502608629087761000050100121640122121121840121763121728
502041217839120000000159908531007361100121692820211214622570457503441012310000401001000010046107793945987044652878012150412175712174811489731153256010030200100001000060200100001000012184612176911502011009910040100100001000000100109161131527107242141493872301091513031281150321377668121490502769269407631000050100121711121580122110121744121678
502041217829101100000057008231007362140121699776211215162570514503761013510000401001000010000107667046027534654528012162612153012173711500531153896010030200100001000060200100001000012139812169711502011009910040100100001000000100109191138534107072081290072301091412521131190321387698121481502408588907751000050100121742121796121793121944121769
5020412184291101000000606082610070421121216257942112183025704635032610139100004010010000100001077012460238846521800121683121719121790114929311542560100302001000010000602001000010000121860121675115020110099100401001000010000001001091011425121068721410929503610929125212111250321367668121252502649508608421000050100121687121703121828121841121659
502041218019110000000057208221007363108121623807111215932570472503641013310000401001000010000107644546047184644276012180712154412168211511431151776010030200100001000060200100001000012172512164911502011009910040100100001000000100109100145573107232121189972401092212111310340321587688121362502649209407941000050100121836121532121758121908121779
5020412177991100000010581081710072011441215087991112143625704725036210114100004010010160100001075482459983646493010121864121754121747114840311526960100302001000010000602001000010000121741121641115020110099100401001000010000101001093301375171069621715927343210947134112903110321387687121334502569549008391000050100121630121743121870121647121887
5020412197491100000000563082310069611281217397992112150725705025036810132100004010010000100001078452460371646513090121554121742121684115216214116840660993354011097111606679811098111551241311241312815020110099100401001000010000001001092901335401069421412955307935910889122112203110321387678121498505049609328081000050100123881124028124300124361124436
5020412415393120000028293459264079610076831361219747921112161039070872506061017710062445181122711174114714446633954713884012160712176812176811489631154266010030325100001000060200100001000012159212174521502011009910040100100001000000100109493131566107532151192232331092313021241670321367679121457502401093116410021000050100121994122097121997121900121941
502041219309140110000047108240017042160121940801111214352570526503581013210000401001000010000107901046021484657458112195712185112185711501331157206010030200100001000060200100001000012195112200111502011009910040100100001000000100109251137521106882191192234421096313231181911032137768812164850260109510569361000050100121888121914121958121994121926
50204121828913010000004130832001776314812199982321121572257048450374101361000040100100001000010788484609828465660711217581220271220231152413115729601003020010000100006020010000100001219901218682150201100991004010010000100001010010908112453110725211149523830109471193131115032135767912166450278101310629381000050100121852121993122064121867121754

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1921

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | c5 | cf | d2 | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50029122068914000000421084517520140121906785101214842570421502721013210000400101000010000107990546150424658281012183612192212186411515231156536001030020100001000060020100001000012190512179011500211091040010100001000001010894110450810782204129264811109451352114125031400482002412156950260115310669811000050010121789121956121751121937121925
5002412199991310000039708291728012812199980920121647357043050274101261000040010100001000010786634606121466261701218821218361219151151983115617600103002010000100006002010000100001219231219111150021109104001010000100000101091301145561076621614920442610902128213801510031400482004212166050266104911739761000050010122095122123121789121974121936
500241218239120000004240809173601441220807912012168425703795028210135100004001010000100001078240461032746563600121783121866122084115013311562960010300201000010000600201000010000121963121780115002110910400101000010000010108730104529107531942095948151094212511370390314002820042121568502561125113110661000050010121935122110121826122113121940
5002412208491300000041808291704096121869816201217072570394502901012910000400101000010000108032846104914661611012183012170812199411520131156316001030020100001000060020100001000012191212190711500211091040010100001000001010919110553510759213109313817109691332137007031400282005312162250272110612129261000050010122020121897121871121965122346
500241218059140000004150819172001281219647891112180125704035030610126100004001010000100001078681461111446663290121751121946121648115164311533860010300201000010000600201000010000121954121942115002110910400101000010000010108921109523107592121294042211092412521281310031400682005312167250304117011999151000050010121769121958122132121882122115
50024121772914000000400080526400124121803828111215222570412502861013410000400101000010000107927546092694661706012199812188212201511518031155666001030020100001000060020100001000012199912192811500211091040010100001000001010904011353210745203108733025109371351144008031400282014612173950272106011199651000050010122013122016121816121951121940
50024121820913000000433085217520112122036807201215022570388502701013910000400101000010000107907746054374662516012199012207012188111529431155546001030020100001000060020100001000012182412185411500211091040010100001000001010928010553710741216109437216109271312132008031400382005212189850268115711169231000050010122191122000121985121867122055
50024121838912000000453083717200104122062844201215542570451502621013010000400101000010000107858246103574665641012186112179012204511511631156276001030020100001000060020100001000012187412221811500211091040010100001000011010973111454010773218238854825109041202130037031400482003512173150266117098910401000050010121909122055121902121839122032
5002412194991400000041008481760084122012811101216712570415503061013210000400101000010000107898746078414660443012190512198212191111525331156246001030020100001000060020100001000012196812202211500211091040010100001000001010915012453510798208118863219108961422119031103140048200631216215024210909829151000050010121763121997121904121916122387
50024122024913000000447080817121120122056811101215522570418502681012710000400101000010000108058046091964651543012168912195812189611515231156036001030020100001000060020100001000012192812188411500211091040010100001000001010893011153310759204138967425109281272132035031400282002412163250244102110828931000050010121936121976121971122018122160

Test 3: throughput

Count: 8

Code:

  ldr s0, [x6, #8]!
  ldr s0, [x7, #8]!
  ldr s0, [x8, #8]!
  ldr s0, [x9, #8]!
  ldr s0, [x10, #8]!
  ldr s0, [x11, #8]!
  ldr s0, [x12, #8]!
  ldr s0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3676

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802092968622020000007043078817041112042937677353019681838219525160147801518000080100800004007421298926482950029279293321933431950516010020080000200800002921529494118020110099331001008000080000010080885193875537085206643188695650078601369311054985450192651101171129423228004351552411680000801002934029293291832920229407
802042936521910000007128076716961101882942182443921302125227625160142801498000080100800004007491300728422928529589294211940131939416010020080000200800002940929544118020110099251001008000080000010080889194116067085169652128703852298601479411450244519190551101161129594288004760452810480000801002940929181293402942129315
8020429341220101000069150777162410810829475770462175115912225251604758015580000801008000040076412881683729720294012952419348319444160100200800002008000029411294931180201100993010010080000800000100809271937658940856416351583410248368573776613353994988190351101161129419278005051050911680000801002947229239293432938129256
8020429448221101100068950791155210510029249798495171217592002251601458015380000801008000040079012920214829523292302929819307319403160100200800002008000029603293381180201100991810010080000800000100809101937156610850676411485320450268553973012052505460190951101161129563228004855054111580000801002946229288294782932829579
802042939421810000007176076417281143082936078944717811742216125160150801428000080100800004007401302765372933429154295611911531943316010020080000200800002940429305118020110099241001008000080000010080879203775162085639646148543856328629581512351764829194551101171129473318005348158011780000801002957629262293432919729544
802042940722010000117074077016001051122939680245717861867206825160143801578000080100800004008181295049582952529412294421927131950216010020080000200800002963729338118020110099281001008000080000010080902373965654085276655138515257028594277313952085603370451101161129147198005455052311080000801002948629600296362943529177
80204296432202222100690407501752104962944482351319841975209025160156801528000080100800004007651304121452951729332293441933531922416010020080000200800002939129375118020110099351001008000080000010080900373465745085534620138515452838597269311951904420372551101171129447358005551954512780000801002919729490294672932829331
802042956322120000006834078717281051162942077559117831734195825160161801528000080100800004008341303531462947229243294211951531931216010020080000200800002943229189118020110099281001008000080000010080876373865636085303680158633654758565772612257245047373051101171129444348004651758411280000801002941829371295362951729410
8020429304219202000071230869169610022829450803426175819002076251601548015280000801008000040076312924294729579293182943519146319272160100200800002008000029377295731180201100993210010080000800000100809283834854890852336391388113449878592373012651265326353951101161129320298005057360412380000801002944229223294592946829338
802042948622121000006936077916001001162942775845616601840201725160140801428000080100800004007641296804552949129531294191932331925416010020080000200800002935229352118020110099261001008000080000010080877363535642085335661158804050858596284612546654733360351101171129476298005460849912080000801002948029430292522938829437

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3674

retire (01) | cycle (02) | 03 | 07 | 0a | 0e | 18 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 61 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c3 | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800292987822000007145084417121131362953382435518481759217425160053800628000080010800004002401297695104629450294692949819530319596160010208000020800002954029350118002110950101080000800001108096340359538524168411968624998861117251354893524003502000416242959223800495815768980000800102948529461293782947229372
80024293682210000677208341680120116296288063571835172421992516005080053800008001080000400264130449810542965729405294511920031947416001020800002080000292782936711800211095210108000080000010809573766060851036639943725028863007841364720552900502000216442948638800365845769880000800102926629353293282911329387
8002429637221000070360830172813596294597873281810173721642516005480061800008001080000400275129943215502948429423295441949131934516001020800002080000294832957411800211095710108000080000010808833855555855156729935705289863227771304929587102502050416242947526800436315479380000800102944429326294122951329338
80024295972210000687908381728971482921282734918891986232025160060800498000080010800004002891286429155529464295382945919250319567160010208000020800002958129512118002110958101080000800000108090636953128506268611947444575858868351254778538532502050416442986631800466375769480000800102937729586293342949529674
80024294712210000655788831181612012829544840360192215772283251600548004480000800108017840033513029720542293942937829592194243191041600102080384208000029337296311180021109571010800008000001080955386564685071692109417253968574580612646875981005020012162429519438005056954710280000800102952629679293142952429635
8002429307220000267500829168811112429486801347182718061946251600508004880000800108000040027212983951042292412921229465194573194411600102080000208000029404293151180021109471010800008000001080924428528785309684149156054938625474113144625681315020044167329523448004365160710480000800102953229535297012958129338
80024296262210000748508541720115128295208313421732162619912516006980057800008001080000400286129692911042292792928429508193933193021600102080000208000029447294251180021109341010800008000001080943407571585431713119276251648611384013547975556065020104216442939324800406245038880000800102947629397293982937829399
800242935822100006577083117201211402938181031218301779209725160049800518000080010800004002901299505105129372293552917619282319485160010208000020800002930329307118002110960101080000800000108092740456408566068113945445073863457471275158531703502000216242912022800435545489480000800102936129322294932935729246
80024294572190000694808591776128116294067883031976182921482516005980052800008001080000400246128573911040296222964729155194283194061600102080000208000029589293841180021109581010800008000001080954385531885207719199284251198591485014349185541005043057165629603348004154257611180000800102947529385293712950329443
80024293072200010630708611720118140294798373371635186821676016006080061802608019480000400285130199600352936029334295001950531946116001020800002080192293322947711800211094410108000080000010809293735625847816729937445136860448191284710540200502000416422921031800355935499780000800102940829527294642938429465