Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, Q)

Test 1: uops

Code:

  ldr q0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 1e | 20 | 22 | 23 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a2 | a3 | a6 | a7 | a8 | a9 | aa | ab | ac | af | b1 | b5 | b6 | bb | dcache load miss (bf) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1005105981000970000312102562561437252000100010001000100050746458231101510411040824389720001000100010401040111001100010000103628510548135083410551256783885007321622103701000473501000100010411041104110411041
1004104070000962410050102543441025252000100010001000100050738458241101510401040824389820001000100010401040111001100010000102408010476025005710351253993278007321622103701000352901000100010411041104110411041
1004104080000880100524102542566242520001000100010001000507384582301015104010408243898200010001000104010401110011000100001008096106210019084110491255894387007321622103701000162901000100010411041104110411041
1004104070000751200030102516512192625200010001000100010005072245816010151040104082438982000100010001040104011100110001000010216103104719029062510461254454752007321622103701000293901000100010411041104110411041
10041040800009125100301025323411302520001000100010001000507224582301015104010408243898200010001000104010401110011000100001023275105491230103510831254183181007321622103701000402101000100010411041104110411041
1004104070000872710042410252245523252000100010001000100050746458241101510401040824389820001000100010401040111001100010000101905810366019083710421254773069007321622103701000443401000100010411041104110411041
100410408000092000020102523461212520001000100010001000507544582301015104010408243897200010001000104010401110011000100001028357105361230143610541254353159007321622103711000433001000100010411041104110411041
100410407000093000040102537267262520001000100010001000507144582301015104010408243897200010001000104010401110011000100001031076105670360164110531254262452007321622103701000373001000100010411041104110411041
1004104070000770000401025246862125200010001000100010005067445815110151040104082438982000100010001040104011100110001000010000701027000003210321254783769007321622103711000483901000100010411041104110411041
100410408000076191003410251861171925200010001000100010005073045824010151040104082438982000100010001040104011100110001000010200991038000004310841255783169007321722103701000262301000100010411041104110411041

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6], #8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 10.0801

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 24 | 3a | 3f | 40 | 43 | 46 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b1 | b2 | b5 | bb | be | dcache load miss (bf) | dtlb miss (c1) | c3 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020913434999900102140710542130656105808129565257189151304106001000040100100001000011573614935505501227901299550129557130069123699312460860100302001000010000602001000010000129330129617115020110099100401001000010000010010015054810071136691007412504640684528000321017611128999511205450566052441000050100129653129656129687129470129380
5020412912598600001950710549130462759016129584257192451304105951000040100100001000011594944951753498137701295800130136130960123966312344260100302001000010000602001000010000129859129603115020110099100401001000010000010010015054210074337831007812504494674502000321017611129391512125887614554871000050100129561129745129828130171130005
5020412990698900002060710574129779758012130099257190651322106191000040100100001000011535224899631493624611291890129243129755122992312334060100302001000010000602001000010000129834129457115020110099100401001000010000010010014054510073296701007412504530684531000321017611130095511825974598152831000050100129809129281129649130273130144
50204129547987000019809105081298141155015129502257189751471107011005644235111141113512188144984465504313411316710130919132074124294227125689665143376211177112376740411218111961326321323453115020110099100401001000010000010010047257710109169609710074125045366545660003874129413131423514955932626956621000050100132760131956133297132669132551
50204132993100600013394413271056812955512600161294604772014513881061610000401001000010000115802748971634999258013029501296731233631237413123571601003020010000100006048610000100001293201289802150201100991004010010000100000100100170524100231773001002412504535154531000321017611128964511046043596152141000050100129300129229129057129085128787
50204130976972000016707015531299361010091290662571717513501063310004401001008210000115127248824654944157112907001296941294921229073122817601003020010000100006045010080100821295561292521150201100991004010010000100000100100140522100222072951002212504534164486000321017611128994509925222554251131000050100128876128792128722128661128643
50204130552967000017717690154712951578061292182571795512281058710000401001000010000114815448784984939162012877401294821294021224823123091601003020010000100006020010000100001291851290491150201100991004010010000100000100100170573100301691031002712504518184512000321017611128908510405657552649051000050100128761129132128773128678128840
5020413102897101101510701521129445119161284532571813511581052010000401001000010000114245348875424932721112982201296531291571232843122862601003020010000100006020010000100001290601287291150201100991004010010000100000100100160516100261272841002712504517174521000321017611128763509865547575050471000050100129199128541129018130055129108
5020413166997001001650701530129297108051288202571804512541054310000401001000010000114192948945534951974112998901299661293991227103122403601003020010000100006020010000100001290561284451150201100991004010010000100000100100170542100291091881002612504473164536100321017611129252510386102564851141000050100129061129923129190129098129132
502041309539680110175012015061292676706128818257161851180106021000040100100001000011506784901320495064511288480130043129861122460312355560100302001000010000602001000010000129110128612115020110099100401001000010000010010015055110029169471002612504511164570000321017611129320510445689587556441000050100128561128280128958128894128928

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.9254

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 3a | 3f | 40 | 43 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b1 | b2 | b5 | bb | be | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50029133181974100000001450710522128963744128848257157351080105431000040010100001000011545644911809496701111298380128897129175122519312280560010300201000010000600201000010000128571128983115002110910400101000010000101001705501002118101810021125046341345250000314048233128675511326319596256531000050010129318129236129566129592129029
500241298339710000000014501010492129371101412884425716815109810561100004001010000100001146774488264149507621129476012936112956912235631233386001030020100001000060020100001004913018512940711500211091040010100001000010100150563100181771510021125045121245560000314038233129190510705884599153401000050010129246129556129430129065129509
50024129133962000000001390910505129221104213025925715915113010508100004001010000100001146064491124349539361129342012941012892712232531231116001030020100001000060020100001000012932712897911500211091040010100001000010100170537100191571810019125045191345130000314038223128654510645698640957371000050010130818129944129008129104130079
5002412806896810010000217091051412933171312989725717745105810568100004001010000100001151926493154049683160128889012913412966712267031224356001030020100001000060020100001000012915512968311500211091040010100001000010100212567100201291910018125045321245040000314028233128620510166096598053351000050010128844129463129063128962131068
5002412892496700000000157071054112826174212908425716485106210516100004001010000100001150999491139549839880129626012927112897812239531222586001030020100001000060020100001000012891612889011500211091040010100001000010100140526100211561710018125045331145730000314038224129307511385882563561721000050010128469128632128887128538129389
50024129148966000000001810610525129191104312874325715615105810539100004001010000100001146405488060049417351129641012893412922912218331228536001030020100001000060020100001000012903912912411500211091040010100001000010100170576100211572310020125045291245701100314038234128618510965846636058561000050010129165129257131115129448128716
50024128358963000000001630111051912934293412889425714835100610499100004001010000100001143804486882149513501129694012885312894412231331225576001030020100001000060020100001000012853512932211500211091040010100001000010100170571100181692510018125045211045480000314038233128933511545601612955221000050010128332128805129077129105129763
50024128728972000000001710710513129171124212903725716455110610542100004001010000100001144434490339049386211129466012874412944312225031230596001030020100001000060020100001000012963012927911500211091040010100001000010100201563100201782110021125045761145600000314038233129653511165721583151811000050010128687129092129192129650128979
500241296909651110000016101110557128945104412903025716065109410533100004001010000100001158635498754549611171129420012934313015712248631225436001030020100001000060020100001000012960712933011500211091040010100001000010100191514100261672210020125045171145261100314038232128643510885806577552881000050010129627129140129343128841129850
50024128625961111000001410710482129075743128460257157951066105421000040010100001000011426254877998493344811293980129497129124122932312295560010300201000010000600201000010000129321129111115002110910400101000010000101001705161001923111810021125045631145090014103140312532128744510805939629853531000050010129625130422129216129679129203

Test 3: throughput

Count: 8

Code:

  ldr q0, [x6], #8
  ldr q0, [x7], #8
  ldr q0, [x8], #8
  ldr q0, [x9], #8
  ldr q0, [x10], #8
  ldr q0, [x11], #8
  ldr q0, [x12], #8
  ldr q0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3790

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b1 | b2 | b5 | b6 | bb | be | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802093083122810010007637085216244163083036180257113801681295825160132801478000080100800004007411341381139302703043330294201253203921601002008000020080000303553021311802011009947100100800008000001008100336385556660686315786158726854298593610000326421325556325656160451101171130426179800444064097480000801003018530022300993034830203
80204302102251000000754207741568448408302717765411477126329232516014580130800008010080000400771134906514330046303593048620249320236160100200800002008000030361302601180201100995510010080000800000100810023540856436318581578898927659738700310000326601256142324705340351101171130409153800413243697580000801002995630309304653035330273
80204305072262202000785408041552419100302897995431526144629932516014180131800008010080000400759132794714630101304533041420147320173160100200800002008000030361301661180201100993810010080000800000100809853542555286148621976088988058808682210000326061135852324994330351101161130112161800383604028580000801003041230308303403052230196
80204302442241001000714108121568392160301308215391225134928162516015080137800008010080000400808132778913830255302323026620171320306160100200800002008000030361304861180201100994610010080000800000100810404141355806048592670379486458748670310000326501255734324884372351101171130429182800463434018580000801003049530416300783035630150
802043031922711010007247078414484342003002075954813931431299425160140801428000080100800004007321330854149303273050330490203233200711601002008000020080000302073030511802011009954100100800008000001008098736396501962186171738118713657658657510000326321325734325132371551101161130523164800373804009680000801003025430402305463003730698
80204302522262020000750708631608426236301517856481534182730492516014680140800008010080000400745132797014330284301343037620469320214160100200800002008000030421303371180201100993410010080000800000100811803643153426718600673898705457628622310000327121125722324827150251101171130245152800553724287480000801003017430179303183024130378
802043034022733000007752084616324362003040977854115921540295925160156801438000080100800004007131344572138303253047030217203933202201601002008000020080000300923025611802011009937100100800008000001008094817423552960585710759118723458098629810000326181205704325082181351101171130228180800363913127380000801003039630075305803016630353
802043042722711000007089082315684382403040776960716081375282625160135801308000080100800004008251326148140302963012330326203213201481601002008000020080000303093046311802011009944100100800008000001008100936420534260586313756108884061358641110000325551275836325467353651101171130212181800624443947980000801003008829923301783037330550
80204302942261000000689607851560463104301668076051699146630282516014580136800008010080000400767134126514030320302713026520273320607160100200800002008000030387302731180201100995010010080000800000100809583541750386278563174279004263988632810000325971275741324952310151101171130392189800442853677980000801003014830328301173022630198
802043010822720000207855083616404372123068377060714421495294425160135801428000080100800004007451351006043304363021430289202483203841601002008000020080000300873019911802011009943100100800008000001008096517405507358385689730138663052938646510000326311266031325327170951101171130480165800453244318280000801003043830258301223033330248

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3765

retire (01) | cycle (02) | 03 | 0b | 0f | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b1 | b2 | b5 | b6 | bb | be | dcache load miss (bf) | c2 | c3 | cf | d0 | d5 | d6 | da | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002931009228000702379915924189630077827474181317132820251600518005480000800108000040024813288740443016330276301602038132041916001020800002080000300543031511800211093010108000080000010809334604899643857708561493024527786290100003259812346853249120950201531603230032169800422952925280000800103024830230301263011930359
800242999422600077437911624461168299617665111656154129232516005280054800008001080000400270133748714129928299762994120072320207160010208000020800003023430185118002110931010800008000001080969472519762185728833158514653748624310000326051315020325355355020031603330087149800433042586480000800102995030399303723015830135
800242998222600071728421760470212301347845141722149029002516004880054800008001080000400259134388104629839300323020220061320191160010208000020800003012529833118002110971010800008000001080934442506664485212807118894255328611310000325611174729325492365020031613430260168800443263036980000800102996529986302113013530156
8002430334225000772285012804552683027683145616741762282425160039800518000080010800004002991326969041301783025730093201483204191600102080000208000030274302131180021109161010800008000001080968468506562985063902149325055528629010000325621285571324691035020031603330132152800623363287680000800103014930268300333021630226
80024300322250007788843175235936829816758442186918133002251600498005080000800108000040025213426800402989429838300872033232041316037220800002080000302323039011800211093410108000080000010809884825587653849278921593513257008640510000325711214854324918045031331603329898155800392962737080000800103007430345302333011630101
8002429971226000677882317524552443012481551318171806261025160054800558000080010800004002251324737038303233054530162200713202071600102080000208000030228301711180021109171010800008000001081012460484167386402805119574057958700810000326961284820324983375020031603330292180800322873335180000800103027530113303473032430300
8002429872224000666877016243945483020377947718522013301725160055800568000080010800004003691345279034303453033230242202823200541600102080000208000030035301361180021109201010800008000001080948436533962585172847118785059158624410000326131155064324232035020031603330172146800332832725780000800102996830075302413026130421
800243009422500066657931736502128302748205231673176226562516005180048800008001080000400253132329804530214302473021720415320314160010208000020800003000330217118002110915101080000800000108093445253626158521582798987058088617610000326421225280324686655020031603330112160800453113376180000800103039730385302993027430162
800243022622600080048201416371208297287995371563175330392516004880048800008001080000400272132681904430167301503005520221319894160010208000020800003034430296118002110921010800008000001080984487548063085872851139248860138644710000325781205439324582605020031603330050169800433533315880000800103012930022301743003730276
800243014222500073308261752399200302377746022016187929852516004980066800008001080000400258133764603630161301313004520115320167160010208000020800003022830206118002110971010800008000001080990438541662385481832138864454348662510000325621275461325221025020331603330075175800443073055780000800103020730104303163008930105