Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, Q)

Test 1: uops

Code:

  ldr q0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 20 | 22 | 23 | 29 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b1 | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10051040811100093281003412102913457252520001000100010001000507544582311015104010408243898200010001000104010401110011000100011034627410511055123810691257311579270073116111037100025221000100010411041104110411041
1004104081110009919000050102512216232520001000100010001000507704582411015104010408243898200010001000104010401110011000100001008805010524130623103312556103054727073116111037100025241000100010411041104110411041
10041040811000187191000401025712226252000100010001000100050754458241101510401040824389820001000100010401040111001100010000103070861049603363310571253511286961073116111037100037221000100010411041104110411041
1004104081110007224000050102515292302520001000100010001000507304582411015104010408243898200010001000104010401110011000100001037707810731412403310441258312296060073116111037100023161000100010411041104110411041
100410408111000641410001010251112429252000100010001000100050746458251101510401040824389820001000100010401040111001100010000102282911058401722331034125726348560373116111037100031251000100010411041104110411041
100410407111000800000044010253222220252000100010001000100050754458241101510401040824389820001000100010401040111001100010000102382471054112462510361256593869713073116111037100039471000100010411041104110411041
1004104071110006216000040102520037292520001000100010001000507624582411015104010408243898200010001000104010401110011000100001043909910572134105010561255583950704673116111037100037461000100010411041104110411041
100410407111000972810001010258523212520001000100010001000507384582411015104010408243898200010001000104010401110011000100001018140115106361321841106312588752108711173116111036100025441000100010411045104110411041
100410408110010531810002010250222232520001000100010001000507464582411015104010408243898200010001000104010401110011000100001029808310611311012411046125598345970073116111037100049381000100010411041104110411041
1004104071000008122200040102510303212520001000100010001000507544582401015104010408243898200010001000104010401110011000100001049811051046611305010691257483742624473116111037100025161000100010411041104110411041

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, #8]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.8889

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 3a | 3f | 40 | 43 | 46 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b1 | b2 | b5 | bb | be | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50209133470987410004000017801110583129285712091284992571711511341052510000401001000010000114764448874304939164012878101305361293711224593124598601003020010000100006020010000100001294721292281150201100991004010010000100000100100198571100371511331002912504635214522480321157654129268513446284616151581000050100129514129626129795129419129679
5020412938996821200001001430910542129996101006128693257190051250105421000040100100001000011479504901354494760601293120129471129222122994312345960100302001000010000602001000010000129749129211115020110099100401001000010000010010023357710031916221002712504526244533400321147644128960511245881578950621000050100128995129543129046128696132023
50204131678990280070002728394625487105301316947110101306097047217751439105861000040244100001000011464294906064493434211300440130628130977123203233125527641473392611090109486758211219110671322091316332415020110099100401001000010000010010062254210073129929501006912544511224527300440241241414130858510986233594351721000050100129638129245128849130794131161
502041306191020312200302003430101051712909261206129417257174751212105511000040100100001000011378524846617491132701290820129728128841121734312263960100302001000010000602001000010000128458129478115020110099100401001000010000010010018456810032811231002412504549214511200321149144128848510905485525453651000050100129223129563129426128932129279
502041290019663003031000315070151612812091206128006257170551094104841000040100100001000011426464868387492840901287530128379128695121779312280860100302001000010000602001000010000128732128725115020110099100401001000010000110010017453710029117341002912504547234507330321147644128160511345582558448051000050100129151128817128745129392129334
5020412875496430030000003200701536129251711091280892571840511721055410000401001000010000113920548693994951167012864001287861293001219593122721601003020010000100006020010000100001291041279941150201100991004010010000100000100100173553100341412181002412504517204498330321147644127983510265012474143331000050100128270128402128545128463128546
50204128773965310010000031809016651297727806129098257185551266105201000040100100001000011371444882238491822401289170128139129036122387312257760100302001000010000602001000010000128913128437115020110099100401001000010000010010024257810028127231002612504553184518200321147644128611510865248510045471000050100128186128448128766128423128336
5020412825596230030300003490100150612888971317128718257170551186104981000040100100001000011384584859603491176001284540129086128704122420312250060100302001000010000602001000010000129448128962115020110099100401001000010000010010017251510030159241002712504628244548200321147644128507511785676518049651000050100128988128828129728128716129258
502041295479632000020000339090155112931878051282292571693511401055910000401001000010000114392148936554932131112896801294431289341215443121729601003020010000100006020010000100001291641292961150201100991004010010000100000100100256591100321514291002512504487234505300321147654129318511405381584350731000050100127801128148128979128546130791
502041291029693030000000346090151112889971315128161257162151106105111000240100100001000011405014857401493266201286160128527128562122392312280060100302001000010000602001000010000128507128566115020210099100401001000010000010010017951910028117251002912504503244535330321147644128145510985195595847891000050100128599128738128664129160128183

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.9504

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 20 | 22 | 24 | 2b | 3a | 3f | 40 | 43 | 46 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b1 | b2 | b5 | bb | be | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002913348497620001037707100530129885611107312862525721045133210601100004001010000100001144894489612149473760129443012923312897512328331228386001030020100001000060020100001000012940412906111500211091040010100001000001010017461010130147120101261250463912744852000003140482111128362512605031502846011000050010129236129623129213128999129158
50024129163965202000392071005021293141111506012917625719575124410688100004001010000100001145374488234549522570129252012990012987612257631235366001030020100001000060020100001000013022412983011500211091040010100001000001010016664910131319143101311250451211945332000003140185011129469512985480550751511000050010129472129372129621129316129638
5002412923597020020043408100607129685712706313044725719365144210651100004001010000100001149694491347649681720130063012917712893512237431234046001030020100001000060020100001000012939313063811500211091040010100001000001010019257610132209136101281250452012445002000003140197011129564512765295500646591000050010129487130248131072129787129307
50024129545967202201395071005331302389118178130466257195151296106431000040010100001000011473874923710499004101302160130226129910122368312282960010300201000010000600201000010000130949130925115002110910400101000010000010100192676101301511127101261250451311945532400003140182012129177513665618589449111000050010129259129790129708129786129784
50024129744969200000406010100577129511610806912920025720205126410616100004001010000100001148629491290049667600129650012963012941712266231228266001030020100001000060020100001000012960012947411500211091040010100001000001010021259010143169126101251250450011746572001003140182011129317512025062490245031000050010129273129416129214128900129097
5002412943696722000038307300541129930711707712996625718615125410613100004001010000100001145314491883749543990129880013039412976412289831223976001030020100001000060020100001000012919912896611500211091040010100001000001010019258510127168127101301250452612945012000003140182011130171512585364545649251000050010129850129105129013129362130129
5002412950497422200040107100550130170911306712933325719065143410655100004001010000100001146430493350549670570129329012920812965512249331229476001030020100001000060020100001000012987812970911500211091040010100001000001010024262810130139122101251250448712245242200003140182122129477512464985471349991000050010129672130325130504130225130085
50024129235971220200399071005271300289112077129390257202351294106141000040010100001000011529554907717495534101295080129694129963122860312327460010300201004810000600201000010000129366129252115002110910400101000010000010100214620101351212127101281250452811945682480003140182033129467512745891546549071000050010129368129712129638130451130591
50024129603971200200398010300511129846811506912868325718885128810608100004001010000100001152292491308349600790129880012937512966412335531234576001030020100001000060020100001000012951912923511500211091040010100001000001010019255510130149121101351250451511944522100003140182011129786513445149557148011000050010129220129782129783130140129438
5002412917996920200038809100504129496711707412965825721225140410691100004001010000100001144522491511849488470130364012947513010012327631227456001030020100001000060020100001000012952612967011500211091040010100001000001010031659310151128123101341250451512145442000003140182011129957512584988516349931000050010130221129058130235129736129468

Test 3: throughput

Count: 8

Code:

  ldr q0, [x6, #8]!
  ldr q0, [x7, #8]!
  ldr q0, [x8, #8]!
  ldr q0, [x9, #8]!
  ldr q0, [x10, #8]!
  ldr q0, [x11, #8]!
  ldr q0, [x12, #8]!
  ldr q0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3781

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b1 | b2 | b5 | b6 | bb | be | dcache load miss (bf) | c2 | c3 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020930949227000000007569085716243961003005077160514641396291696160461803258000080100800004007661351748144309193018930467200863201301601002008038820080000302523030011802011009929100100800008000001008122343450786308608772911900705956863181000032707123527432511103052871721130771161813254083957380000801003082731452316433143531374
8020431504235010000777587264791162439640831343807514149414033768269161680812468102681388812464071941369513043312573131331233210411502113916296320281344204813423147031422818020110099521001008000080000010081831458497593986687715129218611083872461011832589118537432453504052651811131068166815113673557680000801003134331696312003153131544
802043174623400000088778770477116004501803127079956714501182369738116139982160811708206581602409910136893415230891317863168421037181213701637202088188620881890317103183511180201100994910010080000800000100817404355572887872677216911134139128706510144326071195459324818070526911052131282168818823764147380000801003209831807315253224131854
802043202423701100091082667047971712437116319437895631563159035704151631458213480388819248191141094413915031483154931893315372085718821384161151202813442048114831344310108180201100994110010080000800000100809284605739634866757947912525859868101000032639136570532500103051101171130255190800524443876780000801003014130204300253018230406
8020430307228000000007031080815683912203010379053313091389278925160143801468000080100800004007421347219145302943045130316201283200291601002008000020080000302823031011802011009932100100800008000001008094744749466448569675810953705807866421000032622107606232567203051101171130354163800444003489280000801003012229985303623020330242
8020430036226000010006467084115844121843026878256513531471282945160151801508000080100800004007251343391042303423026930541199853204141601002008000020080000303363045311802011009924100100800008000001008094841054836238519977113923765325866651000032639120530232468903051101161130029150800313773507980000801003014730266300553010630077
802043019222600000000700308151520453188301217785471382122427592516014580144800008010080000400711133794104530071299873007820243320383160100200800002008000030292303131180201100994810010080000800000100809674225268649860537788912486123864161000032681127495432470833051101171130238163800463653618380000801003025429967302273028230331
802043033422600000000665508181616415124304018425081345150130282516014280144800008010080000400688134125304330463306103021020208320183160100200800002008000030166301641180201100992310010080000800000100809504134845618853567867875465447864651000032638114556932491003051101161130242154800484223197580000801003024230437302373003330443
8020430222227000001006868081912884381523036883549113321646303625160151801368000080100800004007561328661139302283026530498202593201721601002008000020080000302943047711802011009924100100800008000001008093549651166538520580810900525641866261000032621117565132455605051101171130254165800444004347880000801003047030310303303025830236
8020430218227000000007596083916244932243071783548713391528292125160142801418000080100800004007111335537049300973044430222201463204021601002008000020080000301013041311802011009956100100800008000001008093642656196578590472210917745505866641000032661118583932520903051101171130279171800493774077180000801003033330184303133032630305

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3787

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b1 | b2 | b5 | b6 | bb | be | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | df | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80029307722281000171057911656450100305157904621366151131942516005280071800008001080000400240133869201353003830304301752004732031116001020800002080000305903068011800211098510108000080000108094504525068616854777511492844522286599100003258513460083244561815502400031600023430504166802444204118680000800103033230280302673055730355
800243034222820010755678616323902003020678855116501459303025160058800538000080010800004003151335874013630224301853029620145320382160010208000020800003021230291118002110944101080000800001080998314765102590855657329902245419864341000032597117564832460937155502430031600033530392161800363293868380000800103011030251305543018530347
800243039722510110727384517764082242994080152114241473284625160053800448000080010800004002551337277004330299303653051220386320290160010208000020800003030530387118002110934101080000800001080971040157206288608074798763657918669810000326581136119325320005502400041600023430364182800493793508880000800103025230288300533022730285
8002430160227000107392819178444019230205839543142913932819251600548005380000800108000040030613455811135302603031130152200663202041600102080000208000030245303501180021109211010800008000010809280388497164785570766139133664508700310000326311335745325380009502400031600023430363170800443453388480000800103003930350301523031630201
800243016622800000744082016803791123044281162815511463306825160058800578013080010800004002451329377114030254303893019720313320117160010208000020800003064630352118002110957101080000800001080938040251666228613674298794856508671810000326171165961325079003502400031600023430515150800373783778280000800103029230231302403029030594
800243035122900000740878714884311163014578151415961590296825160047800468000080010800004002931328970013830070301583037720113320158160010208000020800003055030108118002110965101080000800001080969043053436298483373478854252138603110000326991265230325209007502400031600033530334160800344583929480000800103010030183303723006130178
8002430049226000007658813162439922830291815562132615433052251600548004980000800108000040027613446110142300133012630437205883201741600102080000208000030464303891180021109401010800008000010809641343851046088523070214882765213865131000032599109518932488716010502500031600033430180176800373013397580000800103007130300305133027930192
8002430286227100007047851162440520429872794463151116113012251600518005580000800108000040023813292170149304253060330332201613202841600102080000208000030290303511180021109441010800008000010809931544752686028574578459052860528582810000326031165661325079066502400021600023430387200800343833547980000800103011030290298773035030242
80024301732271100076008431264439963026880553013211462290025160058800448000080010800004002741318465014330388303123033320281320548160010208000020800003045830132118002110973101080000800001080966184255311653855397589885385217866881000032658130603032475116055024000316000324301531808004836734310480000800103023330388300273022430420
800243019622510100697381316164561723016279652114331480289425160048800588013080010800004002901342102014230078302733020820329320132160010208000020800003035430246118002110940101080000800001081009184215271632854757571086040552386084100003264711952023247451810502430021600032430152159800373883576680000800103030230504303033016330379