Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (post-index)

Test 1: uops

Code:

  ldrb w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0f | 1e | 20 | 22 | 24 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200510408006000040102515319414252000100010001000100052820458240104010406993773200010001000100010001044441110011000100001000015910425016121810327021856731161110371000211801000100010411041104110411041
20041041800642210201025153881625200010001000100010005283245823110401040699377320001000100010001000104044111001100010000102801431031201781810323531556731161110371000202201000100010411041104110411041
2004104080086000501033951113232520001000100010001000528004582211040104069937732000100010001000100010404411100110001000010000056101801002310183341888731161110371000312101000100010411041105510411041
2004104080052181030102513219101925200010001000100010005283245823010401040699377320001000100010001000104044111001100010000101400691039101881310203021988731161110371000312201000100010411041104110411041
200410407005918103010251561581725200010001000100010005281645824110411040699377320001000100010001000105144111001100010000100000511025001661610415232764731161110371000292101000100010451045104110411041
200410408005200030103406256212520001000100010001000528124582411040104069937732000100010001000100010404411100110001000010240065101900001810253431948731161110371000322901000100010411041104110411041
200410408005116103010252631361825200010001000100010005280845824110401040699377320001000100010001000104044111001100010002101401521028601681210343342456731161110371000292701000100010491050105010521052
20041040800520002010339521131825200010001000100010005282045823010401044699377320001000100010001000104044111001100010000100800481040502061610224322872731161110361000363451000100010411041104310411041
200410407004218103010259314815252000100010001000100052816458240104010406993773200010001000100010001040441110011000100001014014310362022121610453943456731161110371000242701000100010411041104110411041
20041040800630005121025058131925200010001000100010005281645824010401040699377320001000100010001000104044111001100010000102201641018000123010334642256731161110371000292301000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1892

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50209723325382000044779717283108718298077160625507354066410150401001000061513227224290496884507179671946654843656365010040200100007020010000720153511402011009910010000301001000001001098361744951064527915912382410914152113910300261025811716624053610551131105710000401007192671903718467177171944
50204720435392000046982017361120719028177160325507704061610120401001000061496227214361496889907191371963654683654725010040200100007020010000718833511402011009910010000301001000011001090511714781064526316902762410932130112510400261015811716844048410501030102810000401007164871884718907202771903
5020471874539310003977951712112071983808714602550755405841012840100100006141182729543049687440718877189165387365554501004020010000702001000071980351140201100991001000030100100000100108871163460106432501192736341090113711261040026101581171617405201120900107610000401007179071883717567188671838
50204719765382000051683616802132717817967161325507654064010133401001000061584027249681496868707195471961652763655805010040200100007020010000719463511402011009910010000301001000001001090121444861062626014945762910913117213020300261015811717724052811481084102610000401007188871867719637184971766
502047194454030000454817169621167179580771462255073540616101414010010000614669272857414968936071912719986532336540450100402001000070200100007187935114020110099100100003010010000010010926215349510636282990884321093212921372330026101581171751405001090108399910000401007180471767718107188371812
50204718285393000047682217122120718058067156725507254059610132401001000061643427271771496884807201471790652003656345010040200100007020010000718523511402011009910010000301001000001001088931734721064628612928763010897134313330501261015811717914049611381150105210000401007187171953717937182072072
50204719735392000045481217201120719468227169725507204069210140401001000061611427269051496881807178371859652333654315010040200100007020010000718003511402011009910010000301001000001001093211574921063227412914523410931135112911111026101581171771404841071971109310000401007178872059718047185372017
5020471873538300204978081728211271859812716082550810406441013240100100486168222725308149687950719647187965368365610501004020010000702001000071962351140201100991001000030100100000100109122152475106682769888482210940114213520300261008711716364052011161033108310000401007196071886719837193071952
50204717965392001144881317441140718308047157425506804057210123401001000061385627345300496890207183971894653053655155010040200100007020010000718503511402011009910010000301001000001001093341664891066127611892762510924132212620900261015811717564050010481069106210000401007206371770718497189971972
50204718925383000046580117202104720248007174525507104062010140401001000061526427246891496871207189071996654213656305010040200100007020010000718543511402011009910010000301001000001001094521475001064425914904762010915130212023300261015811717244048810971074113010000401007187871972718997185772108

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.2122

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002972339542200000608083717202136721178054214718572550895407381018640010100006139852732591149690687204172073655613657885001040020100007002010000721093511400211091010000300101000001010957114350310696268109297260109461223613214400252001165610718464069288092893010000400107208872118720937229872296
5002472106539200000556081917283100721538103415719632550825407261017440010100006332322725932049690637215572010654353657745001040020100007002010000720373511400211091010000300101000001010939214650410707263139297674110611185111930400252007651377181340672852944105410000400107206672006720587206572211
5002472096539100000762081117121128720888294017717692550885406581015940010100006328342726453049688577240372006654933658055001040020100007002010000721103511400211091010000300101000001010916313449510703265989880731095212332134203002520076571471975406201000928106610000400107219172088721267225771941
50024720315401000006170840172021207197681035157209325508204069010178400101000061388127470191496874871762722036531936581150010400201000070020100007182635114002110910100003001010000010109071138516106872771090478661096611946134144002520088575720134066492481885610000400107189871892719457207372071
5002472258540202000568080517521927194479836217197625508304064610173400101000063393927350490496889571794720926541814658225001040020100007002010000719473511400211091010000300101000001010940414149410686278990666711093713139129103002520078568720834065692099291410000400107206772122721437209472258
50024719525402010006750797172811407213777745167190925508104073010162400101000063344527303871496916072229720526574036576950010400201000070020100007201535114002110910100003001010000010109254153485106972919937668110972117431322130025200126576719544063684894091810000400107203472131719397221171954
50024721255381000006770799172011247216379241167181025508054071410168400101000063198427290921496893472064720426546036570650010400201000070020100007198935114002110910100003001010000010109274172513106772601394972781100112141124207102520078576718544064498693891810000400107208372127719677214171875
500247215053900000061908151680210872026821331671751255087540698101544001010000612849273429604969001722327229465473365813500104002010000700201000072138351140021109101000030010100000101088901505181068926999084466109591253913300900252008857571918406681016902101210000400107218372128721957203772218
50024723435392020005970806171211287211182935187180025507854068210173400101000063330227260771496922972019720156584036563450010400201000070020100007196835114002110910100003001010000110109261131508106762611190448681096912937118143012520076574721444070486094488210000400107223672109719577229972097
5002471909541200000766081716883104720877943913720482550900406381017440010100006148482721942149690497202072194656193658415001040020100007002010000720663511400221091010000300101000001010908216851710678259696376851097512657138203002520058557718434062887485395010000400107210072141720867219372104

Test 3: throughput

Count: 8

Code:

  ldrb w0, [x6], #8
  ldrb w0, [x7], #8
  ldrb w0, [x8], #8
  ldrb w0, [x9], #8
  ldrb w0, [x10], #8
  ldrb w0, [x11], #8
  ldrb w0, [x12], #8
  ldrb w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3636

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602092977321830000063148271688901162916075936819332144182025160149801618000080100800004007531286809061492592029176290418858391641601008020080000802008000028834351180201100993100800001008000010080922323764696085011681158993250758464577512944494713140405110116112926525800583983947580000801002884629097289762927129080
160204291092182000006482837173690168292077682951773194718992516014180162800008010080000400799127970416749260422897029200866038927160100802008000080200800002892735118020110099710080000100800001008090032357449208419470598673247688531980512648444769333505110116112912630800543232838880000801002905728878291712904629094
1602042902121820000066518081584881322912376736121201854179225160154801468000080100800004007701284127064492598129042291659163391861601008020080000802008000028968352180201100998100800001008000010080920554055395084918705128983646878591467311448554650513005110116112887221800613612918580000801002923528894288782903529084
160204291972192000006314779168811313629078741363194421001786251601678016980000801008000040074512824120474925991290692929588803917116010080200800008020080000290203511802011009991008000010080000100809385339049970848627351286336578585639663132460543995510051101161129023248006641836110380000801002890529192290852912728846
16020429050218200000687683117201079229161730312198721301886251601518014380000801008000040077812858560744925833289582915890943922016010080200800008020080000291233511802011009914100800001008000010080865163924726085335714108685653268557768512347354308350505110116112899444800643182967980000801002917429121291522913628932
1602042915421720020066537791576842242914678333717442035185325160159801558000080100800004007661278489050492627528946293779061392601601008020080000802008000029325351180201100991610080000100800001008091638426479208489069788783247058568179111245504251310005110117112907144800483123109080000801002911329133290992929329440
1602042909421830000066437681720981282913476436316922171170925160167801638000080100800004007521278256057492607328963292319218390571601008020080000802008000029261351180201100999100800001008000010080898344065578084792704118327249508566071712448354436362805110116112921825800543783758280000801002908728840291712900329106
16020429289220220000631182016408923628945778361198220671925251601408014980000801008000040076712836630444925970294862894692123885116010080200800008020080000292843511802011009913100800001008000010080845333545213085190696108463851418568673912442984863340305110116112914433800543573129280000801002904328926290062919529189
16020429073218220200603576516889613629154746353187722881627251601588016380000801008000040078612775830574926259292302905292493886316010080200800008020080000293273511802011009912100800001008000010080871373455198084422683108452847988554571711746675208353505110116112931630800603063139780000801002907428951288112904928705
16020429096217220200627876216721011242896576035721601904167325160147801588000080100800004007771288465063492610929046291719057389881601008020080000802008000029123351180201100997100800001008000010080932384085324084934642148862843548607876412847964516174305110117112931129800642733337580000801002895828929290782907029250

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3670

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002929701223203110069707761688108136296778013631783178821612516008080070800008001080000400398130185617649262412936829359917703927416001080020800008002080000294073511800211092410800001080000108097479408636685406628109123451048579481412849605038513450205166429433318006052952210380000800102921529231293552920329283
160024294302184001100656084816969412829341746289182919762140251600718008680000800108000040032612842651594926165294492949594090392631600108002080000800208000029406352180021109281080000108000010809287242458308534464513898765435859448551365514518952035020516242912630800605065339780000800102943329283292252932029453
16002429359220440110067338091744931082931178033117521706212125160064800768000080010800004003921294130157492634029490292839164039297160010800208000080020800002932635118002110945108000010800001080919603935373857346571490228515985712820122537352603301250205162429426488005854257111180000800102935029360295252922629553
1600242917121940011006379798174499144293038012801877177420362516007180066800008001080000400364129746617049261452939229574946203940716001080020800008002080000292173511800211093310800001080000108089843378575085893667129191165307860418231314807505818035020316552934218800625405419580000800102931829474294542957029429
160024294692214001000674280217448711229556780332197518052012251600578006480000800108000040034012906231724926510294912944991330395371600108002080000800208000029103351180021109281080000108000010809116939754778528566312878685275859467481294397454970045020316242929126800565615359580000800102939829462292752932229429
16002429370219400100067048501712111148292448033531702182722722516007780073800008001080000400356129562117649262932935129481948303945816001080020800008002080000294033511800211092810800001080000108093645424512985204682108635251208579581913047934883700050204162429444308005954950110280000800102946929224294612957729511
16002429230221400100066288031704106148292567942981875189820142516007580074800008001080000400323130065916249264982943829257932003930916001080020800008002080000292423511800211093910800001080000108091165366595785415655148723848978579570813147535087460350204163629346258006656751410280000800102923829493291352952829552
1600242946322140010006496769175296156294137953421747190021012516006980065800008001080000400397129776516949263362926229366939803903016001080020800008002080000292473511800211092510800001080000108094851367551185641661118934054438686382312747984834513350204164229135278006152453711880000800102918329474292452950929228
16002429461219401100065117921720979629350780324163719422135251600568006780000800108000040036513051531714926117291582916892690392881600108002080000800208000029277351180021109261080000108000010809934236355288559469913888325635859937831285111486163035020416432923231800595355249780000800102941929459294112942629303
160024294442204001100632376416881211442955178333422901808205225160074800848000080010800004003661287903158492646029261291649350039296160010800208000080020800002921435118002110927108000010800001080973713865658848006499878265634858008321274692495052065020616422921517800595175419680000800102947229495293382931529360