Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6], #8

Initialization (not part of the measured code):

  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 1e | 20 | 22 | 23 | 29 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20051041810110792210003010251917717252000100010001000100052836458241104010406993773200010001000100010001040441110011000100010248048104721208311024347317260073116111037100024251000100010411041104110411041
20041040811001532200002201025012124252000100010001000100052840458231104010406993773200010001000100010001040441110011000100010287081105411260481024368315670073116111037100026171000100010411041104110411041
2004104071001070161000301025123211625200010001000100010005282845825110401040699377320001000100010001000104044111001100010001028705910441202414361042406254061073116111037100024191000100010411041104110411041
2004104071011039160000101025000421252000100010001000100052844458241104010406993773200010001000100010001040441110011000100010246064104912200311038396246464073116111037100020241000100010411041104110411041
200410408111007912100020102516244232520001000100010001000528164582411040104069937732000100010001000100010404411100110001000102270461045103783110463962948612973116111037100024171000100010411041104110411041
200410408111007916100360102512283242520001000100010001000528444582511040104069937732000100010001000100010404411100110001000103721053103841134281034286338071073116111037100048311000100010411041104110411041
200410408101118500000301025824822252000100010001000100052840458241104010406993773200010001000100010001040441110011000100010256074105241390241025404276461073116111037100049351000100010411041104110411041
20041040811010622220002010251414221252000100010001000100052832458241104010406993773200010001000100010001040441110011000100010449075104090214321032346375672073116111037100026241000100010411041104110411041
2004104081101083120000301025141711425200010001000100010005283245825110401043699377320001000100010001000104044111001100010001023705810421601712311037344264060073116111037100039261000100010411041104110411041
200410408111109122000024102513142272520001000100010001000528364582411040104069937732000100010001000100010404411100110001000102471481050302412411029455367262073116111037100024311000100010411041104110411059

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8

Initialization (not part of the measured code):

  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1924

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50209720895392110000416084310672310871784804507165425507854063610138401001000061594927339214968817721187202965382036569850100402001000070200100007190335114020110099100100003010010000010010924211350910700271129195222109101261212220100261025711718314055210621057108210000401007204971970720717199472010
502047205154010000004610841105841124719298307271539255078540620101284010010000614988272582149689027205771657653360365488501004020010000702001000071916351140201100991001000030100100000100109071161498106742741590052341091712091331070261015811717394056410781133104210000401007204271894717987178971910
5020471868538100000048208211071219271962789827152825507654060010138401001000061358427261634968890719577186365379036544950100402001000070200100007197035114020110099100100003010010000110010907213648710677237129291783310931119101311330261015811716724055210721138112810000401007191471903718387202172024
5020471886538200000045108101071211527205979273717542550785406121013440100100006156842721858496902471891718316541103654555010040200100007020010000718513511402011009910010000301001000001001092321374931065425310920822410929128101282030261015811715954056010961087103310000401007199371862719217184672079
50204718965391000000468083310704172717668026071587255077540664101434010010000613432272876249689027196572000653120365510501004020010000702001000071972351140201100991001000030100100000100108991135502107232801593830281091114210129103026101781171855405769721167108010000401007203971894720347200871897
5020471873538101000045508211062411447192182182715172550760406041013340100100476172572729195496882671848719586542103654635010040200100007020010000718113511402011009910010000301001000001001090621434751074327814897942810895139111122340261015811717124053210211113109910000401007190871894720227189171826
50204719895391010000468084610544196718908045071577255078540660101254010010000617257273024249689517195971994654410365602501004020010000702001000071949351140201100991001000030100100001100108902140517107112692789632241092911961352450261011711718014055210131104106210000401007189272005719267183471829
5020471999540110000046008311053611127185780850716262550730406441013440100100006174382727854496889371917719876543003655595010040200100007020010000721463511402011009910010000301001000001001089911415311069626898901422710900119101281030261015811718074056810381022109710000401007194071921719887206772075
50204718565391110000410081510720110471835800737170725508454066410123401001000061550427262204968882719217192265389036550950100402001000070200100007176835114020110099100100003010010000010010922113046910660271129204227109201208117113026101581171532405121039979103410000401007196671916719337206672049
50204718375391000000429083010720110071865827937171225507754062810138401001000061411927279164968861717577180765510036564550100402001000070200100007209435114020110099100100003010010000110010866415550210677287259301023010919130111171070261015811720064054911911072106710000401007185271856716287200071984

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.2010

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002972195540510500470841174461087214781013271584255076040562101404001010000612885272637514968850722207201665546365662500104002010000700201000071924351140021109101000030010100001010915515553210661288994380371094212615124433000252198599721884059211931105107410000400107210072133721767200671990
50024720215415005004758501728412071924814103719932550785405541013340010100006148072723110149688497216971979657213656905001040020100007002010000719103511400211091010000300101000010109424149496106502681092672431092812513135417000252198588719674057611281052112710000400107184671912719057195272314
50024721135394040025108341720412472124800114716462550690405821014340010100006121472733049149689317203071998655053654725001040020100007002010000720873511400211091010000300101000010108806156511106512429964543110918122151284750002521985910717214060410841048101510000400107205071984718687215072019
500247200154240020450681017285104719988271187170625507504055810151400101000061366927284661496889672054720216545036555650010400201000070020100007191035114002110910100003001010000101092141515321068126499325034109931231513340700025211085111171919405641093115899010000400107213372221719767219272028
50024721135404000004288661720413672037797124717412550770406061014740010100006130732729511149689127216871933655223657315001040020100007002010000718883511400211091010000300101000010109115167510106702771090148431096912017132507000252188587720304061611641087113410000400107185972110720127193972018
50024719765394040004978631736413671980790114718002550630405981014240010100006135262730158149691047208772043654993656255001040020100007002010000719123511400211091010000300101000010109284154511106382827930503710961134191374070002521985810720264060010721067109310000400107209871891720567201672118
500247188854040000043682817126136720758121237176925507254053010142400101000061526527287441496888472040720996551136572050010400201000070020100007200935114002110910100003001010000101089941635081063027879345018851093813217121403000252398578718594055611521175118510000400107201772136719507199972133
50024718785404000044948311712510871932811111718112550710406141014240010100006124512732464049690627196572194655713657205001040020100007002010000718833511400211091010000300101000010109074139503106722748908403810929136151294090002521785910720544056410501054115210000400107187371866719917211672014
500247197653940000444382017204128718388171137169325507254059010138400101000061273727282041496887671962719696535636574850010400201000070020100007210835114002110910100003001010000101091861545071066127112930503410910129141374410002521108588718844055611991111116210000400107200671957719067198872080
5002472085540500000455809170449672087808101717232550690405941013440010100006143812728713149689187209471924655143656425001040020100007002010000721403511400211091010000300101000010109331016151110636279119315033109351201813641430002521985910719474058411611145109310000400107211472152720797225572144

Test 3: throughput

Count: 8

Code:

  ldr x0, [x6], #8
  ldr x0, [x7], #8
  ldr x0, [x8], #8
  ldr x0, [x9], #8
  ldr x0, [x10], #8
  ldr x0, [x11], #8
  ldr x0, [x12], #8
  ldr x0, [x13], #8

Initialization (not part of the measured code):

  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3681

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160209296542213001737182017281051882953980158316211819214825160172801718000080100800004007611321744163492655829449294979434403951916010080200800008020080000296013511802011009925100800001008000001008095018421482485374687992272469186403816134506855761803051101161129430298005563851912480000801002949329338293532935429463
16020429495221300167598371752133160295008115821839184721792516016680151800008010080000400789130106107249267112933729557937103951616010080200800008020080000295253511802011009954100800001008000001008093416455485485507634149105050758590477712555134810003051101161129382458006056552110080000801002949229355294682941529206
160204293482201000667285716321111442935582264214761786218925160162801598000080100800004008011316343152492621529563295359294039293160100802008000080200800002948235118020110099421008000010080000010080915040353358505667313914725072865887251185449508800305110116112944731800595705349680000801002935329354295362954229729
16020429358220200067918321712118962952582256916191810196525160152801528000080100800004007981314947186492629829423295359475039301160100802008000080200800002924335118020110099491008000010080000010080915041550598519571013900325096853367651274974490603305110116112940618800625625309680000801002955229688293942937529323
16020429383220300068578441712112922962083362815941658228325160155801558000080100800004008021307726161492647329407295119284039400160100802008000080200800002946335118020110099441008000010080000010080937184355437854536991091176549885185805124496654430017051101161129266308005266658110080000801002951729400296732959429370
160204292822202000739182617201341482943681557817171811237625160151801668000080100800004007851321283161492653329344295079398039422160100802008000080200800002942335118020110099321008000010080000010080956040551098484471489294851038641778612854825176034051101161129442288006660060510180000801002936629267293652947029404
160204292772203000722283317761091282924981756517231748235025160147801678000080100800004007671300630057492637329396293339859039632160100802008000080200800002930535118020110099621008000010080000010080960184135903851157032897048498786439782122500256881804051101161129432298005961351710280000801002940229426295122931129303
1602042948321830006451822178413510429323805573168815682051251601658016080000801008000040085913061820594926235296042948395120394631601008020080000802008000029478351180201100995910080000100800000100809020399525684900647149303247978559571012754264968003051101171129312458006755252411880000801002918829476293852938629604
160204295552223000661482017601241162950181251717491846221425160169801648000080100800004007831303869167492625129723293019374039431160100802008000080200800002967135118020110099521008000010080000010080892044655888494166410924485036861997071295319518200305110116112953927800586185309580000801002938529523294852968329350
160204294962203000676782017121311482935979855716151982232125160165801608000080100800004008291301528065492618129589294319472039470160100802008000080200800002939135118020110099601008000010080000010080927043751198564967713934345568860437361315029494600005110117112938027800486856319680000801002961129513293062948629464

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3689

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160029298682212000017800086517361311202946680365519861723221225160075800828000080010800004004281308958017649265170295342954693523949416001080020800008002080000295683511800211096310800001080000010809922839958628556969712925665323870987441404950568636035020131605429557258006563857010680000800102934629401295442960929555
1600242934522220000070540858176010514029391809639187216772329251600608008180000800108000040035813055160069982677802963729442937939506160010800208000080020800002964735118002110935108000010800000108099026372562285225669129423254118590183113153985432350350208160562942736800596075688980000800102956029486293992964129699
160024295742212100006785081617281211162954480362916791686236025160078800688000080010800004003471301917007449263560293992935493643942716001080020800008002080000293923511800211095610800001080000010809763341250968518065417950784764859968031355509568431235020716012629752338007064961810980000800102932029526294752938629503
160024294312212000007261083317601181362966381060618191723225825160077800748000080010800004003621303290007749265050294182953894863947016001080020800008002080000294573511800211092710800001080000010809683639949658549767110925485305862698241305164554532005020516071229773338007159352511880000800102954929621295242958129468
1600242968022120100070600852174411714029267817658158818782071251600708005680000800108000040036812979750164492643802948429443938039486160010800208000080020800002956035118002110955108000010800000108093227379583585858693109604252268652784113052485421321405020101601062946032800695845469680000800102955629679297412943429434
16002429464221200000707108441712109152295988166531586182220842516007280072800008001080000400299129755300574926512029481296959516395551600108002080000800208000029371351180021109321080000108000001080931263925741858897141493946539186116808120519152813400502061608629395308005757960710380000800102964229748295912975729725
1600242954222120020067960851176812613629438836673178516862461251600728007680000800108000040033813018950162492653002956729437930839350160010800208017780020800002945435118002110963108000010800000108098335391546885395634169287251788657577713254475740350450201215012112946340800816195848980000800102949929630295462956929549
1600242970222120000072031088411712137722960982363717041750223925160071800758000080010800004003231318306117349264300293952961892863956916001080020800008002080000295303511800211095310800001080000110809503439859618562863778907651588656983112151715474330350201116071129642388006160250510580000800102957029705296522954829681
1600242950522020000073670867174413212429640829710173317072123251600698007780000800108000040035313095820163492645202949229504941439466160010800208000080020800002947235118002110943108000010800000108095932395609285293690119346854038612974213051885351150350201116010102952629800575975468880000800102954029323294872964229656
1600242972522220000070040844172013096294208026481686180822562516006380076800008001080000400357131155101654926211029324296319457396971600108002080000800208000029439351180021109411080000108000001080926314275238859246801692472540686297821129525256981706502071605929310488005855662110280000800102959729496297492953129527