Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (post-index, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 1e | 20 | 22 | 23 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2005104070000462010000102515030182520001000100010001000528444582410401040699377320001000100010001000104044111001100010000101300411028402808141016487112400732161110371000283501000100010411041104110411041
2004104070000461410002010250043182520001000100010001000528484582410401040699377320001000100010001000104044111001100010000102001481010301601291012304144800731161110371000323201000100010411041104110411041
200410408001048010004102500602225200010001000100010005284845824104010406993773200010001000100010001040441110011000100001013005110243013012171010306856023731161110371000333101000100010411041104110411041
2004104080000481610000102512020925200010001000100010005285245824104010406993773200010001000100010001040441110011000100001014004210210014014101023245124800731161110371000323201000100010411041104110411041
200410408010062140000010250000162520001000100010001000528444583610401040699377320001000100010001000104045111001100010000100001421020201201012104424583200731161110371000343201000100010411041104110411041
200410408000032321000010251100017252000100010001000100052852458241040104069937732000100010001000100010404411100110001000010150057101000006111012426154800731161110371000442701000100010411041104110411041
200410408000056220000010251102210252000100010001000100052848458241040104069937732000100010001000100010404411100110001000010000048101400000101012245124009731161110371000323001000100010411041104110411041
200410408000032100010410259102172520001000100010001000528524582410401040699377320001000100010001000104044111001100010000101501331024401200121041365124800731161110371000283001000100010411041104110411041
2004104080000541710134102511010182520001000100010001000528484582410401040699377320001000100010001000104044111001100010000101200411038201204101026306145600731161110371000363201000100010411041104110411041
2004104070100400000020102510202152520001000100010001000528484582410401040699377320001000100010001000104044111001100010000101501681025301508151037486164800731161110211000343001000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1897

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50209721895391000046908221648311271810780071665255076040572101274010010000614462272504414968619071737717316546136553350100402001000070200100007184335114020110099100100003010010000010010934215950010633260119054626109081252128203261025811716034053610661106109410000401007182471681719607203772134
50204718965382010042408651736213671690802071581255082040600101324010010000614069271805614968852071958717926532636561350100402001000070200100007188535114020110099100100003010010000010010879415949110651282129287638109561232134239261015811716804045211211043107010000401007175071850719077159671992
50204719895372000042808591704214071805790071766255075040672101274010010000615675272604714968859071742721256537736565650100405381000070200100007205435114020110099100100003010010000010010932415548410644261169596820109281292128203265115811715674051210401099110210000401007180972040717887182371868
50204716585382010040108171464214471886874071519255070040624101214010010000616084272988614968842071897719706555736565250100402001000070200100007181135114020110099100100003010010000010010907216354910667256118961361301098613521082310261015811718204051610221117101510000401007202571768718227189871869
5020471895540202014390856172821207189081507149625507854057210131401001000061578027234960496879207198672019652693657305010040200100007020010000720553511402011009910010000301001000001001089521614751062726811946503210897132213324526101571171578405041055890108710000401007193571775718157186271783
50204719295392020045408511712211672065826071615255072540612101414010010000614696272019114968710071832717376542236568750100402001000070200100007189735114020110099100100003010010000010010881213747510616236109294832109061232118251026101581171596404969851005109510000401007168271879720277196171815
5020471885538202014750821176829671909816071522255066540564101364010010000616081273015514968907071808719006545336555150100402001000070200100007184735114020110099100100003010010000010010906212750210633259119263635109531402123209261015811717964044410971076110710000401007198571795718047189971819
50204717595382020144408151704213271972789071720255069040640101274010010000612612271759914968921071957719226532536559850100402001000070200100007190935114020110099100100003010010000010010903215751410634271119445432109621232136223261015811717634050811451147115010000401007188471947718997201172016
50204718665372010043708151728211271822805071715255078040604101274010010000614553272607204968868371736720396532736566250100402001000070200100007176276114020110099100100003010010000010010914215850910623266159088229108981202131209261015811716384055211841148105710000401007168571980719087179371982
502047199353920001399081717122124720418430717022550675406041012440100100006145112733486149688900719407209765384365512501004020010000702001000071990351140201100991001000030100100000100108872159487106352531288678261093013721102311261015811715704050410881095103610000401007223371908719087170571780

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.2069

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 18 | 1e | 1f | 20 | 22 | 29 | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9e | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029722895390004670868176814072070821647163525507554058210116400101000061386627316260496885572049718936540136560750010400201000070020100007214435114002110910100003001010000010109520161517106642742195546581096012761320302520058501414718564058811871169116310000400107217172081721197220672189
50024720955400004380867163213672106850727178225507754056210150400101000061477327288440496888472177720856566936570450010400201000070020100007209435114002110910100003001010000010109510147519106842841996272291099014191380100252005850514718584054010921113109110000400107195671844719717210472000
5002472015538000491085917441407209884963716252550760405381015040010100006152592730058049691347205372186654733656085001040020100007002010000721233511400211091010000300101000001010955015755310662292179707452109541467140070252001185041471997405739681110101210000400107205271971722157202272102
50024720865400005030875172014471947845527188425506904060210133400101000061397227251360496893671831722296557736566950010400201000070020100007209835114002110910100003001010000010109500152539106672781995048461096913412142330252001410101212718434061611011166114410000400107205372165722757197972018
50024719305390004360876175217672122838647192525507404059810150400101000061382027330650496880572069719526571936568950010400201000070020100007214235114002110910100003001010000010109700156509107042762297974501095414591450902520014850514719074058012211219114310000400107216372044720077199771907
500247211154000043308691760152720648438671739255067040598101314001010000616267273819504969040721787201565508365744500104002010000700201000072142351140021109101000030010100000101098001525161069228319986485410952138514307025200148501414719334063211581103118010000400107204971989720427220171962
5002472121539000516089817681407204884592717802550730405661013540010100006148522734050049689937193671966655463656195001040020100007002010000720183511400211091010000300101000001010953014453210677280219628259109791416127330252006850614721374059211141099107610000400107223372050718667200672176
50024720445400004480872175211272123845467195025507604060610141400101000061379227304160496906072130719016540436576750010400201000070020100007229535114002110910100003001010000010109630146528106582801895976571095414961440302520014850144719064057611001186120610000400107210471825720647215471901
50024719235400004660846172813671932827727171225506904058210159400101000061359727363010496903972052719916555936566950010400201000070020100417212135114002110910100003001010000010109640151484106832912197238441096414151420302520014660514721094060811791170118810000400107241572114720587205572185
50024721315390005360853175211272255841637180125507454053010156400101000061456227279180496896772138720176547436574850010400201000070020100007202035114002110910100003001010000010109560146535106842731795876431095913391621530252005850145719674053611231184115710000400107231372062723947204872010

Test 3: throughput

Count: 8

Code:

  ldrsb x0, [x6], #8
  ldrsb x0, [x7], #8
  ldrsb x0, [x8], #8
  ldrsb x0, [x9], #8
  ldrsb x0, [x10], #8
  ldrsb x0, [x11], #8
  ldrsb x0, [x12], #8
  ldrsb x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3670

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020929658220202002688378317281059629114782415186316312059251604638051880000801008016940073113036780584926280293752931292823919916010080200800008020080000292063511802011009927100800001008000010080941373875312085809681128962846068581472211753034566370051101161129423368005747850811180000801002957429399292282951329226
1602042952422122010072078141736995229464791350182517762186251601608016880000801008000040077412869051634926142292952928891323916716010080200800008020080000293283511802011009939100800001008000010080922274445396085199653178852247028555867310645124706362351101171129379238006552453910780000801002932829740294102928129488
1602042960521822200065847841688931002910975637320571671201425160157801648000080100800004007241295816159492619129173295789473391641601008020080000802008000029363351180202100991810080000100800001008089636394564208468264912902305214857698201215161517738075129116112937537800515005049180000801002948829590297222939229457
1602042942821820100070637861744911482925476436217541836217425160162801588000080100800004007761299455066492621829456292519278393831601008020080000802008000029288351180201100992510080000100800001008090937405550508487862698547246978583774411043624973371551101161129373368005554454810680000801002922829689294182943229300
1602042932922121210072547991568961162926375043217211941214225160170801578000080100800004007751293181064492619229255293239336391001601008020080000802008000029314351180201100992010080000100800001008091336451532708505768812869364896856296901374706491037425110116112953627800665716189980000801002935629414291692933229367
160204293602212200006878845172010112429299757342191216682015251601678016480000801008000040075612969911724926385293532937493643936416010080200800008020080000294323511802011009935100800001008000010080907364025993278564864889063461118576474013550284190370651101161129673258005454553811380000801002942329505293182918029399
160204299352242010026787805172010510829402798304185917842247591604498033980223802658016940160612941730564926317292262950292823923216010080200800008020080000293053511802011009929100800001008000010080943354125203085576703149184453798577671512153255578373351101161129389258005850347511180000801002933129387295822923129553
160204292442182100006806804172092144292917843911817189321632516015880162800008010080000400806129168706249262642920029253928839218160100802008000080200800002929835118020110099281008000010080000100809125434961120849586729868344944854546021415378526955345110116112941021800584864749680000801002926029544293912920929358
16020429110219300000703277616969911629151775378180918732179251613678015080000801008000040073712956740584926289293942925491173918216010080200800008020080000293353511802011009933100800001008000010080946514235533085483678129337054338612572812448744508524451101161129412288006759554210380000801002916829837293852962529610
160205294502183203006758802172011710429303781349175218342197251601598014980000801008000040071713075270594926034291092940793363938816010080200800008020080000292603511802011009919100800001008000010080947524336047085486707168684050528570482512849124979532251101161129171298007048055411180000801002961029525294432943929373

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3651

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002929799221301000726107881720806829233789378194921931930251600758007980000800108000040027512928935449262532984929582942039054160010800208000080020800002915335118002110913108000010800000108093663445537184978756139151305100861578471384673513033035020416442930834800663913888080000800102930829252293012911229292
160024291992193030006824083117041081122916478937018332213178725160064800728000080010800004002831295531644926355296302947392373935316001080020800008002080000291783511800211097108000010800000108096752423542685925821119373449218600780014052114746490050205165629001198004837541110580000800102913329193291652945729352
160024289282174000006284078517369711229226802347212219071742251600688007280000800108000040029812902715549260892966629314919639351160010800208000080020800002929635118002110912108000010800000108094571425573085135703138751325232861877471244753431468035020416662905824800674113698980000800102934529241292852921929449
1600242910822030330070750787168890108291277943022095241918402516007080064800008001080000400294128477453492629329969293229042391771600108002080000800208000029087351180021109710800001080000010808963240855508471870216861904580857017211245164470635035020416462934026800513933507480000800102924829098291122928129062
1600242915321830030070040820169682722912274136319402209168525160063800718000080010800004002791293184524926010294082920991643924016001080020800008002080000292313511800211091410800001080000010809333542158878533870511869244511859587861255214517732055020716472923924800743553709380000800102926729143291402929929202
1600242929821920020065950810173691100290227773112177227417502516007880077800008001080000400300129277052492623729381293709094391991600108002080000800208000029193351180021109810800001080000010809223340652548462465213888384791855728681305199484170035020916782907930800703504178380000800102925929059294802921829137
160024293092183000007205079215601002602919779837220292081193925160067800748013080010800004002871289586644926033294642927292613902316001080020800008002080000293893511800211091510800001080000010809403446153658511670013895384456856647091215035470814025020416752898726800633894258680000800102895429208291792912529051
16002429264220100000712808051744108100293098063472188218416952516005680067800008001080000400328128517157492607629343293029088390571600108002080000800208000029194351180021109810800001080000010809533438953898551570016917765714860237061265189512137035020416452913021800584354049980000800102932229213291852909929175
1600242909121910000172260820170493922927977136520482136201125160076800848000080010800004003451293017614925975293082914890633925916001080020800008002080000290513511800211094108000010800000108091916417548585771745109303250008586072611352534506180050203165429029218004738631910680000800102913829023290062901329109
160024291412182110006439084916888476292868013021903214716472516006180079800008001080000400338128338661492602129584293449340392051600108002080000800208000029266351180021109910800001080000010808861742151698479070613891325436858367231245297522617035020416652927824800513772789080000800102897929055289842920729119