Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (post-index, 32-bit)

Test 1: uops

Code:

  ldrsb w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 23 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20051040711101680000101025190402125200010001000100010005284045825110401040699377320001000100010001000104044111001100010000102780521048101112161040515236471073116111037100031401000100010411041104110411041
20041040811101472400010102561101225200010001000100010005284445824110401040699377320001000100010001000104044111001100010000100880641044302032211012305215671073116111037100043231000100010411041104110411041
20041040811200711620030102500021625200010001000100010005284045825110401040699377320001000100010001000104044111001100010000101780561027102501810123051556740073116111037100037371000100010471047104110411041
20041040811210801210010102516011172520001000100010001000528404582411040104069937732000100010001000100010404411100110001000010237137102810270201008424185672473116111037100031401000100010411041104110411041
20041040710100550000112104000001625200010001000100010005284845824110401040699377320001000100010001000104044111001100010000102170331029120012151020365205660073116111037100030271000100010411041104110411041
200410408111007214100101025902213252000100010001000100052828458241104010406993773200010001000100010001040441110011000100001021704110493012101610143642440711073116111037100028291000100010411041104110411041
2004104081110039910020102580101525200010001000100010005284045824110401040699377320001000100010001000104044111001100010000102460571054102312171029365205671073116111036100031271000100010411041104110411041
2004104071100083122002010258000112520001000100010001000528404582411043104069937732000100010001000100010404411100110001000010527057103610216181031304215670073116111037100031271000100010431041104110411041
2004104071120047140011410250042112520001000100010001000528444582411040104069937732000100010001000100010404411100110001000010437142103561266211040365224062073116111037100029321000100010411041104110411041
2004104081220047141000010251310011252000100010001000100052836458241104010406993791200010001000100010001040441110011000100001027200481027402012261024305195661073116111037100029291000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1885

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 29 | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020971861539000004378191696807183379332715572550715405801012640100100006135672725286496869072004717296521436551750100402001000070200100007193235114020110099100100003010010000010010878141487106112691690213260109091152121005261075722718464053610331166107310000401007191871759716877194971951
50204717855380111141280916881047184077632717592550720406121012940100100006136982722913496884471937718416528736555350100402001000070200100007183135114020110099100100003010010000010010890152559106202739884842910891119212500426102582271823404961047104195210000401007196071882718397183871930
50204718985380001048879317049671831766327168925507354062810128401001000061412927223994968961718677198365245365509501004020010000702001000071883351140201100991001000030100100000100108701284671062225278663223108881233119005261025822716694048810041036102510000401007194772057718507195371979
502047180953900010451786168814071800781227163225506704066010128401001000061419627275924968753717697196265339365449501004020010000702001000071907351140201100991001000030100100000100109101314771061427010893282710910137213103526102582271651404801052963110810000401007188371759719947181372055
502047183553900000340796167296719218193271474255075040632101254010010000614568272796549686837181571931651573656855010040200100007020010000717063511402011009910010000301001000001001088815650310614263128663220109251312128036261025822713264054010471046103510000401007193871944718507202171902
50204720845380001044880816961447202981722716972550720406041012040100100006120272719233496891071795718516535036543850100402001000071362100007183435114020110099100100003010010000010010870172506106232587926721865109221101125003261025822717464056010871030106310000401007199371856717657182271811
502047175753900000489782170496720057712271637255078040624101264010010000613431272803349687647194771773652043655995010040200100007020010000720253511402011009910010000301001000001001088612750010667278129023818109161162135037261025822716594052810081081111410000401007190671868718007195271842
50204718545380000047682416969671664814227145725507304059210139401001000061331027292514968929717897187665346365663501004020010000702001000071885351140201100991001000030100100000100109381405051061127810902122231091611721240011261025822718194043210611004109210000401007186371821718617195571991
502047199153700001426784170410071995779227154125508154056810128401001000061367027233304968769719887174265309365559501004020010000702001000071751351140202100991001000030100100000100109261634881063224912894642310899131212300526102572271591405041018984110210000401007205772005719347192871958
5020471714538000104958031728100717117684271505255076040564101244010010000614331272084549688237181171791652923656155010040200100007020010000717753511402011009910010000301001000001001083315049910597270138787220109081313120034261025822715954049610691060102010000401007182972059721157196071899

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1942

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029722355403000004368381704310871845803627187525507454059410138400101000061347327281160496882171792718566542136576950010400201000070020100007194835114002110910100003001010000010109204191521106782571090576351090313871345030252098542718424053610821003107310000400107206772077725037190772052
500247195453940001042581017044124719527858271936255070540586101484016010000613585272500304968862719647186265471365713500104002010000700201000071964351140021109101000030010100000101090510168499106672861191612428109271339120503025202852571731406001086100398610000400107209071894720497207771999
50024719715585000004178101712310872045801637192525506554049810137400101000061432327246511496894471991720656538036549350010400201000070020100007206535114002110910100003001010000010108993154507106482561090778311090412581213070252048542717214054011301089107010000400107204271815721507211971998
5002471879538300000446837171221087196478752717963750745405581014440010100006126472726713049689007186071994653423656745001040020100007002010000718203511400211091010000300101000001010899417551610640251791278361090713181334640252028525717614054410831124105710000400107202971989721527196272139
500247193553940000041082617124120720788095371705255064540586101374001010000613052272800404968896720377208865492365612500104002010000700201000071962351140021109101000030010100000101090781855041065227312899323510913124111314030252028544718414056011121051106810000400107198372107718287205871766
50024717415394000004107971712311671912838537188925506454050210140400101000061312327320620496881972113721126550336571750010400201000070020100007209735114002110910100003001010000010109084189520106362598904723010916121101414040252048524717114054810671074107010000400107208572069720257205971895
5002472100538401000404834172841207201478962720602550780405861014240010100006144732723804049689037190872137652773655795001040020100007002010000719663511400211091010000300101000001010932419149910654265890815655109231329119403025203853571650405481123106392610000400107197571980718307200971900
5002471876539400000364816172841167209977942720212550785405861012640010100006139522720598049685677204072016654283657635001040020100007002010000719263511400211091010000300101000001010905517955010670228889468351091212671274040252048543719494056011101049111210000400107218872063720947196972048
50024718395384000004788061744411671876814727172825506854059010136400101000061441627290610496873371894718986535736558650010400201000070020100007205635114002110910100003001010000010108934161524106482709901762010920135814245110252048544717734054011411049116710000400107201071751721327196472008
50025720435393000003838311688311671975809527186425506804059010147400101000061407827344250496900071886720236550036557950010400201000070020100007193835114002110910100003001010000010109093183483106522501192674291090111671472080252048542716764052010651125120310000400107185272037719957195471907

Test 3: throughput

Count: 8

Code:

  ldrsb w0, [x6], #8
  ldrsb w0, [x7], #8
  ldrsb w0, [x8], #8
  ldrsb w0, [x9], #8
  ldrsb w0, [x10], #8
  ldrsb w0, [x11], #8
  ldrsb w0, [x12], #8
  ldrsb w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3683

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602093002322130000810682917521249229525754723184719572178251601958017480000801208001740086012973301634926460294372948694376967116014380224800208022480020292433511802011009931100800001008000001008094633364562985432606139025060178658687614652525102533311151171160329342328006661963711280000801002937029420295272940729728
16020429583222200008222837167211013629385803720166217772209251601708017980000801208001640089813032511634926409292262929194636951116013580392800208022080020294033511802011009926100800001008000001008089236362587985420655109029256858654770312556715284320311151171161229736248007958661713480000801002965029726293682943129621
160204294002223303071558421656120156292837636391660179222822516017880166800008012080015400935130068717449262772948029448947869527160134802208002080224800242960735118020110099221008000010080000010080979543905434856626861392310047688673574611357795067551211115117116152976036800735645499780000801002954529460294362948729165
16020429621222200007880830174411514429392746591179719842135251601828017780000801248001440087112922861594926450294952934095866933516013480224800248022480024294513511802011009925100800001008000001008092232400514685925649108873453088631079213557794853320311151173163129477298007447752910180000801002942429463293412950729550
16020429375221200208272830169611710429298768670194618472038251601708018580000801248001740090413027270644926242294732914993176960016013680224800248022080024293033511802011009920100800001008000001008097272354615586245666198993056688640677813056475053721311151172161229584278007652654110880000801002962329442295202948229194
160204295312233303073458021696969629220758654179817902175251601868017980000801208001440081612985731754926233295162929193156941716013580224800208022480020294143511802011009927100800001008000001008095048355541286190632139006650278601176212455405064690311151171162029405178006962850411880000801002954729476295452942029381
16020429371220300008216821166412211229459819600174619942219251601718016680000802928001240084313025010544926187295672946495096953916151880220800208022080024292913511802011009936100800001008000001008094736375596785956653148995052078657681012557634677173011151172160229622238007355159810280000801002971829553293602953829539
16020429414221200007952795166411313229407810394155920002173251601778017880000801248001340086212908780814926362294672947592216940616014180224800248022480024294113511802011009933100800001008000001008096655359587885276675118664460018645485012155954610520311151170161229561298007552755610980000801002961729469294392931029180
16020429348219400008164832168812012429446804638186618302086251601808017280000801208001540090212997081614926398296022966894016953516013380228800208022080020295523511802011009938100800001008000001008096769388586985362636128947860278607279013654294271735611151172160329395238008160353910680000801002917829501295762969429188
16020429637220200007469789168811114829451797534160318742125251601728017380000801208001540083313032581644926687296222943494066935716014180220800208022080020294223511802011009929100800001008000001008091836347561685259653148984255208638968412750214906330311151170160229337338008258959514180000801002925429561295172953129588

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3641

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600292972721910000679307901680941362926083137717542143201625160071800598000080010800004003051282146045492604229080291979257039002160010800208000080020800002912235118002110991080000108000011080888048357060852667491488917251138602276312751524855033050205160222921542800654023618480000800102927629474295082926829141
1600242920321800000638207911728961082919078535718502065194725160064800748000080010800004003451288476056492600329273293089173039101160010800208000080020800002926335118002110913108000010800000108091719516560608509266398804846558596775111947004980003050206160222915640800613763358180000800102908529002293452936329127
1600242891021800000676407921624778029308761370185021141809251600668007580000800108000040031012922020714925973293092933793040393171600108002080000800208000029119351180021109410800001080000010809161851560570857367271190850490786357773106494351101804050202160622929139800533433499580000800102906329124293392955329451
160024290472170000069330769168890100291477393801913188819242516006680059800008001080000400318128455716549261842912628915901503903016001080020800008002080000292893511800211091410800001080000010808650463578508522269412928425627857318241115080513318350502021602329068408005428836510180000800102931529162295912937229080
16002429042218000006585078416809213629046796371187121461874251600778006280000800108000040031012873081684926076292012896392460392491600108002080000800208000029342351180021109151080000108000001080871185176042085195675138574645588595468612551184553030050202160662910233800583893257680000800102930529137293142941829312
16002429241218101006645080517361071002924481538722702075194525160070800758000080010800004003811283764049492608529307291328939039084160010800208000080020800002914135118002110981080000108000001080863194855676084845699990250585285443788122493848573600050202160322935029800413413029980000800102901529044295492922929380
160024291272181001068870764172081144291117473951848201617302516006180075800008001080000400294128787006949262912917129157915803931716001080020800008002080000293373511800211091810800001080000010809263754558930852167071189754487685592711132523542621930050202160222911833800633693639580000800102916629144295522924529325
160024290032192111065140776174495156292477763792078217419482516006480078800008001080000400310129718705649259012915929225892203901916001080020800008002080000291543511800211091010800001080000010808821945752230856437421286336562385445747133486844843503050202160222925024800783383348580000800102949329051294592925328986
160024290162191100068220806168092136291187433521854203519312516129780059800008001080000400306128118906449257562907329000912803913716001080020800008002080000290833511800211099108000010800000108086204905488084975699178594255038593371011550384935004050203160262904844800543493529580000800102911529291294542923929259
160024291122161010067250804171210210429266797372205821931798251600728006880000800108000040035212826000584926017289942908391110388541600108002080000800208000029050351180021109171080000108000001080877047066120852067111188642489285999732133506248313723050202150322938231800554113597980000800102939329368295102927029304