Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (pre-index)

Test 1: uops

Code:

  ldrb w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e20222b3a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
2005105771000015216004010250642093325200010001000100010005280045821104410557003784200010001000100010001040461110011000100001033817311031203861041093426101637450731161110181000303081000100010561052105210521041
200410558101001392010412103811661014432520001000100010001000528004582210401040699377320001000100010001000104044111001100010000102681811105703316104111860710539710731161110371000241801000100010411041105310411053
20041040711001140181040102596591632252000100010001000100052760458251040104069937732000100010001000100010404611100110001000010766310010921212309011083768263610731161110341000312801000100010411041104110421053
2004104071000014412007010441058910392520001000100010001000528084582410401040699378520001000100010001000105246111001100010002103683591102403418106110652811247710731161110211000282201000100010411041105310411042
2004104081010013212003410256638183425200010001000100010005277645823104110406993773200010001000100010001052461110011000100001034727611145124129210954848647710731161110191000252051000100010411041104110411053
200410518100001613114716102506015164025200010001000100010005282045828105210407003773200010001000100010001040461110011000100001036835311177024682109046598487044731161110061000302501000100010421041104110411041
2004104071010014100021210250601711402520001000100010001000528164582410401040699377320001000100010001000104046111001100010000104284891123110296111110440711154700731161110361000211901000100010411060105010581053
200410528100001372410524102521562193225200010001000100010005278045825105210517013785200010001000100010001052461110011000100021032628410917061610811463179163610731161110361000212001000100010411041104110411041
20041040711100127000401025859910292520001000100010001000528164582310401040700377320001000100010001000104046111001100010000101883611092501208511254669463710731161110181000212001000100010531053105310411041
20041040810000150151050102510501414322520001000100010001000528164582310401040699377320001000100010001000104046111001100010000102782981125100231281108752510762710731161110361000162001000100010411041104110411053

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.2086

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e191e2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
50209721035391011106268221704216471915786431717632550955407241017640100100006149082730196149690887183971927652350365657501004020010000702001000072209351140201100991001000030100100001001089731695091065726412921488010973134431341080261025811717914068888884891510000401007203171950720047199472280
50204721325381000006438192608112071956797434719062550915407401017040100100006152652730062149689757190871951655440365699501004020010000702001000072150351140201100991001000030100100001001088711664951064126314921487910951122431233390261015811718224066077082292510000401007192272207719587211571974
5020472039540100000628793172019272138804464718312551025408281014940100100006163992737801049689727201872164654450365577501004020010000702001000071990351140201100991001000030100100001001090111675011065527814921327610950121461221030261014911721034066090899694210000401007212272109721087208972059
50204720405411011006058141752210872128793464718742550985407641015240100100006157512731524149690997214972147651880365675501004020010000702001000071858351140201100991001000030100100001001092311524961065725511894328210953128461131350261016301719954063284886697010000401007196771882719607222071992
50204723245401001006218151704112872238821463716962550950408081018240100100006162512726770049690567214172233655367365727501004020010000702001000072047351140201100991001000030100100001001090121585081067125615924489510932107451201070261015611717304074498495683010000401007208272150720617205372086
50204720445391000006648001720214872213803423718412550910408481016740100100006160192737423149690227192672093655610365822501004020010000702001000072113351140201100991001000030100100001001088111555041065925313878368010958110381101350261025711717034066072895894210000401007221271785719207207371994
50204719225401010006688051696210072092783456718622551020407761017040100100006157252736041049692047205572213656680365724501004020010000702001000071988351140201100991001000030100100001001089911835101066328114901928310945118481331390261016312719754072897690095810000401007213472142720257211472118
502047206053910000064683317123116720208024457197525509154075210175401001000061714027290460496891171869719346542203657555010040200100007020010000719233511402011009910010000301001000010010893115550910683288118886883109611244412810702610157117189940708910100496410000401007214872117720507198372074
50204722175401000006527932672114471899765453717262550960408121016940100100006168372733361049690197202772236655490365735501004020010000702001000072215351140201100991001000030100100001001090311475021066226714886328710947119451311130261015611720254067292891292310000401007202272005720087208472054
50204721405411000006287871712111272083784412719402550905408081017340100100006145182731997049689767217672013655560365980501004020010000702001000072245351140201100991001000030100100001001087511524871067826112891687110931126461161030261025811719594071688492490610000401007207272154719187206972198

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1802

retire uop (01)cycle (02)03l2 tlb miss data (0b)0e0f191e2022293a3e3f404d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5l1d cache miss ld nonspec (bf)c2c3branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)daddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
500297195253800105648271568011271694838714252550680405661012340010100006310812718033314968494718457183765229365434500104002010000700201000071745351140021109101000030010100001010939015748010665282129236836109061351300700025202650127158540540968101299410000400107180771809718627172271914
500247159253800008808411592012871921809715662550725405861012840010100006312002716593214968730719747184665267365466500104002010000700201000071877351140021109101000030010100001010932013449210662271109351263610960116132030002520265021717374055694095898610000400107183071829718137185371728
50024715915370000526825172001007175581971675255066540506101314001010000630956272231230496854671935718756523136559950010400201000070020100007176235114002110910100003001010000101095101654921066224599661768110936123125080002520265021716734052492495498210000400107174271845717697179271562
500247186453900005018601704092717468117160125506904055010125400101000063114527187052049686827174271740652063653775001040020100007002010000718513511400211091010000300101000010109350158491106582721293542331101512412403000252016513271595405369981000100610000400107166971647719307200171881
500247159953901005698301768014471783872715132550635405181013040010100006308362713615304968614717487178465261365436500104002010000700201000071794351140021109101000030010100001010913014551910668275169284439109441241453300025201650137166340492954886102610000400107170571633717627188371734
50024718405380000617844172801167169880671535255069540566101194001010000630386271669630496875071821717226529536554650010400201000070020100007180835114002110910100003001010000101093101515031066228012910524510931130128030002520165021715334051698692693610000400107175071875718157169971805
500247175353800006018301776011671737863716862550675405301012540010100006305872706689204968836717917192565322365461500104002010000700201000071803351140021109101000030010100001010954114151210688272109316839109491331450900025202650127162040568940100495010000400107186171771717967174771692
500247186053700005288551736012471817823714502550720405181012840010100006303592722183204968630716787177165172365498500104002010000700201000071877351140021109101000030010100001010985016249510652256139367227109451341260300025201650117164640488970950105210000400107193571845718727176571761
50024717525370000485844172801407172082071696255069540562101414001010000630512271897720496862671816718396524936555850010400201000070020100007164935114002110910100003001010000101094701454941067026313925483310942137123090002520365011716894048495894484410000400107189171825717157191471987
50024718035390000538826173601167189280471704255066540562101394001010000630459271652130496880871583716766527336561650010400201000070020100007184335114002110910100003001010000101091001465171065727514944502110909136126030002520265021713834048894090093210000400107173371803717647197071799

Test 3: throughput

Count: 8

Code:

  ldrb w0, [x6, #8]!
  ldrb w0, [x7, #8]!
  ldrb w0, [x8, #8]!
  ldrb w0, [x9, #8]!
  ldrb w0, [x10, #8]!
  ldrb w0, [x11, #8]!
  ldrb w0, [x12, #8]!
  ldrb w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3680

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1602092989522000000068008311704110116294498163612042191121312516017680173800008012080015400869129631805749264832928429437954206941716013580220800208022480024294763511802011009941100800001008000010080925204086355861176782288938523086385796124530252473501111151172162129560258007655161111980000801002940029581294282952329342
160204293442221000017300819163211510429353788351213518812270251601658017480000801208001540084312939130694926447294452932293190692551601348022480020802248002829462351180201100994010080000100800001008090919409585585635646189154853528602076714054045130191711151172164229289268008962358510180000801002943229603296052942229391
1602042943922010100169468041688105882920577928817652030220625160167801708000080100800004007831291374063492658429329293879118039497160100802008000080200800002948135118020110099311008000010080000100809381936758948565265013922564910862188381375326501619130005110316442948516800535105549780000801002943229550295062941429577
160204295702211000007323787177613014429614797367177118242066251601608017180000801008000040075413053640674926383293572951093690394731601008020080000802008000029505351180201100992510080000100800001008094519375598186081644149164657458637877313950195696190400051103163229261338006458160710680000801002931129744292982933629527
16020429468221100000663279917121131322942878434917782078177925160168801578000080100800004007761299980079492629529675294939559039181160100802008000080200800002929635118020110099411008000010080000100809171937857128546064512888565457864817471355068536600600051103162329488238006445563812180000801002954129321292302936229381
160204295582210000017224854177610614029400781366185020342266251601718015080000801008000040084912957880684926518293332942693180393511601008020080000802008000029446351180201100993510080000100800001008093903846363850826612089684562985948749125517048080000005110316332956833800656505539780000801002951529317293842933929521
160204292882210000016865807175212010029303787316206420282087251601558016380000801008000040079013009060814926213294692949093330394881601008020080000802008000029411351180201100992510080000100800001008093019387560185455618159519657178654081912156575182195400051103263229249318006361252110580000801002965429632294162953329447
1602042948822110000066728181720117882967278630119531977204825160157801598000080100800004007771297904063492620529573294829158039335160100802008000080200800002957435118020110099371008000010080000100809131933355298545263813909485681863848371385375542419020005110316332970434800666525089480000801002921829371292312930529475
160204293232200000017325803173698132295517963131922187722452516016580164800008010080000400772129402807849261212943829381933903958716010080200800008020080000292573511802011009939100800001008000010080948036658138569569414884365093863717701385140547700000051104164429352318006063255911480000801002927629494293982955629419
160204295952230000007242798170410796293117823561785195321422516015680161800008010080000400804129816706149264812955929403912103963316010080200800008020080000294183511802011009940100800001008000010080915035458518552062217859545138862797491395475569100400051103163329314288005856253911880000801002927829408294882964529400

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3647

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f606167696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)cfd0l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dbddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
160029298172181001000068270798170499144292258403152023214218882516007480076800008001080000400301128631400059492611529330290288938039123160010800208000080020800002920435118002110912108000010800000108097334398613685134716238943457948623682213548334523350405020001416001182919429800533523838780000800102933129103290912923329262
1600242919722020000100698707811680919229255758337195721451867251600588007680000800108000040033512843390007349260692936529146909903914416001080020800008002080000293903511800211097108000010800000108090829396544485571704188793248528552583212352184806340305020008160010102930431800483493418780000800102905429145290442922829180
1600242927121920000000666607961704901242900177234720631897187125160072800658000080010800004003601301078000674926253290862923390350392101600108002080000800208000029362351180021109510800001080000010809253739354778514674421901705024860137121244755509836200502000916001272906719800642993689680000800102915129051291752920629302
160024292092182000000066380845172010610029265805331213619301827251600638007180000800108000040031812855010005549261022918829133897003911716001080020800008002080000290483511800211094108000010800000108093934395514685767747178964450468611688012450234981360305020001016009929095228005734933910480000800102924829354292132909229530
16002429001218200000006397078616889513629009759289238822281965251600638006380000800108000040030712939470005949260292926429238905303927016001080020800008002080000293593511800211091010800001080000010808783636156358582574420902445313857327831325072519637030502000716001192913836800683463428180000800102936629155292122919129400
160024289232202020000074160821170480156289457813211981188718552516007480064800008001080000400372129137000054492610629297291189029039124160010800208000080020800002918035218002110971080000108000001080947373455458860977551790324526485799829124492951473609050200081600892902517800582972798980000800102915929152290982896229297
1600242939221720000000718208121728891482919577733021062056192825160074800658000080010800004003551284127000494926136291032908892570391941600108002080000800208000028979351180021109210800001080000010809301739055008558076925868785115859208571134290484218000502000121600972921427800543083298080000800102920529165289922925628958
1600242906021810100000688208311768961282907278033821372102198025160078800628000080010800004002851276521000554926250290982930191600390121600108002080000800208000029267351180021109101080000108000011080859183445533854247082392728506285801834126472754611803050200081600692897422800623834149580000800102913929282290862913729208
16002429354219110000006830078217289413629141793299220519901776251600908006880000800108000040033212815760006449262062922629281951003918916001080020800008002080000290593511800211091510800001080000010808783435654278588477022875765030860677621234652491635050502000101600972897625800603483378880000800102901129290292052902229209
16002429121219200000006890080717049310829319783305223021391913251600698007180000800108000040034212918310004449261022920429197904803914516001080020800008002080000295533511800211098108000010800000108090439380540685264774248433049858561479912352045035352005020008160071029339308005236634810580000800102897129263291892932429141