Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (pre-index, 32-bit)

Test 1: uops

Code:

  ldrsb w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e2022232b3a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)l1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
2005104081020167131001010250115172520001000100010001000528284582401040104069937732000100010001000100010404411100110001000102280801030201063510474474055600731161110371000314091000100010571041104110411041
200410408110006920000201025712321252000100010001000100052844458241104010406993773200010001000100010001040441110011000100010288041102200002310204172239620732162210371000271601000100010411057104110411041
2004104081100053000044102501351725200010001000100010005284845825110421040699377320001000100010001000104044111001100010001025805210393013124010373353748630732162210371000211901000100010411041104110411041
2004104081010053100001010250114242520001000100010001000528324582411049104069937882000100010001000100010404411100110001000101780501040401703210272252539630731161210331000333201000100010411041104110411041
2004104071010055140002410251224111725200010001000100010005283645824010401042699377320001000100010001000104044111001100010001024903610321014102110272942947730732161210371000241501000100010411041104110411041
2004104081110061200002010251610210252000100010001000100052836458251104010496993773200010001000100010001049441110011000100010078044102300102410322852447710731161210371000292801000100010411041104110411041
200410408111005328000201025133141425200010001000100010005284445824010591040699377320001000100010001000104044111001100010001008815110330015122510312151947640732162210371000363501000100010411041104110411041
20041040711100532810050102516222122520001000100010001000528364582401050104069937732000100010001000100010404411100110001000103360901034601203010212253119700731162210371000251901000100010411041104110411041
2004104081110053261001010251211516252000100010001000100052844458240104010426993773200010001000100010001040441110011000100010077057103300003710233953363710731161110361000293101000100010411041104110411041
200410408112006014100101025141522325200010001000100010005285245825110401040699377320001000100010001000104044111001100010001007705210340000291021505284773244732161110201000403101000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1982

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f181e2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
5020972281538100000048780717201124720067935171914255077040648101344011010007613195272888404969087071830720106544976553850117402321000870256100087184435114020110099100100003010010000010010919115551410658258990210834109021315131133111261901600717424056810411071109310000401007183671995718957207372036
5020471994546100000046780517680112718057974171780255083040628101334011010007614276273260904968851071958719486544176568450117402321000870256100087198635114020110099100100003010010000010010910014647310622262128974835109141285129004111261901600718794051211141036100110000401007202372055720647181272064
502047202153900000004698301704011671987778557169025507154062010145401001000061410627246450496885907196571986653543655685010040200100007020010000718963511402011009910010000301001000001001090701554781063925988987438109421191213600500026101571171724405769321151111910000401007197672252719797199771984
502047195753900001004038271720011672004812817182325507954062810144401001000061639627303280496880907205471909654783657715010040200100007020010000721733511402011009910010000301001000001001092111604741065927012943504410933132813813300026101641171725405201057999108710000401007211972140722267218471980
502047191254010000004008221672211672006801967181425508454065610134401001000061385127322890496909907202371940652623656135010040200100007079010000720563511402011009910010000301001000001001092811525081064328578904242109331174133107000261016511715574054011271140101010000401007182872019719537176471905
502047187054010000004458301672114071931831567157825508254070410148401001000061437827325050496879307202772058654953658095010040200100007020010000720543511402011009910010000301001000001001090511574851064026099069256109531317124133000261017011717434055210171028116710000401007183472213720687206371931
502047227453910000004408011696192719457959771843255074540644101424010010000617194273416904969083072175720026536936553550100402001000070200100007210535114020110099100100003010010000010010930115448310651266129187441109061336138110000261015811715324050411221143100610000401007208472198720117193271998
50204717195380000000421844172801167190078955717952550825405841013340100100006157652730425049687490721147204565409365597501004020010000702001000071998351140201100991001000030100100000100108800163478106792691591042341093412810121004000261015811719164048810731184105210000401007182572238720007201671871
502047187653800000004328321704111671883800337174525507704064410134401001000061479327310210496891307177571780654703656795010040200100007020010000719913511402011009910010000301001000001001095401545121066328311904463010919130413700300026101631171885405601144103094410000401007184672001719657199271985
5020471881538000000041680518000136716467914171962255079040617101294010010000616032273330904968752072066720426556936567850100402001004170200100007199135114020110099100100003010010000010010916115447110665244790912629109031348118103000261016411718214057611551154110410000401007205772265719567207472067

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1914

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f181e2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
50029721925373000000556829172821407175682243716252550715405781014840010100006313992722918149688107200071758654993656755001040020100007002010000718653511400211091010000300101000001010960315253810677258895138341095812171261165002520265117166040576100099092410000400107175972142717377190871886
50024718145392000000581866174421287189281244717202550685405661014540010100006307452720045149688407191471769652903655815001040020100007002010000718883511400211091010000300101000001010939314851910659266129474831109591337140333002520265137177040552100299890810000400107190271895718637193071706
500247180353830010005748541736310472064827537167625507154054610147400101000063199427239921496876171914718966525236562750010400201000070020100007184435114002110910100003001010000010109516132509106682841396648351095813771442350025201652271383405721036900100410000400107200472020719177192672053
50024719635403103000546851174421487191582743716552550745405901013240010100006317632719286149688787201871898653773655395001040020100007002010000719503511400211091010000300101000011010952314551810676305119895043109741397135305002520165117175940621992870102810000400107191771870719277209372015
5002471904538300100060584717442116718458354271683255067540602101464001010000632109271811214968835718417176665323365664500104002010000700201000071967351140021109101000030010100000101094861625061069326799587438110111318138331100252016521717874053699099499810000400107183672103717897185771788
50024718495393000000527837173631087182285652715152550685405421014340010100006308672721853149689217197171801655513656545001040020100007002010000718733511400211091010000300101000011010953313555810642288139634040109391451214020300252016521718274054497293491410000400107190672032718337196871923
50024719155373000004571830175231487178682653715202550700405501013740010100006306112721070149688697191771881653173654195001040020100007002010000718373511400211091010000300101000001010950316952810693268129224639109621417141339002520371227150740588932101693210000400107190771791718657201971879
5002471949540303000059386317523140719308025371522255074040566101594001010000630929271799314968768719277178665330365489500104002010000700201000072131351140021109101000030010100000101090361774971067128710940343710924131714130300252036511718704056410281100105210000400107201271895718457187071942
500247201253830000006108631736214071865819527170425507404055410152400101000063054027208951496885871799717166535936540150010400201000070020100007187535114002110910100003001010000010109631015352810669291994436431095814871302330025202651371836406001056104095210000400107217372067719117188471776
50024719575403100000534852174431127187081243716632550695405861013540010100006318652728360149689137189671918654363655545001040020100007002010000719333511400211091010000300101000001011003715252010685288109475035109511307144239002520265227183540576967958101410000400107184371922718637199672110

Test 3: throughput

Count: 8

Code:

  ldrsb w0, [x6, #8]!
  ldrsb w0, [x7, #8]!
  ldrsb w0, [x8, #8]!
  ldrsb w0, [x9, #8]!
  ldrsb w0, [x10, #8]!
  ldrsb w0, [x11, #8]!
  ldrsb w0, [x12, #8]!
  ldrsb w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3648

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0f1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1602092974521944406677787172011510029321789290231219121974127160176801758000080120800164008521276267053492601529110290499169692131601358022480028802248002429092351180201100991110080000100800001100809587444752838532276714907464817862087671174761451866101115117016102926134800804083596680000801002920829038291622918629099
16020429132218414065507971728117962916780033420572160184125160169801668000080124800164008381294677062492596528863291819044690341604938022080020802248002429355351180201100992410080000100800001100811096339047718478471812896505090857576991364248557573000005110116112925221800553714366180000801002926229077292582922829197
160204288262164000683283117361061002942580031718942144184059161375801688000080270800004007501285362063492599729138291679127389651604778020080000802008000029164351180201100991410080000100800001100809856744446388417875915871464690855457651404659478371030005110116112923527800563973725880000801002890929214290782923929145
16020429254219400060878271720861522905379236918531908198625160158801468000080100800004007851277374057492582329203290659150390961601008020080000802008000029043351180201100991610080000100800001100809776840055768496173513898384761857366881374385515272430005110116112918445800543973396280000801002926829129290732915429114
160204292012184000664382917521191442903880632319092154205525160153801608000080100800004007331293426055492633429188293359180390781601008020080000802008000029131351180201100992310080000100800001100809547241955658480078211878684712854278131264715556463000005110116112942827802404284245880000801002919729213293052934029473
160204293362194020649183716961101482916477137122072085185012816016680328800008026480172400773128648506849261422916829087915939173160100802008000080200800002933435118020110099910080000100800001100809596744257698465379311935744579855508251335011551571090005110116112906534800564033936080000801002926429150292212921029205
16020429063218440067557951736111922913177234819841906164725160169801618000080100800004007471286400051492604129299292279039391761601008020080000802008000029099351180201100992410080000100800001100810126742648838477874814917464839861638571394445517370030005110117112904128800473893786880000801002913029014290802910829306
16020429263218400067808231744981002920281635821202269194925160160801608000080100800004007781286014061492604229094289549116391531601008020080000802008000029194351180201100992210080000100800001100809787242356468507775712930764994861107531444100506971130005110117112912731800664054126980000801002913729125293022950829228
1602042915721940006158785172895962903180833119661989181225160159801538000080100800004007351292477049492604729141290989099392271601008020080000802008000029362351180201100993010080000100800001100809457248547298489778511897464971855197631125086528372330005110116112911435800573814007180000801002915829276294212940029254
16020429017219410066658411704120108293587682972068233320992516017580161800008010080000400751128722506449260812905629078898639057160100802008000080200800002909535118020110099301008000010080000110081009634584972846317148921704500852957551164392495870440005110117112908830800553984516080000801002935929288291462909929149

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3664

retire uop (01)cycle (02)03l1i tlb fill (04)l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)cdcfl1i tlb miss demand (d4)d5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1600292950122001100718187517601051162936875861221132204199825160074800688000080010800004002821301672164492626629450294049248039448160010800208000080020800002950735118002110991080000108000010809913944148218596775516936385728861007861445054540837230150200121616929466328006440235310080000800102926229209291292930029359
16002429417218010007574812171210410429278819522209021101985251600778007180000800108000040029013029030584926418296042946593240393621600108002080000800208000029404351180021109181080000108000010809453637252548595274116880725436860158451225584504036030050200171616122934536800683974008880000800102944329368293382945729212
16002429309219020227812824174497112294107955442214207420312516007980076800008001080000400340129963217949262552935029408944803936816001080020800008002080000295483511800211098108000010800001080928413845478856567231493172512885824842140575949753725005020081617172958140800573774339380000800102936529469293472942829435
16002429188217022027117800172810211629407785593211621242013251600938007380000800108000040034313006571614926237292652937693320390391600108002080000800208000029292351180021109141080000108000010809323738156578529968717884385473863738031415411579516350050200141616152928735800703803518680000800102930529240295832932929373
16002429342221010007831837177697100292298105361940199218702516006880068800008001080000400367129695515949261642936629274922103922516001080020800008002080000291463511800211098108000010800001080885839158718524766816912725677858107561415518535618030050200161614162937735800593303829580000800102922029252296502937429330
16002429489220010017120823170498136292227815312198224419372516007780075800008001080000400345129388316249262742931429047901203942416001080020800008002080000292803511800211091110800001080000108092816421577585000739159273656298655382314953695447172300502001716171429267228006039742010380000800102916329225291612940929370
160024292492190101075478391744108802937680262622112137187325160063800708000080010800004002661296128150492615429388295749200039439160010800208000080020800002953135118002110971080000108000010808871838553398515171213953265061860968631375420545636060050200161614172950326800472943718980000800102933129207293652942629340
160024293722180220066498302792937629520781540181822622212251600658005680000800108000040035712923111624926067292642931393590392661600108002080000800208000029323351180021109151080000108000010809613937549968527071519966846072860408571275336507437870050200191610172909238800524034139480000800102952129273293692928229185
160024292962210200278068071752105104292658345822124218619492516007280068800008001080000400333128601215749261272943629424932003947516001080020800008002080000293073511800211091210800001080000108098438417555885947725109293459268658482313257735280366500502001716161429337218006937434510880000800102945629111294892925229276
160024293302200200069148401752120112294078125702191212420472516007180067800008001080000400343129023517049262902946529182921703928016001080020800008002080000292303511800211095108000010800001080976393925678856537481686534568086160777136491049293731700502001816161029306288005935138010780000800102922729383291642930829340