Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (pre-index)

Test 1: uops

Code:

  ldrsw x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20051054700010853010241027121952925200010001000100010005279245828110401040699377320001000100010001000104044111001100010000103600861058803812147105628739550731161110191000251801000100010441041104110411041
20041042801000117220010102501517252520001000100010001000528164582411040104069937732000100010001000100010404411100110001000010260296104570261036106258824550731161110371000363901000100010411041104110411041
20041040801000672000101025160114232520001000100010001000528284582411040104069937732000100010001000100010564411100110001000010260247104362221027104546740580731161110211000223201000100010411041104110411041
20041040800000542810101025013824252000100010001000100052828458230104010406993773200010001000100010001040441110011000100001000008110318122027104127733710731161110381000332601000100010431057104310571041
2004105680000090401020104117442122252000100010001000100052812458230104010406993773200010001000100010001056441110011000100001032240601022020031101924626790731161110191000272801000100010431041104310571041
200410407010006424102010250110421252000100010001000100052820458240104010406993773200010001000100010001040441110011000100001020036710857124640106656538640731161110211000243491000100010411041104110411041
2004104080000062160030102510269152520001000100010001000528404582401056105669937732000100010001000100010404411100110001000010240199104850401440107039722470731161110191000292791000100010411058104310561043
20041040800000542610201025211991725200010001000100010005144045824010401056699378920001000100010001000104044111001100010000100000761020000026102633720470731161110201000262701000100010571041104110411057
2004104070000072000201025142331625200010001000100010005282445823010591040699377320001000100010001000105644111001100010000101600551036211614281064431024390731161110211000262701000100010411041104110411060
20041059800000620002161025013315252000100010001000100052828458240104010406993773200010001000100010001040441110011000100001018036510593415042102039722470731161110381000181801000100010411041104110551041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1866

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502097225354000000040683017360128717848083271816255082040588101264010010000615369273045604968682719317193165478365570501004020010000702001000071851351140201100991001000030100100000100109030165483106472831491170141091911021220090261026311717314050011601031108710000401007169572038718737214671861
50204719995381000104478091704110471954786237164425507004059610135401001000061480227170060496890572071720126527636554650100402001000070200100007197135114020110099100100003010010000010010899115750210629277889844211089612431350030261015711717044048010301046109110000401007168871878719407182271986
50204718575371000004007961696112471916790227172425507604060010131401001000061368527306490496865971814719326521636549650100402001000070200100007191935114020110099100100003010010000010010896013947310641264788744141093613831310040261015711717114044410041010114710000401007202871919720077185271853
50204718105380000004168101744010471859806227158825508054060010145401001000061251627219720496887471749717546543936547050100402001000070200100007185735114020110099100100003010010000010010908013849810644269993548141089912221230030261015711717474054011251072104310000401007179771709718927207771841
50204720045390000004707881736112871934797237157925506804066410118401001000061646327296720496898871995720096534336552850100402001000070200100007194635114020110099100100003010010000010010901116349810661260889628221095512931321011026101581171837404801066990112910000401007179772089716067192171838
502047185553900000039884517361100720018092271601255079040596101364010010000614517272689404968994719717185065444365623501004020010000702001000071812351140201100991001000030100100000100108651161490106292581293576271086212541221090261016411717464050810811056111210000401007180471865720597184271780
5020471766538100000370836168811447181376722717512550735406241013440100100006154362721239049688677190972095652853655345010040200100007020010000719013511402011009910010000301001000001001092511615021064128010896138271093913631061330261015611717494050011441041108710000401007202271829718247174371749
50204719345391100004357901728010471943804327169025507104062410134401001000061280827171470496879671989718606529336560950100402001000070200100007206835114020110099100100003010010000010010914015349510632275108994823109191212136039026101561171810405281012979101110000401007206071875719767197271967
502047183153900000045780217200104721517862271662255076540604101324010010000614991272854404968803718337198165327365648503134020010000702001000071836351140201100991001000030100100000100109130155496106522691089172171094013621210350261015711717114047610281129112910000401007189071929716747194372069
502047188153900000048079717281140718257872271520255072540616101294010010000614227273247604968919719007196065236365665501004020010000702001005272088351140201100991001000030100100000100108930163516106232781192848231091313121250090261015811718174052010481039102310000401007202171971719917187971982

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1757

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002971902537200001509818107282112717947881171712255067040550101364001010000630625271490104968593071772717346524136552350010400201000070020100007178535114002110910100003001010000010109282136490106562701288380361089213511231031025201285013137163540532964938100010000400107188471675718757179671846
5002472007541300000531821107122164717108091171617255065040562101264001010000611565272397104968733071645718906528136576750010400201000070020100007173835114002110910100003001010000010108811016649310672278128943439109441413114207002521136501414716624054493493895010000400107193171938719227182971863
500247161953720000150386510728415671811811117157425507454056210129400101000061025827195650496867407177871858653023656125001040020100007002010000717793511400221091010000300101000001010913912548910649273129367829109251282122103002520146501212716364051688677882210000400107177271919718387186571818
5002471681537303000460817106801647166879811716782550695405221014140010100006303542716393049685950717097178365312365502500104002010000700201005171883351140021109101000030010100000101092821355001068029089323245109411373127205002520126501114718844052492689699010000400107189771709718217171071682
500247176753720000053683110720110071963803117169825506654055010139400101000063139727141530496864107181971626652483659555001040020100007002010000716483511400211091010000300101000001010928414448510660267129087836109561393126203102520136501412715934050093098492210000400107191171734718877173371951
50024718495372000005308651074421087182582610714062550660405501013940010100006128352720877049687630717427189065210365683500104002010000700201000071894351140021109101000030010100000101093491615511065228010938823010919133312020300252013850613714394054090693692410000400107164371563717027159071665
5002471629535100100534846107281100716437971171517255069040554101354001010000630377271250804968726071729717486516036560750010400201000070020100007160335114002110910100003001010000010109092146493106442771193078291094113421161030025201285013147151840564974100094610000400107180871903718907169071724
5002471664537300000590796107280144716548081171544255071040462101284001010000630654271064204968554071458716596514836559250010400201000070020100007171935114002110910100003001010000010109313140468106712688892742710971128312023900252014650146716814051693298890210000400107182771671717547174071759
5002471949537200000532839107360120718318031171338255062540442101414001010000629599271239004968703071733716766519836577550010400201000070020100517173835114002110910100003001010000010109152140487106952281494680441091411921281340025201265051471663405401024101299610000400107177471680717917175071572
5002471694539110000506834107202100716697961171347255069540518101204001010000612812272314504968529071700716486529936566850010400201000070020100007139935114002110910100003001010000010109131139474106442651091232291087112221371030025201385013127168540500104679691410000400107182071866718837178971775

Test 3: throughput

Count: 8

Code:

  ldrsw x0, [x6, #8]!
  ldrsw x0, [x7, #8]!
  ldrsw x0, [x8, #8]!
  ldrsw x0, [x9, #8]!
  ldrsw x0, [x10, #8]!
  ldrsw x0, [x11, #8]!
  ldrsw x0, [x12, #8]!
  ldrsw x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3657

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160209296652191000010636781916329422029063764319192718471872251601688014880000801008000040075912901451534925996292362912290753926816010080200800008020080000292163511802011009910100800001008000011008091018417544284631677138904052008578768511948624375180005110316332914825800723303647880000801002926429232294412909929143
1602042930121910000106549829153681136292337933701864191018902516016380167800008010080000400782129645317149262442921229377943439277160100802008000080200800002918635118020110099101008000010080000010080902204025870848517401691842481385976747130482250981801005110316232933529800723493399880000801002896429387290902898129306
160204290332181010000705486517849211229201793330207522861817251601608014980000801008000040084312928731654925990293472935489793942816010080200800008020080000291573511802011009922100800001008000001008091416476524285468715159353049688582472412046955806161505110316352928230800633843349580000801002930729039291182909729068
160204290712211100000665084115848710829324823377210620891961251601558013780000801008000040075012912821684925884292802931191333922416010080200800008020080000291353511802011009961008000010080000010080907154125537850647261291458480686068785118507649741802405110217332924738800683803388480000801002954529201292492921329148
160204294152181000000624578217129210829298775350222923571908251601568015880000801008000040079012967900514926211290102898890773907916010080200800008020080000294503511802011009910100800001008000001008090318377581785776694179417858838619469313347794613161615110317332911333800704253318380000801002931029291292962934129538
1602042935322010000006650836164811410829140784379196320471920251601508017280000801008000040079812876320604926012291252931890273910516010080200800008020080000291053511802011009914100800001008000001008089717409529084783716178544658538557781611848635077160505110316322928125800674173497480000801002913129172289312949429191
16020429152218200000064927791672769629006778411202222081711251601638015680000801008000040073312849920594925978290642929590233938716010080200800008020080000290873511802011009911100800001008000001008089134338629585156762138666449258604677311950665501170205110316332918920800653113609880000801002937929262291212929529158
160204293162201100000663679917049415229503798385214620691953251601588015780000801008000040077512858080724926061290662931191803912616010080200800008020080000292073511802011009911100800001008000001008090336408515284938647138784852428566076213752435131350505110217322904625800653453338780000801002903829379292432920828926
160204295652181000100646978817207315229311784438197020002184251601498015580000801008000040078112916400594926147293772961092463932116010080200800008020080000292513511802011009910100800001008000001008091136388513884584688189124053528599980013245615161140305110216332930725800453893509180000801002914429228294332917229186
160204292212211000001693885618089014829095796449212020471830251601548017380000801008000040070512879430824926100293422925994223927716010080200800008020080000292873511802011009917100800001008000001008090414408541585160744129014651088575880911950255057363005110316332915123800643734079180000801002920629281292112909629011

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3677

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600292973322000068537861728127124293838073791655153020102516008080069800008001080000400333129742065492631429397293989647393071600108002080000800208000029280351180021109321080000108000001080938415550285428639893568526286034682139489855190750203161129496468006461160311480000800102956329512294362956629534
1600242935222000066428481728116124295508273451704165220222516006480076800008001080000400330129947476492646129326295569551394191600108002080000800208000029394351180021109471080000108000001080938388564385551665692870484586181754119445549773450201161129418298007259561512080000800102946629421294832939529314
1600242939622000064218611696110108294978093681782191021822516008480062800008001080000400362129156471492636329465295259825394631600108002080000800208000029483351180021109391080000108000001080906367520085321674119037252358606068913447595392375020116112920634800636185629480000800102946429474293342924029606
160024293592200106359835170413796295298003981781214120142516006880065800008001080000400370130062877492653229590293389400395621600108002080000800208000029532351180021109481080000108000001080952399512185238681893528490886213765138473652203450201161129544468007163761710380000800102929729425294532949629552
160024294932210006764828174412776295807893991692182622772516007580077800008001080000400341131021455492655529252293509579395161600108002080000800208000029360351180021109511080000108000001080912373518085246628892268537485887753118510949490750201161129129238005963761112080000800102939329349292672930229454
16002429364228000641683617281421122941981139116491806220425160069800738000080010800004003151293771604926431294782939592613940516001080020800008002080000293773511800211095010800001080000010809224165434850767121390942504586391808133484053470350201152229552338007064859111480000800102957629474295702951429466
16002429296221000664785017601251002958581135816201844211325160072800678000080010800004003391282520644926298294552944293753926616001080020800008002080000295563511800211093010800001080000010809223915469854036541293246537285911816139543251450350202162129381378007462061611880000800102934229507294652953629613
1600242945622011167258241712131124294677993681752172822312516006980071800008001080000400379128749761492637629292294099396394351600108002080000800208000029436351180021109471080000108000001080914387534385220673995074523485878736131484955210950201161129463438006560659810280000800102948029333294852949029462
16002429528219000672581817201242922959881141619241817211725160074800808000080010800004003391312761584926122295352954394653937216001080020800008002080000295043511800211096610800001080000010809284355587857986601190444526786508831135522147060350201161129343388007871659110080000800102947529357295002947029291
16002429518220000669782117041131402939979936418342075242625160077800798000080010800004003361299745664926345296542968595393935316001080020800008002080000295513511800211095410800001080000010809023855618856806031193068495386117820140548554610350201162229593388006864763010080000800102952429292295342935229605