Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (pre-index, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2005105480001590001241035111163614325200010001000100010005280445832110401040702377320001000100010001000104044111001100010000101060391152001201141153438115470731161110371000363501000100010411041104110411041
200410407001144000101025169918223425200010001000100010005279245831110401040699377320001000100010001000104044111001100010000101600541126601261181137426121550731161110361000333801000100010411041104110411041
20041040800015500060102501013532252000100010001000100052808458271104010406993773200010001000100010001040441110011000100001010605311347020121141125305114470731161110321000333101000100010411041104110411041
200410408000152180030102517106916312520001000100010001000528084583111040104070037732000100010001000100010404411100110001000010000055111600001241116306122310731161110361000343101000100010411041104110411041
2004104070001601810501025179812039252000100010001000100052792458321104010406993773200010001000100010001040441110011000100001020004411141001681131127245115470731161110371000313801000100010411041104110411041
20041040800018212004010251010561535252000100010001000100052796458311104010406993773200010001000100010001040441110011000100001028037111273030121141128307125550731161110331000433701000100010411041104110411041
20041040700014400040102549222312520001000100010001000528084583001040104069937732000100010001000100010404411100110001000010126047112412020121171137486116550731161110371000273201000100010411041104110411041
2004104080101510106010250994036252000100010001000100052828458291104010406993773200010001000100010001040441110011000100001014006511363026121231139305117550731161110341000333401000100010411041104110411041
2004104071001481510420102511105204125200010001000100010005282045829010401040699377320001000100010001000104044111001100010000101600641157001661171142405132710731161110361000473101000100010411041104110411041
200410408000143160030102501081013925200010001000100010005282445831110401040700377320001000100010001000104044111001100010000102001391127001401141120426115310731161110371000333201000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1728

retire uop (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020972121540100068808031736100719278041171282255083040636101114010010000676103271825414968769716727158964938365321501004020010000702001000071500351140201100991001000030100100000100108881504971061927110922503110894123112133261025111712854049693690082310000401007168071742716157158571661
5020471709536100058908051736120716368161171412255069540636101154010010000677786270482914968511716957190165158365428501004020010000702001000071515351140201100991001000030100100000100109431544911065127089125237109151251129010261015111713004051282277297210000401007188571654716097164771838
502047173453800006280801168010871728815117141925506454056810128401001000067789427176961496858771777717106518236523950100405321000070200100007192835114020110099100100003010010000010010920154531106262597935803410923121111107261015111715164046880787291410000401007166071675717027151971556
5020471859538100063407921776100715287331171496255069540584101204010010000675128271047514968613718367171765178365297501004020010000702001000071640351140201100991001000030100100000100109051595131061327612923523110926133112437261015111715794050492092495910000401007170671597716487173971615
502047162553800005680793174410071626799117148125506904057210119401001000067658927091701496874371763715306500236537250100402001000070200100007167635114020110099100100003010010000010010928156524106112331091812640108901261113318261015111713594054479078685310000401007178671727718487171271486
50204714955370000578078716801327148678211713082550764405881011840100100006774942718949149685807169471696652423653905010040200100007020010000715933511402011009910010000301001000001001088716951510666275139237446108901291128173261015111717324048491491493510000401007158671577716967165871854
50204714705380100535079617521327161982611714212550675406081013340100100006752982703607149685177156671694650153654215010040200100007020010000716943511402011009910010000301001000001001087215450710648267108817444108921181138032610151117160840468101097882910000401007169271635716427177571709
5020471537537000059708111680104715867781271334255077040568101404010010000677885270911014968639716417163965155365259501004020010000702001000071777351140201100991001000030100100000100109061414921062827647932282810902122113734261015111716034050093478887110000401007148171695715967167271717
50204715465370000576080317049671767768117130925507604058410121401001000067516927074721496858171832716926514636531450100402001000070200100007171435114020110099100100003010010000010010904153541106612829884803410875128113005261015111714434047685286095210000401007162471584716817178771699
5020471692535000057307991688144716898012171564255076540644101184010010000676418271568314968635716187171965088365518501004020010000702001000071705351140201100991001000030100100000100108571424761065725011939504010896131112403261015711715814054095098292210000401007166571860718647194571749

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1880

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | 0e | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029722655400040283516402112717927937371573255070040546101364001010000614191272852414968747071849719466516936568450010400201000070020100007205335114002110910100003001010000101086101524921063525913892761979108801207114050025200285137170740516971901113510000400107169771895717977202371882
50024720475370041181117040116716358028671592255069540522101264001010000611817271832714968705071834718936536436546850010400201000070020100007166235114002110910100003001010000101091901384861062925978923019109041348119370025200185117171940492992997105710000400107191971868717367181871904
5002471859539004398171752010071853812647162525506154051410119400101000061295627254830496879507196671739653573654985001040020100007002010000718203511400211091010000300101000010108700150481106572641092074221091411931230300252031851271705405481105996112910000400107178872012720627201972066
5002471978539004348221656013671923799757168425506754047810126400101000061298327211471496877707193972014653433656275001040020100007002010000718713511400211091010000300101000010109570149507106622691189478301087912871180300252031851171637405601137955106210000400107182472009719337192671900
5002471962538003768321712011671970803737166525506804053810125400101000061298427267580496860207188771951654053654205001040020100007002010000717333511400211091010000300101000010108940164511106522646912661610930117710703002520037111718644054810291028112610000400107183271982720027191971978
5002471852538004268051760012071987775757144825506854054210124400101000061258227296981496884907192271931652073655945001040020100007031010000717633511400211091010000300101000010108720170480106202596895502710924122612003012520038511716384051610271008109210000400107201071910719687191871893
500247190053800432873176001087193076564716642550710405341013740010100006117742719699149690470718637199765224365670500104002010000700201000071933351140021109101000030010100001010905012450310622288794930231089113071130300252001851271745405609561078103410000400107185572005718107200171887
5002471992539004418281640010071870759747178325506604050610141400101004961472627272141496895907213071879653023656285001040020100007002010000719303511400211091010000300101000010109280160488106622668902782310923129513703002520017111717754055610721031100710000400107180771824719107184871908
500247190053700393808172811527199577383716122550735405181013440010100006127532719772149687780717087187665345365450500104002010000700201000071915351140021109101000030010100001010882014747410667262695772281094612071210300252003851171797405361046114794810000400107179671766719157199571930
5002471996538004508751704011671982779637177425506554049010133400101000061184027267740496883607198271658652763656765001040020100007002010000718343511400211091010000300101000010108930150478106392689905722910899137712533002520028511714854050810521088100810000400107193571718716927185071811

Test 3: throughput

Count: 8

Code:

  ldrsh w0, [x6, #8]!
  ldrsh w0, [x7, #8]!
  ldrsh w0, [x8, #8]!
  ldrsh w0, [x9, #8]!
  ldrsh w0, [x10, #8]!
  ldrsh w0, [x11, #8]!
  ldrsh w0, [x12, #8]!
  ldrsh w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3643

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160209295122200000000641407941720125140291947863582009198118522516015280160800008010080000400842127921205949261792914329199896039120160100802008000080200800002929735118020110099231008000010080000110080905044552210846777211192080471086501737132431847890035110216112895923800494214287480000801002923629001290362924329061
16020429257217000000061130819172013096292438073471910202719132516013980167800008010080000400802127540805549261672931929101917839202160100802008000080200800002910135118020110099191008000010080000010080900040753880849097821387078422085085793123440848510335110116112920740803834213616380000801002922929228290582915029500
160204292412190000011690008361696117128289208033721950215717492516016080160800008010080000400798128313305749261162924029172893238910160100802008000080200800002916335218020110099171008000010080000010080891042952450849607411290134463585507637129438252160035110116112921737800704663817780000801002925029120294292911529169
16020429216218000000065720794176895112291338323921998190720702516016180160800008010080000400758128958017049258032930729306902339025160100802008000080200800002933535118020110099161008000010080000010080897044358980845218091192378497685734727133454150090005110116112889631800603473266180000801002898229103291082899929087
1602042891521600000006304079717609910429182788342177120842016251601688014880000801008000040076412873190514925941291922912190633904716010080200800008020080000290143511802011009911100800001008000001008088704275764085274734887572503086176759123428945080055110116112898826800573683006480000801002895329332290902920929167
16020429171218000000063010831171210910429203805344200221561996251601528015580000801008000040077812786230494926043292392916490433915916010080200800008020080000291803511802011009930100800001008000001008086804085258084494786992280435585705731132522643200035110116112931934800483963927680000801002908329130291962926929000
16020429275218000000064760793167212113629215778339191422181976251601618014880000801008000040078112869660534926147293312897290093907716010080200800008020080000291473511802011009925100800001008000001008088504614837084857755992434475285272811113450052990335110116112906735800563473575880000801002924929303292262906729102
160204291112180000000653208291712104128290578083451998197618832516015480161800008010080000401562128146505849259602921129219902739287160100802008000080200800002930335118020110099211008000010080000010080934042851120846607721095146460086086699138482645720045110116112896622800423563357080000801002920429155289752912729133
160204291792190000000581808441672107328292748223932016182419522516015280161800008010080000400773127681806649260652922629202910239014160100802008000080200800002899035118020110099251008000010080000010080873044857100849007811088272496886105813123477152040035110116112913725800464373776980000801002911529405291652903429053
16020429306218000000061850827169612014829191804307195719982007251601508015480000801008000040082512858220494926008292442905988503895616010080200800008020080000291373511802011009916100800001008000001008089404304642084786763988448456985581718133459348320335110116112898531800484043807680000801002925029256290542926229271

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3666

retire uop (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002929670222200066910838169686104292528183501912195320342516007680081800008001080000400349128959307149262172913329242931003933516001080020800008002080000296013511800211093410800001080000010808723855596084974664108645848138591675611251205060005019316112929727800585245099780000800102943129414294912926229170
16002429291219000066880805166490212293737673531663179219822516008080068800008001080000400334128844606949262392950229303935803941716001080020800008002080000293153511800211092310800001080000010808974114938085415646148723250608579067911646295038005020117112912933800665825808980000800102947729229293012926229288
1600242942421900005881080116881001562911180034518261674225025160071800748000080010800004003081291004060492642029300290869261039521160010800208000080020800002942335118002110945108000010800000108090438354030856606851389948479485618779118544148430050191171129195288007951950811480000800102904029203290922940629537
16002429457219000067990840170411414029252749329162018272096251600998007980000800108000040032912969650744926281291442931891800393551600108002080000800208000029498351180021109391080000108000001080904354522108538963611883132530685413666118502147580050191161129336318005853950010480000800102930629340292752930829471
1600242935721900006454082414961061242943380834318422193219525160080800748000080010800004003141292647172492613329316291899282039317160010800208000080020800002925935118002110942108000010800000108086137954750852546531189030550285999727119474648470050191171129103258005958760810980000800102953829482292482919929246
16002429323218000070840804168810227629406804325181517802048251600548006180000800108000040034612992991664926304295222934292970395161600108002080000800208000029242351180021109331080000108000001080874388567808563965213894140492285941818127503349990550191171129474188005950352710680000800102946029389293582929829195
1600242936622000006691079417368714829164817312160217751980251600808006980000800108000040032312848551504926544293732920493720393491600108002080000800208000029374351180021109301080000108000001080901385555708552366810874112478885864778129522049383450201161129381278007947349110580000800102915829201291762926929388
1600242933721900006829082216801102922938280733618141754205625160068800858000080010800004003541288538057492613829282292889136039354160010800208000080020800002925735118002110919108000010800000108089439353800852706291286822488085861755109528651680350201261129249318005954350610580000800102959329313294482941029355
1600242960722000006699084717601061562923079132218651595201425160067800748000080010800004003021283802164492640229420294169216039272160010800208000080020800002919035118002110933108000010800000108087739854570850796771290274522585848803123548049733350191171129331418006051656310380000800102928429321293082945229417
1600242931422000005888083317041101042936979833416291741201325160075800778000080010800004003891283767173492612329224291509431039387160010800208000080020800002939935118002110933108000010800000108091036251150845886311595026491085605627120487152830450191161129193318007052852911080000800102926329270294112914829325