Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (post-index, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200510588100005421103010251041291925200010001000100010005284445824110401040699377320001000100010001000104044111001100010000101800481020000101710233922856010073216221037100018181000100010411041104110411041
200410408000105820006410250310111425200010001000100010005282845824110401040699377320001000100010001000104046111001100010000100781681035190176241030444224871073216221037100028191000100010411041104110411041
20041040811000511410581025172651325200010001000100010005283645824110401040699377320001000100010001000104046111001100010000102172821040801102710162842140730673216221037100021251000100010411041104110411041
200410407100006714104410259287162520001000100010001000528364582511040104069937732000100010001000100010404611100110001000210278242104140216221030363233271073316221037100023251000100010411041104110411041
2004104081010060141030102510363162520001000100010001000528364582411040104069937732000100010001000100010404611100110001000010237241104431406201035342196461473216221037100021191000100010411041104110411041
20041040710210411410501025112831925200010001000100010005283245825110401040699377320001000100010001000104046111001100010000102180321036150146261024355224071073216221037100021191000100010411041104110411041
200410407101005000050102505772025200010001000100010005283645824110401040699377320001000100010001000104046111001100010000104370411035301420161024394264871073216221037100021251000100010411041104110411041
2004104071110058141050102517397132520001000100010001000528444582511040104069937732000100010001000100010404611100110001000010086033105935200221016333224063073216221037100021191000100010411041104110411041
2004104081010056161040102511456152520001000100010001000528204582411040104069937732000100010001000100010404611100110001000010288048102732326221029294294070073216221037100021211000100010411041104110411041
200410408101004920106010251537141325200010001000100010005283245824110401040699377320001000100010001000104046111001100010000101982651028203112291034364234071073216331037100021211000100010411041104110491041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1873

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50209720715394000000041508111704527271986781407167225507704064410146401001000061505527279150496889371936719986529903655065010040200100007020010000719563511402011009910010000301001000001001089612148485106552911190912839109061478133438261035822719034052089889289910000401007187771768719437196371843
502047199853940000000475079016886144718057913371493255079540632101354010010000613857271394304968720718427189165401036546550100402001000070200100007192935114020110099100100003010010000010010885415048010660262129017232109081158130303261025822717104050010191076100510000401007197971750719697200671717
5020471824537300000004830768173631007193276941717242550765406601011940100100006143352715780098689267191372013655180365695501004020010000702001000071924351140201100991001000030100100000100109344151508106352891093044381087712861221372610258227170640512954930107710000401007173271958717567189371811
50204720925362010000050108221720311271844792437146925507254056010127401001000061470127246880496881071914717916520703655945010040200100007020010000719753511402011009910010000301001000001001089421625151064226512944484510891126912420926102582271678405609581074101110000401007174371892718867200371922
50204717195392000000039708461632322471879788217184625507404063210133401001000061483727228680496873271870719976526803654635010040391100007020010000720953511402011009910010000301001000001001093661674671064926810883723110903120811733926102582271501404889161080103910000401007175071852716927167671912
50204718165393000300047108421688110071861793237142925507054057210146402611000061467427207680496894771942717966540603654695010040200100007020010000717393511402011009910010000301001000001001089341514841064427212906683510917125710720626102172271766405081057101099510000401007177672176719957197572025
50204719235393020000047307881680311671843789427145425507654057610137401001000061569327256150496860771984714736509903656835010040200100007020010000716603511402011009910010000301001000001001091331665141060327611920683710935128513822326102582271701404681034988106910000401007176872061720147171971921
5020471875538200010005220826269621207169978513716572550765406241012040100100006140882719523049685887201771844653730365599501004020010000702001000071889351140201100991001000030100100000100109311015348610636281129527835109641265121233261025822713654050491994095810000401007169271948716977200671733
5020471996539200000005230819168821167201181122714772550765405721012840100100006127092717193049687507171071892651090365566501004020010000702001000071872351140201100991001000030100100000100109123142481106392791390180311093812761282092610258227161440468933108398410000401007195171998718767170971683
502047154753720200000459082916563128719127934371679255068040544101254010010000613048271828404968837719097181965246036571350100402001000070200100007165835114020110099100100003010010000010010879316152310653266149284825109291335120103261025822714524050099797597610000401007179571843716777185371838

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1758

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029717355370000006170822107200152717517994271584255070540494101354001010000629831271499814968505718707182565134036546850010400201000070020100007181735114002110910100003001010000110109160154481106832831492578431091814561300502520076533717694053693893894410000400107187071848716277187172034
50024717875380000005630801107446927184978652715522550650405691012740010100006317592719608049687967176871915651780365518500104002010000703771000071712351140021109101000030010100000101088301634941062426013879824310908126613005025200265227171240528866100690410000400107182271854718357185272042
50024717155370000005550813107041100719427504171495255064540538101344001010000630108271831804968680715937177465202036554450010400201000070020100007184835114002110910100003001010000010108620162490107002561187034411097012741220502520026522715504054896897893810000400107168071765717247167571888
5002471763537000000577080610712010071776788517147025506904052210136400101000062932427166590496868971826718126531703656125001040020100007002010000718803511400211091010000300101000011010886013850910623268989180351087913331270202520036522715724051295095493210000400107190071817717987183671838
500247171053900010059008151068811487172377552714892550685404901013040010100006305752716803049686007188471647652340365494500104002010000700201000071837351140021109101000030010100001101089201494731063926111902783510889122513006025200265227145940520934938102610000400107180571629718107181171863
50024717025380000005530825107360108717147644371442255062540530101284001010000630955272066804968697718727186665026036541150010400201000070020100007155035114002110910100003001010000110108950152506106362571186786401093611841290312520026523716024054495688498210000400107167171716716937160971731
50024716165370001005700807107680108717508064071510255068540522101234001010000631069271455004968610718347172965140036526750010400201000070020100007159735114002110910100003001010000110109300165534106242671490438451089813241310302520026533717434053295684491610000400107171071904717907189871842
50024717445380001006140826107600152718277623371449255066040510101204001010000629615272164704968672715837172465258036539050010400201000070020100007154235114002110910100003001010000110109300147498106362441288080401088512051170402520036522716244053292088683210000400107188771559718397171571623
50024716975370000005080828107120104716707784071473255063040498101314001010000629803271381604968507718387159865372036561550010400201000070020100007179235114002110910100003001010000110108730136497106382731489378401095812231250302520026533716494054890690495810000400107181171773718237183571828
500247176853700000060907991068811167172678414271394255064040510101304001010000629677271931404968755720047178165405036540750010400201000070020100007176362114002110910100003001010000010109290156516106802591290380491090013441190902520026543716584058098895884410000400107183971745718157189471803

Test 3: throughput

Count: 8

Code:

  ldrsh x0, [x6], #8
  ldrsh x0, [x7], #8
  ldrsh x0, [x8], #8
  ldrsh x0, [x9], #8
  ldrsh x0, [x10], #8
  ldrsh x0, [x11], #8
  ldrsh x0, [x12], #8
  ldrsh x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3653

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602092961222110010067967991688103220294157874152091235419472516017680168800008012080017400835127450914749261512924129106919806912416013480220800208022480020291433511802011009922100800001008000001008089136394569208563769598405051628645769112554565322509181115117116112923025800793703508780000801002914129173291972919329255
1602042928421930310062497621656941322922882645619571867197925160161801668000080116800144009051293791159492608329496290459313068975160135802208002080224800202946135118020110099810080000100800001100809346739053700857027139863384828861556951075156556471381115117116112936429800843233107980000801002917429091290442912629382
1602042940621940000067957941680999629206752429205019251800251601748016980000801248001240088212885701504926082291282933591560690481601338022080020802288002029244351180201100991110080000100800000100809525340549300855637078890384700860997681325239510469041115117116112930231800742963647480000801002921328927291492922329369
16020429392219400000719079517129214829386780493203020251862251601688017380000801208001540085912965100574926235289192919691040239560160125802208002080224800202929135118020110099181008000010080000010080919513585459288635971412881385027858787081335519516549301115117116112924520800673893547980000801002921629492292112924529032
160204290132183000007147806168094100290387884432101204120052516018580181800008012080020400860128294416549260812922628954923703919216010080200800008020080000292283511802011009916100800001008000001008094768415549808537371015874285793857867281314843538251330005110216222941436800523393649280000801002922729258291712926029213
16020429258221303000709578016881161162942377348320262069212925160164801558000080100800004008061299618066492606029312292299118039083160100802008000080200800002952335118020110099161008000010080000010080891693416178085535667138633849508594879413353815374651150005110216222915538800603183866680000801002939229468292162932329326
1602042948721840400072347781672991162941278340321892049180625160156801458000080100800004007661286384150492622929123292019179038935160100802008000080200800002932635118020110099111008000010080000010080890583875585084912701118904256058616477113255565412513300051102162229055278006140730810480000801002905729115290532907629195
160204291422173300006708787166485132293438015032211209819492516016580163800008010080000400753129285518849260352911929542919903906416010080200800008020080000294493511802011009913100800001008000001008096575390567808554875815894485519862848331245453485855040005110217222914130800513993699580000801002923029371290452913529252
1602042955222030000073338151712861362926977249720031938211625160163801638000080100800004007621291719064492617329520293659450039232160100802008000080200800002919735118020110099151008000010080000010080924643655561085526680108811025142864007011294828539269360005110216222907026800643493569580000801002916629348294042936429317
160204289352203000006237781169683128290368194951938201419512516015480167800008010080000400787129294605649260922919829505893703910416010080200800008020080000293973511802011009913100800001008000001008094343427514908484074412910685409865026971274904538667000005110217222913729800553933268980000801002943729254294442952729191

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3675

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d0 | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600292973422001000106809789169699112298077963071872190220512516007480084800008001080000400299129608105749261062963129420928803933816001080020800008002080000293713511800211093010800001080000010808801837960168530062311929365437860028161094861529417155020162516009729406348006157153010780000800102950929248295242940929323
1600242946522001000006765808173690140292038113091850182521262516008680078800008001080000400373130190006449262472926929238919703928916001080020800008002080000291383511800211093510800001080000010809131837451978520170212898305200860527531224904459718045020157150071129373208006362744312780000800102942029317292982927929266
1600242942322101000007614864174410418829423795329186317122148251600788007880000800108000040032413084500644926452294812944592740394101600108002080000800208000029583351180021109281080000108000001080986173726044854706501191976540686472812123554446261664502015131600772921029800695635059280000800102934429222293572954729239
1600242937722001010006827793172811322829361771350204420362047251600648006080000800108000040034612904100594926227294032921694150395601600108002080000800208000029587351180021109381080000108000001080894193905383861136541287280522685951743131535052661607502015816005132937120800615644889680000800102936729362291602943329268
16002429391219011000070027771768921162927878930618131919236925160068800778000080010800004003511303069058492634629388294609466039481160010800208000080020800002945735118002110941108000010800000108094820441600985278695148533457768589275313950665018180750201551600862923429800795135259780000800102921729467294362940129302
160024291432190100000673178117128421629258790307180418142134251600728007880000800108000040042412872350704926146291632956793760394971600108002080000800208000029250351180021109361080000108000011080920184445482849716961088174538085709712132484742491704502018916006729256358007850355210580000800102944629561294732972529607
160024296312210110000654781117041091202935681632517521961195825160075800878013080010800004003581293406062492637029571292959688039423160010800208000080020800002925435118002110928108000010800000108089118367525885108650138652451768578576513949924752156550201581600792945126800735615328980000800102932329566292472928529210
16002429412220010000069428091688115148293438033141676179220402516007680064800008001080000400321129840207349263182955229359928203962516001080020800008002080000294273511800211092310800001080000010809501837962128505062011882305368860427741315040535818385020159160013929429248005056045810280000800102940829511293992939029289
1600242951222011000007028813164899196296077983381910185221442516006780073800008001080000400366128887805549261612928329324927003942216001080020800008002080000292333511800211094210800001080000010809261838656898550865612904724929855527151244753458718135020151016006829324248006448652010580000800102936929361295662943429340
1600242940422101000017366777171212024029353791341177317872198251600878007780000800108000040036613007620654926367295612924994120392911600108002080000800208000029502353180021109401080000108000001080922184025321854026291286648509785686793128501553411803502015121600682921824800535775579680000800102929029419293672941829362