Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (pre-index, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200510568006321104121041012941725200010001000100010005282045824110591040699379220001000100010001000105944111001100010001020038210306116240331058648246407311611103810004036121000100010601060106010601060
20041059810843410001039191125142520001000100010001000528364582301059105969937922000100010001000100010594411100110001000103800601046441200131052498216407311611103810004553121000100010601060106010601060
200410598006418000121044007320252000100010001000100052832458241105910596993792200010001000100010001059441110011000100010000064103212000019104455975607311611103510003845121000100010601060104110601060
20041059800732010001044112123272520001000100010001000528124581611059105969937922000100010001000100010594411100110001000102400611018602806121045436197207311611103810003445121000100010601060106010571060
200410597008328000010441701761425200010001000100010005282846921110591059699379220001000100010001000105644111001100010001038017610341022012171042615153207311611103810004038121000100010601060105710601060
200410598004012000010441101001325200010001000100010005283645824110591059699379220001000100010001000105944111001100010001026008510406020012131044439155607311611103810003733121000100010601057105910601060
200410598005916100010251922731725200010001000100010005284845828110591040699377320001000100010001000104044111001100010001012613910393016012141046544236307311611103810004042121000100010571060106010601060
200410598006422100121044002562125200010001000100010005281645824110591059699379220001000100010001000105944111001100010001016007210299015320191016501094707311611103810005442121000100010601060106010601060
200410598004420101010391302892125200010001000100010005282845824110591059699379220001000100010001000105944111001100010001034005110351020012261031437249607311611103510004341121000100010601060106010601060
2004105980156201000104411063252520001000100010001000528124582401059105969937922000100010001000100010594411100110001000102000881011002600151055519327207311611103810003436121000100010601060106010601060

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1844

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50209719925380000051708701728010072040841117142325507884066810135401001000067748927131320496883271601718516531036565350100402001000070200100007177135114020110099100100003010010000110011015149518106632677951502910942138212307026102511171800405649201090111910000401007178371853716727182271907
5020471963539000005390833176001367178580211715092550780406361014440100100006780952723411049686337183771919652793655445010040200100007020010000719443511402011009910010000301001000001001092515651110646295894870221092614021353902610157117156440524896988101910000401007182372058719437174671793
5020471811538000005260861176801407181881411714932550755406811014440100100006807772721265049689437184671907651613653235010040200100007020010000718523511402011009910010000301001000001001092916149110657269149043831109121442133030261015111717304052493883090710000401007173771834717547172971850
50204718255360000050708251744213271666812117165025508184062410121401001000067779027129910496872271740716686528036549350100402001000070200100007193435114020110099100100003010010000010010900145520106562831192644371091913411293702610150117168040571994101095710000401007182471734716857178371786
502047179154900000553081617040100716948322171449255075540630101444010010000695355271506504968614719697183865116365444501004020010000702001000071855351140201100991001000030100100001100109541385091066427210931803210975128112537026101511171633405289321076110310000401007179971873718867190771896
502047186756400010495084817120116719638141171625255078540616101424010010000678566272463104968788719207184365251365438501004020010000705441000072044351140201100991001000030100100000100109001474901067727969205022109731351144090261015111717074054489291290910000401007194571783718517177371835
5020471860538000005180854172001007182382421715152550790406481012540100100006765502716751049687567182371901652283655325010040200100007020010000719553511402011009910010000301001000001001090414346610682286139183428109921331131070261015111717974052099298490510000401007190771995717297170971745
502047178753800000530083617201144718388101171608255079040660101324010010000677665271866314968631717377174165207365235501004020010000702001000071872351140201100991001000030100100000100108951535051065427999125032109471451127070261015111717584054882891488910000401007179971675717787176271714
5020471799538000005360814174401047184181711717422550735406001014740100100006786412723282049687127178571695651443656245010040200100007020010000718573511402011009910010000301001000001001090016849010649275991646281094912911413802610151117154140504100093098310000401007193771755717077186571896
502047189953900000554083217120108718418092171554255073540673101224113010205624881273359604968709719167174665410365450501004020010000702001004271757351140201100991001000030100100000100109651394971065126999275429109431391132070261015111716524051294086498710000401007184671937719247187371825

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1933

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029722005403101010043581516885128720848106071795255073040566101434001010000614230272999414968842071854720126562503656895001040020100007002010000718413511400211091010000300101000001010917415949810651268119197227109401234118319252317711719717744054411281131118710000400107192471743720497207271915
5002471752539310100004668451736611271957813237179425506504056610146400101000061298127326281496889807193371893654020365449500104002010000700201000071958351140021109101000030010100000101088831544971064123489214625109311359126319252312851512717694052411131130103910000400107189372020718957196771852
5002471934538310100004108081728414071882861537191525507454057810149400101000061411527275470496870907187971925654790365694500104002010000700201000072025351140021109101000030010100000101088431505111065827610923962810963125813033425251571141671868405809401089108010000400107198871848720547207872011
50024719895393101000045979117127124718147964371694255073540554101514001010000612777272324904968842071883719176540803655135001040020100007002010000718593511400211091010000300101000001010905314150310642271118993831109031268119314252314711316716124046410401057110510000400107187471774719757197271843
50024718805383101100043582817204132720818066071573255068040506101354001010000614160272871504968760071831718676544003656655001040020100007002010000719253511400211091010000300101000001010924515151810638264118975225109351199136436252314711415716214056010651076106410000400107183671896718087190271817
5002471753539410100004728521720612071935774207177125507104051410146400101000061513527325080496879007188471737655000365469500104002010000700201000071869351140021109101000030010100000101093741585301064326811898542310939127612343425231571161671884405721248985105210000400107179472065720277201171962
500247202453841010000432805169651447192781623717992550685405101014440010100006147702732056049688540719667180565339036570850010400201000070020100007200135114002110910100003001010000010109124154497106582781090378251091012411125436252312711016716204050810041238111510000400107207871942718697202071940
50024719145384101200043182317925108718697855371789255069540534101424001010000613618272462114968979071959719716535003654385001040020100007002010000719513511400211091010000300101000001010904515951310645274579007430109351355125405252317711114716864059210911067106410000400107195071935719997181071877
50024720175384101000049287817046108720307971371633255071540562101294001010000614591272096214968701072031717226550703656215001040020100007002010000719713511400211091010000300101000011010899415751210651279149155039109441405124309252315711316716514054410441095109310000400107197671983720047186471949
5002471844539310100004028091688411271788800537170825507104050210147400101000061312827304620496887707184072027655390365588500104002010000700201000072097351140021109101000030010100000101096431614851064526718899463510922120912633625231685141371712405289331144101910000400107199772162717727199571981

Test 3: throughput

Count: 8

Code:

  ldrsh x0, [x6, #8]!
  ldrsh x0, [x7, #8]!
  ldrsh x0, [x8, #8]!
  ldrsh x0, [x9, #8]!
  ldrsh x0, [x10, #8]!
  ldrsh x0, [x11, #8]!
  ldrsh x0, [x12, #8]!
  ldrsh x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3671

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602092990922110100619980917041041362930178932717151639219725160157801658000080100800004007361308508061492636529611292489233394891601008020080000802008000029305351180201100993910080000100800001008086719384548708537259113875484706859658321224810542136005110117112928131800555776159180000801002939529501295392963329242
160204293232212200069288191696112112294588024141894174122352516017080163800008010080000400794130013106249260952939829280923939229160100802008000080200800002939435118020110099371008000010080000100809153739354530854716201290876504386037792133508051603715851101171129190358005957758310180000801002958229533293852942329226
1602042952022022000730778916881151322935978743417031796211625160162801568000080100800004007341306235067492629129156294209364394401601008020080000802008000029343351180201100994010080000100800001008091236381543708517064868867251518608178413243805302375651101171129196418005255557110480000801002941129538295202938229365
1602042937222021000663480317121101082949880538817591887214625160156801538000080100800004008071322923153492632729465295069165390781601008020080000802008000029292351180201100996110080000100800001008093940394531608498561798964852798556973813352065205377351101161129409368005859459410080000801002935229503292532937429409
16020429382220202007288799167212812829208778339163717102031251601448015580000801008000040079512965760574926210293792930992973954116010080200800008020080000294233511802011009955100800001008000010080954383755795085555666108847252038608076412949705288370351101161129397358005353853510880000801002933229504294942951129244
16020429426219220006932807168014013229389778334163618772046251601548015480000801008000040076513167090484926385294502926892703942816010080200800008020080000294203511802011009933100800001008000010080941383575438085350677119026651598573376613150505270360351101161129255318006663257010780000801002942529234293712929029485
16020429324220202006361782169612113629680763306159418142170251601578016480000801008000040077213048221494926365293842940194523931216010080200800008020080000294913511802011009952100800001008000010080953373925422085207639109117854938553175913249274931333351101161129487288006057858710180000801002937629598295352930029526
1602042931122020000638677516801191122938181042417601625225625160159801688000080100800004007821301916075492638829199292899238391181601008020080000802008000029486351180201100995710080000100800001008088937445543908511168110910824906863167341395109596234035110116112932331800695885148480000801002964629588294092954329357
1602042926122020200723077816961371242933077034816801749199425160176801588000080100800004007441302209057492638529309293779412393381601008020080000802008000029273351180201100995710080000100800001008093937375560908569459498774843648571075214049934351373351101161129480438005556461410380000801002935029410295432949629158
16020429531220200006796790167210917229338781373217417002076251601608015580000801008000040072012862640604926378293932933893743965016010080200800008020080000295953511802011009956100800001008000010080950364195434084848576109244847658568480514145435077370351101161129469378005659256310480000801002903729448292272938529508

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3666

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002929901219100718080416321021042933076634619331871208025160091800608000080010800004003211296618116649263582937229406900103938316001080020800008002080000294903511800211093010800001080000010809282737655728517467913908324941858118501194928497213140502000716004229427238006855060012480000800102928429487292982934429392
1600242909221910169858271744104140292688023172003177718822516008580074800008001080000400291128893600524926338293252936692730392351600108002080000800208000029166351180021109281080000108000001080877845959288544359811830625245859768001274933502538030502000316002429443188005857651512380000800102917629068295582946929364
16002429281220200652782416641081042944280234022132018210125160079800768000080010800004003071303360005449262682938729669920703919316001080020800008002080000292913511800211092710800001080000010809123532755128526470711935385131857297401275455576618000502000416004229262258006353552511380000800102921629421293272932329338
16002429349221102684482316729215229392790329200819692012251600708007980000800108000040032013022690069492641229066295099355039485160010800208000080020800002929335118002110920108000010800000108089920370508085194626785730508385948660112443754701863050200021600242945518800515504959480000800102935129427293302916629253
16002429221220101684076717281091482923576034117401885214125160075800808000080010800004003571294541015749261142929829458953103940216001080020800008002080000295523511800211093010800001080000010808800386625285286651138744651158557075112949685317002050200021600422939942800785975219580000800102928029146292892921029392
160024292622181006936843168010196292968083101540166719432516006580065800008001080000400343128949400704926033292412924293140393021600108002080000800208000029269351180021109241080000108000001080929404155786852036471290642478186112817134505249821903050200021600662938719800754985618980000800102931729539294852937329348
1600242949622010068168011760961282939175835217391952187725160077800788000080010800004003421295965006149262532933929290930803941716001080020800008002080000292853511800211092310800001080000010809051740651428521066114896325692859087061385046526537180502001616003629234238006453251410880000800102923829397292522936229321
16002429345219201636478416881001562902879131615721853191225160078800708000080010800004003471292156016449264652945029500924603952516001080020800008002080000294213511800211092110800001080000010809133733359848528064613919685390860997381375272505916000502000416004429322168025051158310680000800102939929381293092923529216
1600242918621911070088191728921402921580635216431824203325160073800788000080010800004003411288860006049260752930429269904103928716001080020800008002080000293593511800211091910800001080000010809041742159388511666014891114537985940741127509648813645050200061600242929020800605344758680000800102946329400293532920329390
16002429387221400661383917041112322937779733419202175198925160079800538000080010800004003441290087007549263072917329229922903930416001080020800008002080000292713511800211091810800001080000110808941836853678597765614900484879860307021434929513118030502000316006529485158006855952511880000800102936729355293922938829334