Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (pre-index)

Test 1: uops

Code:

  ldrh w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2005104080011721914010250494162520001000100010001000528444582401040104069937732000100010001000100010404411100110001000010280039102810101026102535314553073216111020100026191000100010411041104110411041
2004104080000410124102511591114252000100010001000100052844458240104010406993773200010001000100010001040441110011000100001012006310270001027103524318310073116111038100026191000100010411041104110411041
2004104080000431614010251151112142520001000100010001000528444582411040104069937732000100010001000100010404411100110001000010240063103510102020103333214310073116111037100024151000100010411041104110411041
200410408000052201320102511421053252000100010001000100052840458241104010406993773200010001000100010001040441110011000100001000004710231300016101628222630073116111037100026211000100010411041104110411041
20041040800004700401025061081525200010001000100010005285245824010401040699377320001000100010001000104044111001100010000101301661040600016103024312390073116111037100031231000100010411041104110411041
2004104080000581303121025175108222520001000100010001000528324582401040104069937732000100010001000100010534411100110001000010160143102714013626102639317390073116111037100026191000100010411041104110411041
2004104070000511414010251151181825200010001000100010005283645824010401040699377320001000100010001000104044111001100010000102801621038151151021103629126710073116111038100024191000100010411041104110411041
20041040700005313120102597138142520001000100010001000528444582411040104069937732000100010001000100010404411100110001000010130166103010282018102423218710073116111037100021241000100010411041104110411041
2004104070000411612010257659132520001000100010001000528444582301040104069937732000100010001000100010404411100110001000010110150103515360018102138118550073116111037100026191000100010411041104110411041
2004104080100451314010251538817252000100010001000100052852458240104010406993773200010001000100010001040441110011000100001023005510241200017101839320310073116111037100021191000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1944

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020972005540110000004740831172811327192578342718142550750406321013240100100006156732734809496891271832721086531536570250100402001000070200100007199535114020110099100100003010010000010010903114951910646273892252191088912331280030261035612717774056011091078115310000401007197671867719067208771961
50204721215381000000043808201712110471902798327164525508554066010125401001000061505827289314968934720237184765387365657501004020010000702001000071872351140201100991001000030100100000100109281155512106592641489250201094113941301130261026422717924060811951139113610000401007180572154718487195871978
502047199753810000000466077216722128719358154371543255079040632101334010010000615885272358349688707196771862655683655755010040200100007020010000719813511402011009910010000301001000001001092121525181068928989635826109621305134104026102582271774405561112975106710000401007199371914719757205472020
50204721415391000100047708021696110072062855327196225507754067210127401001000061578627260124968844718557202165268365681501004020010000702001000071891351140201100991001000030100100000100109151147510106322558925802910882129413010100261025822717494053211301120108810000401007210971964717997203172267
5020471912539110000004110844166411487189078932716572550745406361014240100100006151932733410496887571862719646547636569050100402001000070200100007200635114020110099100100003010010000010010916113251510659262991676221093013451381390261026322717074056411311140116810000401007207772117718517195771990
50204718885381000010044008241720212471958795337170325507404065210139401001000061514827282894968750718457220965295365570501004020010000702001000071913351140201100991001000030100100000100109271152486106412651493982311090513441291330261025122720144057211051055105110000401007216771879719597201772041
50204719245381000100045608311712110071646820327165625507704061610131401001000061472527285304968960717357174965515365796501004020010000702001000072154351140201100991001000030100100000100109281135479106502761291990191094212151341340261025622719534053610291042114610000401007194371780720727199772127
50204719815391000000041908121720210871809796327174425507504065610140401001000061328127206654968837718977195765394365650501004020010000702001000072051351140201100991001000030100100000100109231146497106482631391050221091513741341350261025622717914054811301134109810000401007202971985718387199171808
5020471935540100001004500785170411327186580632717782550785406161013140100100006138112724088496891271907720726541836565850100402001000070200100007191635114020110099100100003010010000010010921113361310693282894250251094913231341030261025622718484057210801097107710000401007204371827720927197772116
50204718195391000000041507891704210071744797337156025507704063610120401001000061401927289694968757719907185565493365565501004020010000702001000071949351140201100991001000030100100000100109301140492106242601391632171091513041211040261025622719624055611491074107010000401007201371934718997207372006

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1901

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029720015381000000427081810720014071871827715602550675405461013640010100006131792726585049689807195672004654753657235001040020100007002010000718283511400211091010000300101000001010894015747310643274792836271091412201420392252016851313718704048010821037111410000400107208472078720537211671947
5002471919539000000047708421070401167194479571608255064040530101204001010000611746272114004968714718847175365343365498500104002010000700201000071860351140021109101000030010100000101090001545021061124816918722110899137012703302520585514716884050410511095100110000400107208172300719547192872059
5002471965538100000144008251066401087188179271547255065040478101324001010048613583272542804968785719577194165219365585500104002010000700201000071762351140021109101000030010100000101091201314861064027016883682710931115012200402520571131271658404881144944114810000400107197072018719247193671808
5002471901539000000046308171072001007183079371715255065040534101274001010000611754272269014968725718437198165303365818500104002010000700201000071871351140021109101000030010100000101090911694811064526698887427109211140122105025201085135718884054010611030101810000400107185571910718277174771878
5002471937540100000049208131072011287178381871559255062540486101244001010000612593272645804968732719317187465345365293500104002010000703121000072055351140021109101000030010100000101090901624761062123713897502110918125011610702520485414717284054010601056110410000400107200071976720157201671974
500247195653900000004310835107360124718648297151125506154052210145400101000061405127234110496883771870718616544336552150010400201000070020100007181435114002110910100003001010000010108940131508106302731088310027109141270122003025201485144718644050011051110105410000400107197472016717967181571841
50024718415380000000416081710768012871966795714852550725405501012540010100006135372726294049686977194572065652833655845001040020100007002010000718243511400211091010000300101000001010917014848210621282118704812109171241136007025201485144716994050410271090106510000400107187371654721797192571899
50024719105390000000428080510696012871894811715662550696404981011640010100006135262721327049686387194871822652723656595001040020100007002010000718393511400211091010000300101000001010874013851310605255169077628109211291124032025201371134718344051610771097110910000400107188072073719567191772129
50024720045370000000455083010720012871751785716492550675405381012740010100006130152726048049689837177071971653683655325001040020100007002010000717433511400211091010000300101000001010875015551910648265128824831109151231118107025201471135715944052010641049104910000400107194571874720417189071800
5002471880537000000049508161072809671788773715742550605404781011540010100006121152723257149688617191471743654043657065001040020100007002010000718743511400211091010000300101000001010896015544810627285139184821109111321117034025201271313715574049210261136103610000400107193671762718357188571996

Test 3: throughput

Count: 8

Code:

  ldrh w0, [x6, #8]!
  ldrh w0, [x7, #8]!
  ldrh w0, [x8, #8]!
  ldrh w0, [x9, #8]!
  ldrh w0, [x10, #8]!
  ldrh w0, [x11, #8]!
  ldrh w0, [x12, #8]!
  ldrh w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3658

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020929719219101100613779217128996292867842931650174619332516016080161800008010080000400753129136206949262302941929249940139140160100802008000080200800002925035118020110099201008000010080000100809671938946178529765813893244959864626861305253500719005110117112943635800595104249280000801002941229332293832947329328
16020429160218100000646676917041051122921277631218441430197625160151801678000080100800004007951282860058492639929291293329133392251601008020080000802008000029498351180201100992310080000100800001008090219408504985498627119372649878636277213850404778193105110117212951327800535726159680000801002913729215295282927929479
160204292782201000006626844170490144290347703062017161219902516015180159800008010080000400744127941215749262152911229305907739191160100802008000080200800002931335118020110099411008000010080000100808851941653168552562313868344935856087631154831494119335110116112966523800656354929280000801002923329313294282934729302
16020429120220100000654879227441011002934180728419141839207125160157801618000080100800004007391298159050492628229262291749358392321601008020080000802008000029293351180201100992610080000100800001008089620350490485309627118764449198574172610848524778190451101171129707308006150151110880000801002939029305292522936329376
160204292562201000006672816171284108291217513221697182118692516015580155800008010080000400745129399106649262772923229267910739273160100802008000080200800002912935118020110099211008000010080000100808872032750418479167913934484935862556701204976518818045110116112974931800645934819380000801002941229130295132943929308
160204293172181000006653756165698128293237472911676172718992516016580151800008010080000400719129917815349262232939429336909839288160100802008000080200800002923735118020110099221008000010080000100808532134554528539763711845464691850317571294746468419617751101161129726248006353044510080000801002922529272293132943129289
16020429257219100100695380016561071002926074731017021767213225160171801618000080100800004007681290091068492641129408293779232392991601008020080000802008000029438351180201100992710080000100800001008088820406539784943635119143648208583966512145975406190351101161129325198004043842310480000801002936129435294732943829381
16020429387220101000688778616881081002932076532618861732199925160158801498000080100800004007891299790073492585329105291389098394601601008020080000802008000029290351180201100992710080000100800001008090519389584285317621128643248788609666410344834745192351101161129481268005856449610480000801002914729578293652921529318
1602042905522010010068368031712100922930374428719251959199425160149801528000080100800004007881299019060492615229179291829218391941601008020080000802008000029370351180201100991910080000100800001008090119328533584781648118932047558592968211647804561190115110116112958928800495005029680000801002912329229293302919429200
1602042915722010000064238281672931282922079630419831696199625160143801648000080100800004008241294447054492611129547295889326239390160100802008000080200800002904535118020110099221008000010080000100808822039357218509261110854384862858667261174707514519375110116112945229800584945159580000801002931829256295152920229337

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3667

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002929777222200006391075417041241002914578736420842038189825160070800738000080010800004003411291664074492635929181291279385039520160010800208000080020800002941435118002110943108000010800000108087251381604685273600148403452788614579211552754888516305020416332916130800625824798680000800102933429387293952962429229
160024293982212000064870785168011210029406790325182819282068251600788007180000800108000040034912917390614926247294692947890760391591600108002080000800208000029179351180021109251080000108000001080938493845417851536491289746513786127746130542650265006050202162329150338007359655710780000800102946929405292502939329380
16002429269221310006676077117521051002922077332216122032204725160081800658000080010800004003341300567173492612129429293589237039468160010800208000080020800002931335118002110924108000010800001108101069413569284909637138671044365860417521435203572874103050204153329415378006251245110780000800102955729142291442921729336
160024293342214011071130774167211910829299788318190618222205571600808007780000800108000040028212986520534926200293102941292590393111600108002080000800208000029432351180021109301080000108000001080973673515980853886971385840522886743763131545955156645050202153229359298005457859611880000800102947329398296152933429277
16002429184222400407196082017201041482918977935518281923197125160078800758000080010800004002861301129161492632429165293689412039251160010800208000080020800002915035118002110929108000010800000108093670428574985807630158665046838602068511351854901663405020216232937818800785274638980000800102933029479292692928529508
160024295382184001074300781168010713229383793306187320092001251600788007980000800108000040033313105960664926305293562908993310393541600108002080000800208000029455351180021109341080000108000001080952733585141855745901288572538485829835142538055986466050203164329517218006658156811480000800102929929268293252943129362
16002429358222400007336076817049712029282778332198219712176251600838007680000800108000040036012968751774926314294782935795130394601600108002080000800208000029193351180021109331080000108000001080957703535938852616731087472533085906724130519753367306050203153329455198006652750810680000800102922429344295682942029207
160024291412204010069260732171211210429363782287200019181995251600698008880000800108000040030312995041514926048292842937790800395281600108002080000800208000029247351180021109221080000108000001080977723515243859906831487042536486304702133555450147201050203163329426268006253054311580000800102931329427293662923829237
16002429250220400007064078717121161362950377233119181897191325160080800828000080010800004003141298663070492615829311294019306039377160010800208000080020800002930335118002110946108000010800000108091251377519985055648149125255858633767412954965009501605020316332944235800585105029680000800102932629363294492945929249
160024293342183000066530787167210413229440752337177220382231251600728006580000800108000040037112961111764926124292252931492520393021600108002080000800208000029206351180021109261080000108000001080913553665211853026631785436528185752776130493848375060150203163229654268007056553010580000800102945929133291482933729380