Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 23 | 24 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20051040290000551800040102595571125200010001000100010005284845824104010406993773200010001000100010001040441110011000100001013007110476020262910574722531073116111037100035311000100010411041104110411041
200410404100005720100341025105139142520001000100010001000528444582410401040699377320001000100010001000104044111001100010000101201461039602803210593233239073116111037100031281000100010441041104410411041
2004104090000590000541025115998252000100010001000100052860458241040104069937732000100010001000100010404411100110001000010130051102100001510152831563073116111037100026191000100010411041104110411041
20041040900006414100301025113991425200010001000100010005285245824104010406993773200010001000100010001040441110011000100001014003310334112122110263611063073116111037100028191000100010411041104110411041
20041040900014311000201025936410252000100010001000100052840458241040104069937732000100010001000100010404411100110001000010120039102010208171041332947373116111037100023181000100010411041104110411041
200410408000060141003201025105651325200010001000100010005285645824104010406993773200010001000100010001040441110011000100001016003310206012101110142821955073116111037100028191000100010411041104110411041
200410408100053000070102504811122520001000100010001000528484582410401040699377320001000100010001000104044111001100010000101061311035101412910313432255073116111037100021271000100010411041104110411041
2004104090000621200040102505582025200010001000100010005284845824104010406993773200010001000100010001040441110011000100001000003110333012141510323921147073116111037100024191000100010411041104110411041
20041040800004022100201025106810172520001000100010001000528484582410401040699377320001000100010001000104044111001100010000101000321030501201710332451639073116111037100026261000100010411041104110411041
200410408100044150013010259566142520001000100010001000528444582410401040699377320001000100010001000104044111001100010002101801571023211261310342431063573116111037100036221000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1695

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502097206753800000054108091616376716128002071591255076540624101324010610007676374271722504968799717667181665277765605501134023210008702561000871586351140201100991001000030100100000100108760151507106492421089612037109261235119003111262011611717944055691494495910000401007163771754718497186771885
5020471882537000000512078816963100719387513071273255082540644101224010610007675638271804019868599717827170565126365483501004020010000702001000071788351140201100991001000030100100000100108980154461106162801090670431087111631250274000261125122715334053691892888910000401007168071655715567180671801
502047180453800000054608371656484716937823071360255076540660101244025210000676270271972214968601715197181065001365413501004020010000702001000071654351140201100991001000030100100000100108660145498106652451188812042109491243112003000261126322714954053685090081710000401007168271851715977175471715
50204716475380000006220823170431247159681730714432550745406641011940100100006769762709808049686427181371715652833654385010040200100007020010000716743511402011009910010000301001000001001087601595281063323198873638108981224135005000261125122714504054083090693710000401007168571780717627196471628
50204717975380001005080808168011007158377640713892550745406521013640100100006760162711540049688767189271724652413654705010040200100007053610000717693511402011009910010000301001000001001089001724801062222112894138431093212841280363000261125122716734059595899693410000401007168171773716957148171800
50204717245370001005430795170431087164979050715332550730406161012140100100006754032712744049687017176671590650003654175010040200100007020010000718293511402011009910010000301001000001001088001464921065824898987839109171343117004000261125122715044055295884492410000401007175271667717657172871663
502047168053700100051508291664210471582785407167025507304062010126401001000067554527146741496860371771717886510036539950100402001000070200100007166035114020110099100100003010010000010010890011547510661234108723637108901245117003000261125122716404051290082483210000401007189071627716677181771799
50204716635380000005330803168031567173680240715782550805406641012440100100006777372715731149685987179471726650733652545010040399100007020010000717203511402011009910010000301001000001001089801255161064825789168432109331324117004000261125122716954057288885894510000401007181071797716797188571675
502047168053700000055508041688056718107796071592255074540584101334010010000678086271341504968485717227170365078365459501004020010000702001000071692351140201100991001000030100100000100108780136516106512511489013646108811244134007000261125022715804052099892694410000401007172971864717567183971651
50204716845370000005500802168021127169977250715392550710406041012940100100006771852709316049686437170971950651503653745010040200100007020010000717753511402011009910010000301001000001001089801335021066525813886361851109261244114004000265327922715824051699495683910000401007180371848716497165971689

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1835

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002972232538212010540085117361104717278374371491255068540602101214001010000612930272953014968601720487180865341365571500104002010000700201000071741351140021109101000030010100000101092911484901061326110922483810939142514113902520485437168040552952970100010000400107195871884717937178671942
50024719985381100005020806170411087195479553713662550710405261012340010100006122812725762049688027190971849652663652365001040020100007002010000718763511400211091010000300101000001010924314844510674291109187427109561265131109025204173371612404801008958101210000400107191271895716287182371899
500247174153810100051908301720113271861796327156325506754056210135401721000061366727179530496883071948717056534436561150010400201000070020100007186035114002110910100003001010000010109161131502106522549932383910933144512314902520485447165340552876988100010000400107187371845719007184871856
500247191053810000054808211712113271919821427152525507754051410133400101000061014327221680496883171982719486532336541150010400201000070020100007176535114002110910100003001010000010109132145493106622761192634321094913061271050252047143715684056098897090610000400107176371984718717192971811
5002471648538110000553084417441108718498043271623255071540542101374001010000612479272360614968622719587186665459365551500104002010000700201000071740351140021109101000030010100000101090311545081067426610952782610952144612114302520471547159440576944946102010000400107184671969719297189171723
5002471765539100000600083117841112718398073271576255068040562101294001010000611334271888904968684717077191665052365578500104002010000700201000071756351140021109101000030010100000101089921484871066325112931463910939123612717902520485347165740576866966102210000400107197571608718797175971689
50024718715381101005280822171211087177778942716082550690405341013840010100006125962726606049688047178171770653643654525001040020100007002010000720653511400211091010000300101000001010900114650810626250889350381096413461401030252037154716294048499299297210000400107194771822719007164871779
500247176853810000054608321712113271853805437146425507504056210139400101000061230827132780496867071770718486541236541350010400201000070020100007169935114002110910100003001010000010109253134498106372869916783410926147413010302520485437149740580920103488010000400107171571930719687173471665
5002471775538111000543081217041108718238253271674255069540530101354001010000612425271771104968683719047182465312365321500104002010000700201000071780351140021109101000030010100000101090821235011064927712924484110929141514113302520457447168740584934103092410000400107179771732718377186071793
5002471922538100000500083417281124718458185271543255068540594101444001010000610011271898404968793720707190565321365545500104002010000700201000071789351140021109101000030010100000101094521355061066525713930743510935139413610302520571447184540580964100695410000400107198971977717867184971878

Test 3: throughput

Count: 8

Code:

  ldr w0, [x6, #8]!
  ldr w0, [x7, #8]!
  ldr w0, [x8, #8]!
  ldr w0, [x9, #8]!
  ldr w0, [x10, #8]!
  ldr w0, [x11, #8]!
  ldr w0, [x12, #8]!
  ldr w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3661

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602092943022000006557843170410710029359811447189223492083251601678016380000801008000040078912867140534926117292212927990893915716010080200800008020080000293333511802011009922100800001008000010080925446529885166755119254248448585771511949445588045110116112946832800574304687880000801002944029270293632939929234
1602042916422000006663811176811714429148814479190820922080561604608016280000801008000040076412750050604926106294432927293663930816010080200800008020080000293123511802011009914100800001008000010080914464522585015801890952520785787805136493946790195110116112921929800744075366680000801002918829326293282937529318
16020429218220000058078331728120124294038104852063218519882516015880152800008010080000400766128971015749262602947629234928739400160100802008000080200800002937535118020110099910080000100800001008092547755008492175698808448168590581312949585284345110117112914131800464274237180000801002925429117293242930329097
1602042944821800006794854172813518428965827441204720062078251601468034380000801008000040084512912411574925871292462921493453929116010080200800008020080000291143511802011009915100800001008000010080938439507484637756139647852718560176413850375216075110116112913849800584353816880000801002934829318293622919629282
1602042942022000006735880174412210429064797462222121892025251601558015380000801008000040077412916861464926230296942922392163924316010080200800008020080000292083511802011009918100800001008000010080906492513885363775119794244178573579412644915144035110117112921628800533593978180000801002917129305293302916529339
16020429270218000066998461784112152292957904651905204620902516017080154800008010080000400766128786405149262652906829140919125920916010080200800008020080000293653511802011009921100800001008000010080924422489685440779109765648898569779313449375062035110116112949134800743894319180000801002932629321293572937429312
160204292582200000665580317361321082918681848521392014181525160147801588000080100800004007641285705064492622629296293729392391801601008020080000802008000029306351180201100991910080000100800001008092347551138509375399563649928556576812748035257355110116112926734800553794788980000801002936028954292102939629390
160204292462200000652985617441121482914680048618882015189125160159801618000080100800004007541290470070492596929251292049337389861601008020080000802008000029102351180201100991010080000100800001008092147153758455374139763048838551179613250945185085110116112919437800544104626380000801002913429197291642923829225
160204294432200000660080917529212829086815468185018922030251601608015380000801008000040074112940081634926182293072931990963945116010080200800008020080000292483511802011009913100800001008000010080903480563485193777119805647918576975813353414787045110116112940634800343983807480000801002945729675293422938829458
160204294342200101680484416881341202908085045522731920201825160154801608000080100800004007791293925061492604929293292309092391831601008020080000802008000029317351180201100991710080000100800001008092150558948501278879934047588631681313350955474335110116112927337800523644339180000801002924929217291482939529245

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3687

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160029298052202001006912829150412013229542806521173016772326251600718006580000800108000040037213188065549266812932229305963503954616001080020800008002080000296363511800211095510800001080000010809333495420853006891292444551086144776128518458330950204162429560308005959158510480000800102946229514294782927729626
1600242954422110000070468451704126120294597954461819186421321561600778006680000800108000040036413093396249264162944829341960803944116001080020805268002080000294383511800211094010800001080000010809384095858855766911291550501285812774140523255826250203161329509328006050761410280000800102953129637295822943729485
160024294452202000007054825171211914029347794441186019672373251600778008780000800108000040036513083906449263892942529626959703955216001080020800008002080000295123511800211094910800001080000010809403455236854956561490872490886836884127530854210750203163129568358004760961110680000800102930829553294822941729575
160024295292222000006926808174412111229846793449185620432182251600908007680000800108000040037913102736449265982930029611950503942216001080020800008002080000294133511800211095910800001080000010809063885875856786491093048612986561813141543358530350204164329560288007461053512180000800102939229461295272954929703
16002429519221200000683084616081149629676823457189919062275251600738007780000800108000040038513163807049263852944529467946303952616001080020800008002080000296153511800211094010800001080000110809113835317861456371293670526487089798140512952646450206164229597378006763558310180000800102938229498294392947529419
16002429795221100000671083817681141162919180351019131734232025160066800808000080010800004003511310227614926223297202928293450395591600108002080000800208000029406351180021109351080000108000001080911420554584912651129257253568628283313250375429095020316432954732800625616019780000800102951129391296942943729416
16002429253219100000698086317441221082961876746519151783230825160068800728000080010800004003571315037764926339293082961693860396371600108002080000800208000029545351180021109391080000108000001080910407630786035636892274537686489781143556758600850201163529444328007656860310480000800102940029540295692946229617
160024294302201000007170840172812420829478819490170019412180251600818006780000800108000040032013112776349264762942629585931003948016001080020800008002080000294703511800211095010800001080000010809124435604852916441098048550286719710134520456050350205165229714398005655560810580000800102965329357295942955529552
160024294752221100006882905172011212029724805476200418542112251600678007580000800108000040034113151996249266992943829701940003950216001080020800008002080000296883511800211096110800001080000010809493735757852816461192850511085887786136507852946350202162429426308005760557610180000800103002929384295712969029580
160024296732201000007255845170413710029336849471183517452346251600848007980000800108000040033612988726349264232927329552945303957916001080020800008002080000295103511800211094610800001080000110809054405491854016541397340578286247773139531852780050203152329628328007362161212880000800102941329453295252955029642