Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (pre-index, D)

Test 1: uops

Code:

  stp d0, d1, [x6, #0x10]!
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
9006116681116100101101151820253000100010001000100010001000599910875800011145116611663253000100010002000200011661166118001100010001044043127100430142610101953951016116310001110001000100011671167116711671167
9004116690000112210110115120002530001000100010001000100010005999108758000011451166116632530001000100020002000116611661180011000100010140361141005201688101903551016116310000010001000100011671167116711671167
90041166800008161050115100025300010001000100010001000100059991087580000114511661166325300010001000200020001166116611800110001000100003523110030022163101604751016116310000010001000100011671167116711671167
900411668001033010441151161602530001000100010001000100010005999108758000011451166116632530001000100020002000116611661180011000100010160352181004101880102044351016116310000010001000100011671167116711671167
9004116690000616101401151432253000100010001000100010001000599910875800001145116611663253000100010002000200011661166118001100010001014046232100510221011102952751016116310002010001000100011671167116711671167
90041166900012103210100115165025300010001000100010001000100059991087580000114511661166325300010001000200020001166116611800110001000101604431610050016167103643551016116310000010001000100011671167116711671167
900411669000671610901151120325300010001000100010001000100059991087580000114511661166325300010001000200020001166116611800110001000101804032910051022108103253951016116310000010001000100011671167116711671167
900411669000650101141151009253000100010001000100010001000599910875800001145116611663253000100010002000200011661166118001100010001020039616100502161814102253551016116310000010001000100011671167116711671167
9004116680106734100011510062530001000100010001000100010005999108758000011451166116632530001000100020002000116611661180011000100010360395161005002080103403551016116310000010001000100011671167116711671167
900411669000093610151211518212530001000100010001000100010005999108758000011451166116632530001000100020002000116611661180011000100010000352121003101203100003151016116310000010001000100011671167116711671167

Test 2: Latency 3->3

Code:

  stp d0, d1, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2022410040765005001039880221812216134142212100252165227235362530275101001012010000101001000010000543907468848801900010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012481651561163031700108531522024815044011248026898007100011611100371000033211000010000101001004110041100411004110041
2020410040755050001032359220412232144160212100252165249212512530235101001010610000101001000010000543881468848802630010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012489301650164601718108281519424815044021248234782007100011611100371000038011000010000101001004110041100411004110041
202041004075404400104765322111222487130212100252172268234322530232101001007910000101001000010000543561468848806830010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012481381638168501695108351526824975044861249530817087100011611100371000032531000010000101001004110041100411004110041
2020410040754004001049776220412216701501960100252179262218472530192101001009110000101001000010000543923468848806401010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012485301481164301668108321510024815044051249838812047310011611100371000034031000010000101001004110041100411004110041
20204100407544400010335101219712216138130212100252151200226292530174101001010210000101001011510000543379473273803400010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012497351346163201712108181507024815044001247935758707100011611100371000037341000010000101001004110041100411004110041
202041004075404000102877322181223214619021210025216526921934253026710100101481000010100100001000054396646884880248001002210040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001248927151316300170610849152702505504409124682488001271000116111003710000325151000010000101001004110041100411004110041
2020410040756660001047372221812224127130212100252179208234412530204101001011610000101001000010000543904468848803891010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012497491544163401688108291525624735044391247028748007100011611100371000024031000010000101001004110041100411004110041
2020410040765050001047380221812240771602121002521792302313525301811010010160100001010010000100005439744688488024700100221004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100125023215601642017031082514990250550444212489317390107100011611100371000042751000010000101001004110041100411004110041
2020410040754400001037472220412216106180212100252165254219262530215101001010210000101001000010000543930468848802800010022100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012472351467165301683108301511424895044221248226773007100011611100371000027611000010000101001004110041100411004110041
202041004075440400104106522111222474210212100252158227224492530231101001009410000101001000010000543951468848802581010022100401004074243749830100200100001000020020238200001004010040111020110099100100100001000010012501271562166501683108411494024735043561248032729047100011611100371000027061000010000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2004410040751024510722632166423306521002522630417390512530445100101032310000100101000010000543289468848800811010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124940196615600160110929153424581004404125385711266400031633100371000036441000010000100101004110041100411004110041
2002410040751010796224821728281408201002522420402422492530430100101063910000100101000010000543377468848815391010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124940210515700160410906151424701004473125185411236400031622100371000042491000010000100101004110041100411004110041
20024100407510266115225321648428306441002521993395422492530439100101032610000100101000010000543345468848819151010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124500195415530160810957154024781004427125065512076400031633100371000024081000010000100101004110041100411004110041
20024100407510320102224621720279406881002522360380415762530656100101024910000100101000010000543321468848816000010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124620206915480161510926150924701004434125155512236400031623100371000035071000010000100101004110041100411004110041
2002410040751012294228421696542206881002522360396356462530238100101002910000100101000010000543337468848808781010022100401004074463752030010201000010000202000020000100401004021100211091010100001000010124700203515660154810953151524781004404125176311666400021623100371000024721000010000100101004110041100411004110041
20024100407510140121227521544542308201002522320458390392530294100101065610000100101000010000543337468848801081010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124580204515840161310942152924501004405125256311716400021633100371000022061000010000100101004110041100411004110041
2002410040759930107226021704174106561002522080428366602530256100101034910000100101000010000543225468848810361010022100401004074463752130010201000010000202000020000100401004011100211091010100001000010124540204815860159010931151024701004403125295611936400031622100371000026531000010000100101004110041100411004110041
20024100407510044114223921680212507441002522350336373642530281100101016310000100101000010000543369468848807161010022100401004074463752130010201000010000202000020000100401004011100211091010100001000010124740200315650158510939150824581004507125265411376400031633100371000041241000010000100101004110041100411004110041
2002410040751030585225221720215306361002522490413372622530332100101054010000100101000010000543313468848805731010022100401004074463752030010201000010000202000020240100401004011100211091010100001000010124540193716050157410935149724701004383125305611296400021633100371000039161000010000100101004110041100411004110041
200241004075102099222912168823007001002522280443415562530274100101054510000100101000010000543193468848811871010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124660203716100162310921152024661004385125294712056400031623100371000028531000010000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  stp d0, d1, [x6, #0x10]!
  stp d0, d1, [x7, #0x10]!
  stp d0, d1, [x8, #0x10]!
  stp d0, d1, [x9, #0x10]!
  stp d0, d1, [x10, #0x10]!
  stp d0, d1, [x11, #0x10]!
  stp d0, d1, [x12, #0x10]!
  stp d0, d1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5029

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16022440264301330000101881312257101688864150228401732230700904742524320380102825848000080100800008000040053118508006443110240175403224022320239320222240100200800008000020016000016000040233403591180201100991001008000080000100825005023102376182442801161502024682544793825601331984000051101161140276800028000080000801004023840158402054025740317
160204403273026060001012817122391019761817110516401912237713643852524301680102838418000080100800008000040053118497926487071240195402344024020177320150240100200800008013620016000016000040203401921180201100991001008000080000100824843022752417724238011415221024922544718825581422855000051101161140194800028000080000801004023040144402314021040248
16020440252301505000101071692225101936201915026440223217793268771252427448010282804800008010080000800004005311852480648850124029840287402312017832017724010020080000800002001600001600004022240213118020110099100100800008000010082500352734237152433801771511524762544754825399225960100051101161140292800028000080000801004023940211402454021840236
16020440194302505000100681552246101960349315026440203221265668165252427358010283610800008010080000800004005311852384651655124026040317401962014032019924010020080000800002001600001600004020140269118020110099100100800008000010082468141472238013244780123151302476510473382561931680000051101161140251800028000080000801004019340205402164019040265
16020440177302300000984011222711017042628120284401672244615718632524355280102811878000080100800008000040053118496486487021240224402174024720154320142240100200800008000020016000016000040261401991180201100991001008000080000100825022014542424102448801141472024725104698825581372017030051101161140291800028000080000801004025340207402374027240335
16020440207301444000100952012250101704309613026440225221191575383252431408010283003800008010080000800004005311847680649089024022340236402022018732019924010020080000800002001600001600004018940309118020110099100100800008000010082476241767238122480801381502024723584684825379019821300051101161140268800028000080000801004024240282401504026840170
16020440294302404400100321432271101704823150264402432237559574702524390980102811948000080100800008000040053118520006438401240213402894022720182320259240100200800008000020016000016000040217402011180201100991001008000080000100825042220052369152467801681508024845104812825561052649000051101161140190800028000080000801004016940200403224020440325
160204403103024044001002617922781017127821302764021022376836927925241528801028359680000801008000080000400531184724865026602401874027440267202133201832401002008000080000200160000160000402614025511802011009910010080000800001008250422225424204249680105150902480510465782553821502000051101171140229800028000080000801004019640234403134019240238
1602044022930144440010107163226410167214401702644025022168008343925243259801028147780000801008000080000400531185106464865812403014024840371201523202132401002008000080000200160000160000402294023811802011009910010080000800001008247121232124233246180112150402472486466882530972100030051101161140229800028000080000801004021440264403014026540217
16020440263302400400101191482271101672127411027640274223098793676252417318010283589800008010080000800004005311845400645810024027840179403222022132024724010020080000800002001600001600004021040223118020110099100100800008000010082488302263241212247480136151642484510477682557952014000051101161140197800028000080000801004015840298402104022140235

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5029

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600444018630102200001033240223930196011542644014922056599204225243619800128022980000800108000080000400081184309664942912401130401884021720116032012324001020800008000020160000160000401374015911800211091010800008000010825032012242436182450800531522324922544631824864215911400502014169740174800028000080000800104020040144401674017040202
1600244016230103003001032071221810194451482644014121986514285625243505800128347080000800108000080000400081184398464586512401220401924015820113032015724001020800008000020160000160000401734021511800211091010800008000010824742017662426924658005815260247625446548251338209914305020916101140150800028000080000800104016940144401474018140143
16002440150301033030099698322171014161534726440157218449465273252427028001282523800008001080000800004000811844152645495124011804017440152201500320182240010208000080000201600001600004016840178118002110910108000080000108247420196924293245880044150532484254463682510531612140150209167940147800028000080000800104018040166401474016840122
160024401623010200000102485822721017045625292401462223677552502524053580012818398000080010800008000040008118451126467551240171040139401262006203201592400102080000800002016000016000040171401331180021109101080000800001082486161717246422461800461509024925084539825294213991400502091691440177800028000080000800104019540185401594017240131
160024403193010200000100953822611017041479946040190223859647250252406698001283368800008001080130800004000811844272643830124009404014740138200870320156240010208000080000201600001600004015840208218002110910108000080000108248216206724311124708005615060248450845968250349162014005020131612940183800028000080000800104015140118401784017340188
160024401623000200000100297623051016801516480401082240541451812524290680012836098000080010800008000040008118469366477461240118040189401702013603201082400102080000800002016000016000040160401451180021109101080000800001082490161689244172466800441527224725104626824973917531400502010169740159800028000080000800104015140188401334019040171
16002440191300022000010179502274101688359154724011222323585365225243201800128327180000800108000080000400081184741664867812401100401614016020094032012124001020800008000020160000160000401344023311800211091010800008000010824741617072431162463800581506024645104553825064516061400502091691240176800028000080000800104012540147401214014340202
16002440173300022000010077622273101616188642040158223346954846252430918001283127800008001080000800004000811843240642193124017104014340165201250320153240010208000080000201600001600004013740144118002110910108000080000108250218221624533247280070153022480508454082490481327142050209169740162800028000080000800104014940144401664019440125
160024401343010220200100175923041016568407292401172232603660592524376180012848888000080010800008000040008118451366499131240135040184401342008103201242400102080000800002016000016000040137401451180021109101080000800001082497182297245117246580085151402479510464882519491849144050208169740350800028000080000800104012640158401804015140156
1600244017430102200009942622287101496294952484012122524465547025240281800128008580000800108000080000400081184499264100112401200401414014020100032012024001020800008000020160000160000401254013511800211091010800008000010825101615402421624698005615170249651045698249244153314205020816111240125800028000080000800104016240158401494018340175