Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (pre-index, S)

Test 1: uops

Code:

  stp s0, s1, [x6, #0x10]!
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
900611669020530101601151181025300010001000100010001000100059991087580000011451166116632530001000100020002000116611661180011000100010128350010000003100442351016116310000010001000100011671167116711671167
900411669006528101601151141025300010001000100010001000100059991087580000011451166116632430001000100020002000116611661180011000100010260393161004140144102041251016116310000010001000100011671167116711671167
90041166800072410100115141125300010001000100010001000100059991087580000011451166116632430001000100020002000116611661180011000100010220232161004016304103642351016116310000010001000100011671167116711671167
900411668000622101201151181125300010001000100010001000100059991087580000011451166116632430001000100020002000116611661180011000100010000280010040000100441651016116310000010001000100011671167116711671167
90041166800071410110115140025300010001000100010001000100059991087580000111451166116632430001000100020002000116611661180011000100010420244241004024184102641951016116310000010001000100011671167116711671167
90041166800071610211611510002530001000100010001000100010005999108758000011145116611663253000100010002000200011661166118001100010001038028122100401684103342751016116310000010001000100011671167116711671167
9004116690012641012411518402530001000100010001000100010005999108758000011145116611663253000100010002000200012691166118001100010001014031561003138280102643151016116310000010001000100011671167116711671167
900411669000512001420115114002530001000100010001000100010005999108758000001145116611663253000100010002000200011661166118001100010001016027416100311206102804351016116310000010001000100011671167116711671167
900411669000738101301151121025300010001000100010001000100059991087580000111451166116632430001000100020002000116611661180011000100010220231610070804102841251016116310000010001000100012291170116711671167
900411669006528101101151121025300010001000100010001000100059991087580000011451166116632530001000100020002000116611661180011000100010140191241000044284102802751016116310000010001000100011671167116711671167

Test 2: Latency 3->3

Code:

  stp s0, s1, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20224100407522020010470512249114481305194410148221701601763625302411010010023100001010010000100005439124688488041001001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001249516169714740145211020153302493504647125022669700710116111003710000177211000010000101001004110041100411004110041
20204100407630330010260722306117281416093210025230602262213525302311010010123100001010010000100005439794688488023701001710040100407424374983010020210000100002002000020000100401004011102011009910010010000100001001247915172314600145811027152822485504619124962158300710116111003710000241411000010000101001004110041100411004110041
202041004075300000103356223211149616112194410025226402261902825302211010010123100001010010000100005439594688488041001001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001250124174112910150111037150632505504536125002875800710116111003710000283011000010000101001004110041100411004110041
20204100407533030010266622259112481847073610025227801832054025302391010010102100001010010000100005439354688488005701001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001248124175014650147711065151832493504646125102670770710116111003710000323011000010000101001004110041100411004110041
20204100407533030010455672306114002127178010025225301771834125301961010010113100001010010000100005439394688488039001001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001246524166214830146411050149402485504499124942775209710116111003710000329311000010000101001004110041100411004110041
202041004076300000102426322561148013710294410025222422101874225302501010010097100001010010000100005439514688488049711001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001248926174813940147511024151602485504582124972774800710116111003710000225411000010000101001004110041100411004110041
202041004075330300102697722781148817111184010025226102341754225302541010010131100001010010000100005439674688488025801001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001248424172115030143211064149832457504538124822584803710116111003710000229511000010000101001004110041100411004110041
202041004075300300103926822781152014090103210025225501842062425302241010010137100001010010000100005439674688488018701001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001248424173714690150911017150232493504507124872762803710116111003710000214011000010000101001004110041100411004110041
2020410040763300001050667228611488197100704100252250019922438253024810100101551000010100100001000054394746884880621010017100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012493271663147511480110631514024895045431249925784007101161110037100003941011000010000101001004110041100411004110041
20204100407530030010497582294116881257273610025226202221742925302411010010140100001010010000100005439204688488050001001710040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001248523175314710144611048151202505504502124872580503710116111003710000239011000010000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20044100407533000104136622251222433612021210025216526221139253010610010104301000010010100001000054340946884881114010022100401004074463752130010201000010000202000020000100401004011100211091010100001000010124893116261622016741084014708249750454312490337170064061665100371000038741000010000100101004110041100411004110041
20024100407644000103715922111222449414021210025216521222541253055410010100141000010010100001000054339746884881606110022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124812615591648016701083015284248950457312496319100464051656100371000040541000010000100101004110041100411004110041
20024100407555000103868222181222442515021210025217221419843253002710010102231000010010100001000054334546884881489010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010125024015651670017011081515290248950451412487297280064071667100371000023551000010000100101004110041100411004110041
20024100407550000103056321901222416182212100252165242226572530536100101027910000100101000010000543393468848800450100221004010040744637521300102010000100002020000200001004010040111002110910101000010000101248132156816480170710840150502497504617124943376001064071666100371000040131000010000100101004110041100411004110041
2002410040755500010407632218122405341602121002521652242321012530659100101002310000100101000010000543329468848808600100221004010040744637520300102010000100002020240202401016410040111002110910101000010000101248635158316430167410831146710248150453112479317730564071666100371000030791000010000100101004110041100411004110041
200241004075440001059362220412208428021210025216520224335253065910010106431000010010100001000054337746884880754010022100401004074463752130010201000010000202000020000100401004011100211091010100001000010124652716841652016821084115284248950455912492428020464051655100371000023131000010000100101004110041100411004110041
200241004075404401018266221112208546120212100252158258225332530029100101054110000100101000010000543361468848819251100221004010040744637520300102010000100002020000200001004010040111002110910101000010000101249735154816390166610819152002489504568124753970301064061566100371000028441000010000100101004110041100411004110041
2002410040755005010371652211122001214021210025215824426936253027110010103111000010010100001000054300146884881601010022100401004074463752130010201000010000202000020000100401004011100211091010100001000010124833414921623016801080315130248950455512480287490564051655100371000038841000010000100101004110041100411004110041
20024100407655500101766122111221665013021210025216520422540253019110010100361000010010100001000054338546884880876010022100401004074463752030010201000010000202000020000100401004011100211091010100001000010124944017461678016881081515030248950457912484247230064071677100371000022231000010000100101004110041100411004110041
200241004075500001036273218312224616021210025216517123141253031710010104071000010010100001000054335346884881307110022100401004074463752130010201000010000202000020000100401004011100211091010100001000010124732714891624016901082614940248150457412486377740464051666100371000032081000010000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  stp s0, s1, [x6, #0x10]!
  stp s0, s1, [x7, #0x10]!
  stp s0, s1, [x8, #0x10]!
  stp s0, s1, [x9, #0x10]!
  stp s0, s1, [x10, #0x10]!
  stp s0, s1, [x11, #0x10]!
  stp s0, s1, [x12, #0x10]!
  stp s0, s1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5022

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602244009730122200010182812385200125628229772401392296645358222524302780102827568000080100800008000040053118448486503400240131401464012920133032008524010020080000800002001600001600004013940151118020110099100100800008000010082528201242248702491800511548024946824688825375211651400000511001161140137800028000080000801004014740117401764013040153
16020440128301300000100863562335200128829757604401502301477684492524271980102813118000080100800008000040053118429046483270240156404114035320067032007324010020080000800002001600001600004014240129118020110099100100800008000010082578402284245892494800741530024749424661825307818711400000511001161140110800028000080000801004017540107401554015940166
1602044012830022220098556923542001288151067004011622965755845025242012801028256380000801008000080000400531184362464603802401694012640165201060320099240100200800008000020016000016000040161401171180201100991001008000080000100825031614862470112487800561540024965584747825404914371400000511001161140120800028000080000801004010840148401084014440359
16020440343302220201104524692330200128814218744401112353629430108424192080102826198000080100800008000040053118442486444831240070401054010920061032028224010020080120800002001600001600004018940130118020110099100100800008000010082587181611248602480800481515024785204574827646311921400000511001161140346800028000080000801004012440149401664015240149
160204401453002000009975792336400113618479708401192295590533232524208680102836888000080100800008000040053118429286453931240086401114017620080032015924010020080000800002001600001600004015540137118020110099100100800008000010082509181107245622496800481530424909964641825415713211430000511001161140132800028000080000801004021240098401714015640163
160204401183012200009759772365200104029388772400992363501519312524381080102831548000080100800008000040053118461926475880240091401444011220200032004924010020080000800002001600001600004011540145118020110099100100800008000010082496161460249302483800571521424747264625825337614121400000511001161140126800028000080000801004013640118401254013140152
160204401223022220009945912359200104014255844400852299668594442524109180102826018000080100800008000040053118455686451461240112401424013520055032007324010020080000800002001600001600004011640134118020110099100100800008000010082522181250248422493800501536024967044625825375914991430000511001161140115800028000080000801004012540139401254015240145
16020440122301202000990988236320088811424668400972294546743402524160480102829628000080100800008000040053118443206471141240079401554015120045032009824010020080000800002001600001600004014940149118020110099100100800008000010082519161107247732501800621516024965464713825475711881440000511001161140121800028000080000801004014540140401214011640158
16020440141301202000948381235720010563195106564008923127165422625242814801028162280000801008000080000400531184422464628712400874014440126200420320108240100200800008000020016000016000040141401271180201100991001008000080000100825071611612489142491800471534224904284634825505312551440000511001161140177800028000080000801004015340148401384014540143
1602044012530020200010128682312200127231026692401342285431522452524283480102834718000080100800008000040053118442246481431240127401394015420046032007724010020080000800002001600001600004011840143118020110099100100800008000010082526161193249402498800551553225003704671825386810401430000511001161140137800028000080000801004013340137401404018940122

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5021

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 67 | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600454017030002022200101767222781016569524728402062229477667422524353080012804328000080010800008000040008118439126528320024011740411401452006803200882400102080000800002016000016000040125401081180021109101080000800001082490272053245742499800571523024849384652825084315041400502000111671040184800028000080000800104014240153401394013240175
160024401343000202200010137622323101696259667284013023205244905825241960800128393580000800108000080000400081184427264371900240134402064021520127032016124001020800008000020160000160000401584015211800211091010800008000010825021621722417924648005514960250025446218250539151514005020001116121140148800028000080000800104015640131401954018440133
16002440162300020000001011085229810166478935564013122705966524625240128800128012080000800108000080000400081184412864791200240130401684017520087032013224001020800008000020160000160000401434016711800211091010800008000010824912013282440224538005515140247525446258250446115714205020001116101040130800028000080000800104018340147401424017540214
160024401873010200220010116762290101608123057284010622243395403025242534800128267580000800108000080000400081184588064031300240204401994020020089032017324001020800008000020160000160000401364015311800211091010800008000010825001614522438324858005315160247225446468250965134314205020001116101040382800028000080000800104017140172401684016940146
1600244013430002020200101584822753016564355101640203226446663548252410168001284005800008001080000800004000811845328649074102401454015540143200840320090240010208000080000201600001600004011640177118002110910108000080000108248616169024671224608005715000248425447348251638173314005020001016101040132800028000080000800104019040201401744018440184
16002440159301020022001000869228510151223815556402022241455698592524043880012829938000080010800008000040008118454486519660024006940188401332007103201532400102080000800002016000016000040117401631180021109101080000800001082494181752245910248280050153122468252466182510549361400502000101610940143800028000080000800104017440145401664012840206
160024401813010200200010041802304101504999672840134227650156345252426388001284328800008001080000800004000811844920653951102401774019040148201260320140240010208000080000201600001600004020140138118002110910108000080000108249716132924276247480059152342479254460782530601403140050200011169640133800028000080000800104015940145401364016540114
16002440176300010010009942662290101520166045564018722706334856225241394800128378280000800108000080000400081184501664038610240161401134021120100032014624001020800008000020160000160000401204017011800211091010800008000010824821616632456324728005215102247225045808250953172814005020001016101040144800028000080000800104019640140401954018940192
16002440171301020022001009276229130162469575764014122564804204725241637800128200680000800108000080000400081184379265214110240131401834016020083032011824001020800008000020160000160000401404014711800211091010800008000010824941617442460724598004915340247625446518251746158114005020001116111040165800028000080000800104013640158401704015340160
16002440131301020200009861892296101472172470840189226758654638252415388001283039800008001080000800004000811843048641499002400984017440158200720320130240010208000080000201600001600004016140153118002110910108000080000108250220127124701324848003215140247625045938250945134914005020001016111040116800028000080000800104018940160401894018640184