Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (pre-index, Q)

Test 1: uops

Code:

  stp q0, q1, [x6, #0x10]!
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
90051165810001416101201150183325300010002000100020005999125351114011651165324300020004000116511651180011000100020291293724201201330152014127712151581666116210002000100011661166116611661166
9004116591110154310171211507032530001000200010002000599912492111401165116532530002000400011651165118001100010002035141115282012011720122017129312051651656116210002000100011661166116611661166
900411659101614192010011501011225300010002000100020005999124931114011651165323300020004000116511651180011000100020441361416201301318122018125312151671666116210002000100011661166116611661166
9004116591020150101812115000325300010002000100020005999124650114011651165323300020004000116511651180011000100020291253524201312318182031124512151661665116210002000100011661166116611661166
9004116591110143510701150124302530001000200010002000599912477111401165116532330002000400011651165118001100010002027139362320120000152000126112051671666116210002000100011661166116611661166
9004116581116154310114115078025300010002000100020005999124180114011651165323300020004000116511651180011000100020311469624201212188122031128512251661654116210002000100011661166116611661166
90041165811101402016011501103253000100020001000200059991254201140116511653233000200040001165116511800110001000204513693152012002310182018126112151571677116210002000100011661166116611661166
9004116591100162410121211500165253000100020001000200059991248011140116511653233000200040001165116511800110001000203713614202012011714122015129312151661667116210002000100011661166116611661166
9004116591010151220151211508103253000100020001000200059991258811140116511653233000200040001165116511800110001000202912772452012103114122043128812151661666116210002000100011661166116611661166
900411659111018371018121150838253000100020001000200059991250101140116511653243000200040001165116511800110001000204612937212012013514122031125312251661676116210002000100011661166116611661166

Test 2: Latency 3->3

Code:

  stp q0, q1, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1656

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20223102737744410677732194122009021211733217212302952562530100101002000010100200007627246818811021910273103167739391753010020020000200400001168411718111020110099100100100001000010022477341341236648242920125149002479504609225236612121447101161110300100001180020000101001166611713116901168711709
202041027177303106417322031220811021210249216312333181462530100101002000010100200007420553840801164711702116929106391003010020020000200400001170611690111020110099100100100001000010022486421881197636201620564151302472504662224916813851487101161111692100001241220000101001031010256103231028810241
20204103107640010470772202122161202121020521772222310209253010010100200001010020000741815392520116731165311724908639201301002002000020040000117221169911102011009910010010000100001002252138126523703124132013315144247250461122501581280144710116111019010000572120000101001025710374102741022610309
20204102927744010617812203122088021211673215122712752052530100101002000010100200007380653862801166411676116999089391793010020020000200400001173211667111020110099100100100001000010022470291683198310200220521147262487504632225208212631437101161111681100001241020000101001165411678114691168911677
20204117068844410728722202122161202121170321720319336186253010010100200001010020000740475389360116661160611704912139115301002002000020040000117391168011102011009910010010000100001002248929146923822724242013714940249650469822528621299143710116111022910000582120000101001031310227102201032010277
20204102707630310704752196122167075211691215012722951662530100101002000010100200007408253721001167611700116619098391363010020020000200400001168211665111020110099100100100001000010022485261544198829200720518148502479504617225385012251437101161111702100001433020000101001028010275102491028710332
202041028077400106208022091222415021211679217203612262162530100101002000010100200007406153838801170511722116839113391803010020020000200400001170511704111020110099100100100001000010022502261533197542198020518154002472504637225077614411437271161111697100001434120000101001028610270102461028110233
20204102567730310755652189122081102121020121710265219136253010010100200001010020000741505394920116851168710255770437703301002002000020040000102261025311102011009910010010000100001002250126134623704824292011315530249550462622513551247143710116111023910000383620000101001169511695116931168711679
20204116638833010500772223122084021211627221802442991872530100101002000010100200007410654035601164411640115369111391663010020020000200400001152411704111020110099100100100001000010022493201528195915199220507149242488504624225266513081427101161111681100001401220000101001173811699117091167311689
20204116598720010710762188122006021211689216522502421672530100101002000010100200007409153747601165711656116969082391623010020020000200400001165911668111020110099100100100001000010022501261566201825199920502149332472504602225184913161407101161111684100001251320000101001174611710116881167511659

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0310

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200431029785210121027578226811728508801017822510286324732530010100102000010010200007510847024811143311412102877671388863001020200002040000114491025911100211091010100001000010224986130723924921202011415252248950467522548381174000641516551136110000773120000100101020711379102091147910205
200241021085010101019770226011608709441133922482260317155253001010010200001001020000749714688561113611142310363766037696300102020000204000010236114151110021109101010000100001022469015902381302081200941535024815044742252541966000641516551026310000822020000100101021511425102731143010238
2002410227850101010158782250117126072411417224112313031332530010100102000010010200007508346854411146311449102907709377273001020200002040000102471142711100211091010100001000010225010163023765120082010915030248550458522530501220000641516541029710000682020000100101141210241114011024911394
200241135777010109819822301117203171211389225722912671432530010100102000010010200007752952360011022310188113448850388733001020200002040000114151020911100211091010100001000010224588131519835524162043815050247750462222523521173000641516541142910000842220000100101022311433102211141310248
2002410245851111010104792277117125093210217224512962681192530010100102000010010200007815352724811022810206114328891388973001020200002040000113761029111100211091010100001000010224814112420446024012046115051249750472622535401198000641516451143210000494020000100101027711409102451141810186
2002410203860101010266782305116324070410208221102552401052530010100102000010010200007787052436811018810264114058887388613001020200002040000113651025011100211091010100001000010224738121420844724422043115260246550459322508491114000641516541148610000794020000100101142610250114131018411396
2002411401760101010095822270115046072411390221502492601562530010101782000010010200007763752868811019410224114438842388273001020200002040000114271030911100211091010100001000010224679125120694823782041114870246950458022512601324000641316441142310000681020000100101028811401103211142510233
20024101807701010100778422771171250724102532222123731813225300101001020000100102000077349526528110231102311145388543885930010202000020400001135710208111002110910101000010000102248912118620634324502044115041248150464122533371133000641416451136610000490020000100101028511383102411140110295
200241022285110111033846229111632309481020022471280272157253001010010200001001020000778415257601102251023711392885338857300102020000204000010243114461110021109101010000100001022477011182100462393204591497024855050492253152993000641516441139810000530020000100101025911371102691139310274
20024102568501010102666523051170440724113722234028530615825300101001020000100102000075103468184111396114821025277133769230010202000020400001022811372111002110910101000010000102247810153723763521472010814920248550468422515501170000641416451020510000452020000100101140910291114031019411481

Test 3: throughput

Count: 8

Code:

  stp q0, q1, [x6, #0x10]!
  stp q0, q1, [x7, #0x10]!
  stp q0, q1, [x8, #0x10]!
  stp q0, q1, [x9, #0x10]!
  stp q0, q1, [x10, #0x10]!
  stp q0, q1, [x11, #0x10]!
  stp q0, q1, [x12, #0x10]!
  stp q0, q1, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0026

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602238022760040000976296232611408111008012623015478406125240104801041600008010016000040051136856401048014080156801486007636015624010020016000020032000080151801461180201100991001008000080000100162498131060247472490160112151522498222469416258311610570000511000116118016180004160000801008016880141801938015480200
160204802016013000097899923271127211264801482298669677432524010480104160000801001600004005113686840004801508013480132600883601382401002001600002003200008013080175118020110099100100800008000010016250413162624527248416009915180248622246131625708713020200511000116118019480004160000801008014980241801538020180179
16020480145601303009876892337113849172801332302499854502524010480104160000801001600004005113685328104801128019680152601083601092401002001600002003200008012380160118020110099100100800008000010016247517165324402249216014515390248622246451625689415630000511000116118016480004160000801008017480150802128019080210
160204801436013000097891562310114081325680120228910201094103252401048010416000080100160000400511368681610480139801638018260090360099240100200160000200320000801358013811802011009910010080000800001001624749153924941247616009415010248019046761625609812810600511000116118017080004160000801008016880246801738018480189
16020480119601444009873198231811256152528019923068801015104252401048010416006080100160000400511368513600480130801618023760072360126240100200160000200320000802138017511802011009910010080000800001001624902910352461125051601421537425022264700162573124234221000511000116118014680004160000801008020680183801738016980165
16020480175601330301005314823261121612100801592293968884642524010480104160000801001600004005113686648104801768018380214601863601452401002001600002003200008022980138118020110099100100800008000010016249217123224709250216006315276249822247061625789116590000511000116118017280004160000801008019080160802268024080210
1602048015760033030976518623331128818272801792289707754522524010480104160000801001600004005113688160104801008011880207601013601672401002001600002003200008013080266118020110099100100800008000010016250032118624453250316011915333249021247551625609815920000511000116118016380004160000801008015580238801808016080155
160204801466012000097868223111125614100801352296571980522524010480104160000801001600004005113688880104802208020780202600723601382401002001600002003200008015880145118020110099100100800008000010016250020142724501247916009615090250428246661625897818320300511000116118012980004160000801008024280137801568018480203
1602048018460130001967515723151126416104802072279734952482524010480104160000801001600004005113687464104801468019480203601423601202401002001600002003200008020180223118020110099100100800008000010016250023179924655250016010915260249017846471625569218240000511000116118017780004160000801008022680129801498023480211
16020480173600303309792167232411240132208017423116185468725240104801041600608010016000040051136877281048013980179801656012836010424010020016000020032000080245801761180201100991001008000080000100162499172276243622480160123152102482158467416258411218060500511000116118022580004160000801008013680164802788027080199

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0025

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600438019960040000009588150235710012721326480240225183863579252400148001416000080010160000400061368818405480324801568017560094360181240010201600002032000080142801711180021109101080000800001016250828250124306250116016714990248849247261625721552275045020075160338022680004160000800108012380199802028030480123
16002480134601300030098791832297100149618276801692278988838412524001480014160000800101600004000613681800110480244801528019760170360230240010201600002032000080257802231180021109101080000800001016250918178624554249616007115123250521247431625511072872065020073160328008080004160000800108018780110801458012780139
160024801006001100100991294234710014326264800682324741913482524001480014160000800101600004000613682928010480112801888022060061360127240010201600002032000080195800941180021109101080000800001016251013135224747250416011115090251021246531625601481135005020002160338021080004160000800108014580149802198031580295
1600248027860030000009936112234410013041125280095229647952726252400148001416000080010160000400061368852011048013380169802346015836013924001020160000203200008009180109118002110910108000080000101625123812249902496160079153202518212463216256979755005020003160338023780004160000800108020580194801348014580140
16002480211601200000096151942349100141692888013523105164582925240014800141600008001016000040006136867440104801158013880162601033600662400102016000020320000801638010111800211091010800008000010162490515062479124881601611505025105048641626086810580050201083160338022880004160000800108030380114802228027480148
1600248017660122000009921218235210010161425680115227681183087252400148001416000080010160000400061368292801048013980186801436016536017424001020160000203200008013780118118002110910108000080000101625189544245872506160104148802509262467116257413320360050201083160338024080004160000800108021080204800968025780242
160024800906002200000978319423001001504134688025122318591011552524001480014160000800661600004000613688232110480253802668018060163360188240010201600002032000080286802461180021109101080000800001016247216219524561247616010115010248447845931626081421804005020003160338025980004160000800108025480249802558018080197
16002480208601330030097832062346100151217236802752244799711372524001480014160000800101600004000613687344010480155802228019860183360196240010201600002032000080242802231180021109101080000800001016248813214524281246616012715100249249846341625961212224005020003160338024380004160000800108024080189802898027480232
16002480196601340000010020191229710015282527280228226380912245925240014800141600008001016000040006136880640104802888027580223601803601372400102016000020320000802678024111800211091010800008000010162492231206244072524160123147242488514467416261314522390050201002160238031180004160000800108029180278802838032480262
16002480247602440040097201682297100128010172801382324117844051252400148001416000080010160000400061368492001048023280134801776026436018024001020160000203200008014780179118002110910108000080000101624862124932439324891600881507025102124606162621611381005020003160338012080004160000800108028980160802468021280198