Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, Q)

Test 1: uops

Code:

  stp q0, q1, [x6], #0x10
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f2022293a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696d6erob full (74)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst simd store (99)inst ldst (9b)l1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)ea? ldst retires (ed)? int retires (ef)f5f6f7f8fd
9005116591100032810601150000025300010002000100020005999119021114011651165032330002000400011651165118001100010002016249550202000302622020260950514516541162100002000100011661166116611661166
9004116580000921400001150023002530001000200010002000599911924111401165116503233000200040001165116511800110001000203208750262000104914020280550514516551162100002000100011661166116611661166
90041165900000224101012115001002530001000200010002000599911894011401165116503233000200040001165116511800110001000204007970162000001416320320710514316531162100002000100011661166116611661166
900411659000003120014241150430602530001000200010002000599911902011401165116503233000200040001165116511800110001000203206330322000003018020320980514416441162100002000100011661166116611661166
9004116590100630024401150000025300010002000100020005999119021114011651165032330002000400011651165118001100010002036011100402000002414320360630514416441162100002000100011661166116611661166
900411659000064281015161150210002530001000200010002000599911924011401165116503233000200040001165116511800110001000202008770162002002216020000870513316221162100002000100011661166116611661166
900411659000002141010121150158002530001000200010002000599911902111401165116503233000200040001165116511800110001000202407940242000001816020260790514516551162100002000100011661166116611661166
900411659010003000412115013000253000100020001000200059991188901140116511650323300020004000116511651180011000100020240106100242000004412020240870513416451162100002000100011661166116611661166
9004116580000022200104115053302530001000200010002000599911902011401165116503233000200040001165116511800110001000202208730202000002214020240710514416421162100002000100011661166116611661166
900411659000003321241416115013167025300010002000100020005999119940114011651165032330002000400011651165118001100010002030063002220000000020440950513316441162100002000100011661166116611661166

Test 2: Latency 3->3

Code:

  stp q0, q1, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0279

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)18191e1f2022293a3c3e3f404446494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
202231159586220001059910922571172050980115432250024324712825301001010020000101002000079021469096110204115051154689213766230100200200002004000010285102521110201100991001001000010000100224932716982030452068204891547324895045982252553126304710216221018810000180120000101001024510161101891021810255
202041020077110001047995231311496507201020122731322270602530100101002000010100200007915646609611016411484114858885389763010020020000200400001147311533111020110099100100100001000010022482615422019492044205311545125015046012252848123401710216221029310000380120000101001018910242101711023410254
2020410178761110010260782271114643094411461225412522731472530100101002000010100200007548852830411156110194101977641376643010020020000200400001151011485111020110099100100100001000010022497510472395292460200841508124895047792255148128701710216221152210000740520000101001148111557114991153011545
2020411477861010010227982299114643095610171226702552851002530100101002000010100200007920846592811016311521115708977389443010020020000200400001151911539111020110099100100100001000010022493915451993242034204841534424775046942251846123504710216221156710000870320000101001150411470115101153111576
2020411487872000010536842320114806098010176227402792481202530100101002000010100200007933747015211018011561115048913389903010020020000200400001159511536111020110099100100100001000010022493914412016152001205351534024895046312251834125400710216221019710000271220000101001016310240102341026310243
2020410233761010010347502271117125076011486222102082889825301001010020000101002000075575527584011520102151023376593763930100200200002004000010210102121110201100991001001000010000100224872111402442242451200911551024735045862251662114100710216221157810000691820000101001032210207102521017210240
202041021577220001046445232811496809401154222610257220120253010010100200001010020000791044677281101501146011456896838927301002002000020040000115371152111102011009910010010000100001002249526159420111920562052715082248550461822521511402017102162211502100001010320000101001148711558115511159511455
2020411539861000010431912286114728071611530225301962811192530100101002000010100200007557653008011146410278102267658376883010020020000200400001017310196111020110099100100100001000010022493912952436132473200871503024855046532254072114500710216221024810000270120000101001021310185102041019010247
2020410217772020010599622267114721308801146622740231219182253010010100200001010020000755765268161115141021510247761637682301002002000020040000102261018911102011009910010010000100001002250712134724061824562010215500248950465922517571354007102162211484100002714320000101001023310199102471021210257
20204102007730000102425623131151260804114832267024520719725301001010020000101002000079179468928010198115441159289293899530100200200002004000011542115201110201100991001001000010000100225011615371995332019205291531224855045402250934114006710216221020810000320120000101001025710258102141020710233

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0349

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)181e1f2022293a3c3e3f404446494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
200431022077101010224592268116967072410247223402942691322530010100102000010010200007517052465611139811433114118870377653001020200002040000102791027311100211091010100001000010224884158420681421462046015030248950456222574591257000640316321136410000780120000100101023810219102801023610238
2002410326761000101946322681170470724101842219032732213225300101001020000100102000077846470104010294102171023376503772730010202000020400001024610220111002110910101000010000102248241346240125233620108151512489504572225466511751400640216331025110000561020000100101024910221102891025210285
2002410219762000100928122751168050724113802213023229012925300101001020000100102000075021526000011413114371139588033888330010202000020400001028110337111002110910101000010000102248438128423977324332015314862248550462422547649751400640316221032010000531120000100101027310206102471023410192
200241027777222010329137226011712507241021022200310385106253001010010200001001020000753214735161102351032710320779237742300102020000204000010285102231110021109101010000100001022495201593207237207920428149522479504653225045110731400640316331141510000750220000100101143511445114531142811431
20024114398520009981962234116969071611472222912652871822530010100102000010010200007374152763601142411437114108843389383001020200002040000114141136111100211091010100001000010224732015242123732107204151502024815045982250866957000640316221026210000362020000100101029310275102271026010272
200241020277111010134742268117045072411396219302692941172530010100102000010010200007511752312001140011387114778847388783001020200002040000114141149611100211091010100001000010224894163920334921032043215441247350458322526601201000640216331142910000675020000100101144411385114041136711388
2002411442851000102309322681169640716102092229028827010625300101001020000100102000077646470608010225102271018477203773630010202000020400001023610213111002110910101000010000102248320100423873424042010915340248150461022536551164010640316321026810000514120000100101026110217101941034410305
200241023576110010188822256117123071610237222702543321022530010100102000010010200007801346811211138111403114338850389443001020200002040000114101142211100211091010100001000010224714155620423620552043814871248950453222520431186000640216331141110000580020000100101142311386114451141111397
200241143885100010059682267117043071611390222803233091392530010100102000010010200007800147058401027110284103217670377623001020200002040000102661028911100211091010100001000010224797151320782520732047515150246550459022523611044020640316331029510000580020000100101024810300102911028710284
20024102237610001027572227311496507041143122020305326132253001010010200001001020000737345265320113561148311457885538880300102020000204000010239102831110021109101010000100001022486181326236624244520139151002463504603225057010932720640316321027910000661020000100101149411396114361144011445

Test 3: throughput

Count: 8

Code:

  stp q0, q1, [x6], #0x10
  stp q0, q1, [x7], #0x10
  stp q0, q1, [x8], #0x10
  stp q0, q1, [x9], #0x10
  stp q0, q1, [x10], #0x10
  stp q0, q1, [x11], #0x10
  stp q0, q1, [x12], #0x10
  stp q0, q1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0035

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)1e1f2022293a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? ldst retires (ed)? int retires (ef)f5f6f7f8fd
160223802446013033981320522931148810264803232221791100471252401048010416000080100160000400511368986914802758031480259601553602092401002001600002003200008029280226118020110099100100800008000010016246627233424231024751602061503424685044725162619129100314805110116118023780004160000801008024480236801718030980302
160204802496015000971427822671196015260802592258601114112325240104801041600008010016000040051136907321480209802188030760279360220240100200160000200320000803118035411802011009910010080000800001001624842726292460724731601751512824685024751162632184293614405110116118028380004160000801008026080221802908025380244
160204803466014400986130722932148817264803202229109582264252401048010416000080100160000400511368979614802708034580365601603602942401002001600002003200008028580286118020110099100100800008000010016251929251323871224651602081493524762584829162610161143821005110116118019680004160000801008023480285803078031780348
160204803796014000993925722941170419228800842269972124210325240104801041600008010016000040051136932761480204803078028860130360238240100200160000200320000802848025011802011009910010080000800001001624822722092449824661601521526824725104752162594171173114005110116118028280004160000801008027580235803728034580248
16020480337601300098521662343115041420880179224812986875725240104801041600008010016000040051136896041480247803378021660214360215240100200160000200320000802328029611802011009910010080000800001001624852620462414524831601921518324784984763162627154227321005110116118033580004160000801008030380254803898030280274
1602048035960240449984304226311480172688032422699739807625240104801041600008010016000040051136931581480279802098025160190360192240100200160000200320000802588027511802011009910010080000800001001624862717742410324711601681493024725104705162618218203414005110116118027080004160000801008026880241803308031180306
1602048033860140401008928322491148012224802472265793115811125240104801041600008010016000040051136923401480193803638033460214360317240100200160000200320000803438031911802011009910010080000800001001624623625072440324651602021659024685124811162622187196314805110116118029480004160000801008025880341802708029580327
160204803246024040100592522293114801426080313222472513496225240104801041600008010016000040051136907801480327802628030460184360182240100200160000200320000803308033011802011009910010080000800001001625013528722399224821601931546024762764764162663165201114405110116118022980004160000801008030680357802828033080278
16020480280602404497923282284115041328480239247396684587252401048010416000080100160000400511368929214802368023180268602883601652401002001600002003200008030680265118020110099100100800008000010016249029254023972249416018415061224662584737162664157290614005110116118035780004160000801008022880298803788027580334
16020480242601400498372072272117121426080267223297665631025240104801041600608010016000040051136928101480180802348029760282360228240100200160000200320000802778028511802011009910010080000800001001624882922312416324801601621483524842584815162615144179914005110116118019280004160000801008023080284802998029980362

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0025

retire uop (01)cycle (02)03l1i tlb fill (04)l1d tlb fill (05)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f20222324293a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)606167696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)branch cond mispred nonspec (c5)cfd0d2d5map dispatch bubble (d6)ddfetch restart (de)dfe0? int output thing (e9)? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1600438013160101101009771298228110013041216080226234469254159252400148001416000080010160000400061368835610480125801728025260144360248240010201600002032000080174801331180021109101080000800001016251214950240672472160116151702505194483216256714417551405022003161128015280004160000800108020180289802668020680146
16002480198600000000098138723281001000112128028522559856155425240014800141600008001016000040006136880400048014980240802086012136029124001020160000203200008015380194118002110910108000080000101624970154024608246916016914830247417646541626272012134005022001161128014880004160000800108015580210802148021280372
1600248021760100000009702217231310012481216080204224177310425525240014800141600008001016000040006136930800048016880388802896013236017724001020160000203200008031080248118002110910108000080000101624690161324775245516010515200250511246731625941421482005022001161128021780004160000800108023480278802858028480154
16002480179601010000099601192310100101691968025222356691337462524001480014160000800101600004000613684032104801388015880252601533601302400102016000020320000802168023211800211091010800008000010162494097024685246916014016660249051646821625671041009015022001162228023880004160000800108020380185801698019680126
16002480127601000000099331052296100129614156802612272342666312524001480014160000800101600004000613687464004801648012480225602053602122400102016000020320000802198021911800211091010800008000010162484050624781246916013715420247828646891626051522463005022001161128032380004160000800108018880170803708015580208
1600248026460100000001003813422421001272101768014923057808705025240136800141600008001016000040006136894801048017380151801696009436009524001020160000203200008028280315118002110910108000080000101624850185224453248216011815440247651446791625651481299005022001161128024580004160000800108026780333801628028380218
16002480238600000000097681592281100126482928020222211214532552524001480014160000800101600004003413685424004801158034680199601813602562400102016000020320240801898025711800211091010800008000010162456017092425525071601051540024944644700162620941085005022001161128025280004160000800108015880304801808025980112
1600248024660100000009507263234310012961320480213224980382868252400148001416000080010160000400061369200010480129802398018060197360251240010201600002032000080161801521180021109101080000800001016248502321242432462160148154402499784681162612261702005022001161128019480004160000800108015580121802328023180178
1600248023260210001201912363208423011001688113568216122041612128015583562423348119516096081186162160405384375308010482823831498316661556108626932434542016192020320000802998041611800211091010800008000010162476015452405424721611351513024867341750216377817222450052620042051628304181184160000800108297482894830278227082840
16002482326623000001721126361996218210012968212831512187195616371720703242450816941610808152216291640650138387720048316783585826366228636026524001020160000203200008024880264118002110910108000080000101624820184724428249416010815470249842646841626891723575005022001161128033880004160000800108033180162803478013580274