Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, D)

Test 1: uops

Code:

  stp d0, d1, [x6], #0x10
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)1e1f20223a3e3f40464951schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map simd uop (7e)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd store (99)inst ldst (9b)l1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)aaabacafl1d cache miss st nonspec (c0)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)ea? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
900611669010512025011512410253000100010001000100010001000599910875800011145116611663243000100010002000200011661166118001100010001004024230100012212010302451711167711631000110001000100011671167116711671167
90041166800033010011510112530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140241391000032831014285166167711631000010001000100011671167116711671167
9004116690063141180115141025300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000101204412710000363601028285176167711631000010001000100011671167116711671167
900411669000314110011516012530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140201201000114801014445177167711631000010001000100011671167116711671167
90041166800631411001151101125300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000102202012210002222401022285167165511631000010001000100011671167116711671167
9004116690004141140115114002530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140241141000014801014285177168811631000010001000100011671167116711671167
90041166910124161100115140025300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000100002000100000001000285177167711631000010001000100011671167116711671167
900411669000314110011514212530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010000202141002014801014205157167711631000010001000100011671167116711671167
9004116680003141100115142125300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000103004863010000302401022285166166711631000010001000100011671167116711671167
90041166900631411001151100125300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000101402412210010362031030365177167711631000010001000100011671167116711671167

Test 2: Latency 3->3

Code:

  stp d0, d1, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)18191e1f202229373a3c3e3f404446494f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
20224100407510100102725322741166496742121002522180189236312530351101001011710000101001000010000543994468848805391001610040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001247714141115380152110968148902493504569125162364000710116111003710000215411000010000101001004110041100411004110041
20204100407500000104974023081167213600932100252228019723618253012210100101711000010100100001000054397446884880449100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124690157215190148410979152602477504582125012874100710116111003710000226311000010000101001004110041100411004110041
20204100407500000103295723001166411121776100252228023219625253023710100102781000010100100001000054400246884880296100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124850166914960150210990151202497504638124972563000710116111003710000395611000010000101001004110041100411004110041
20204100407500000104524422631149620620944100252263025721627253027010100101321000010100100001000054399446884880484100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124970160114950147510978151302453504548125112765700710116111003710000273611000010000101001004110041100411004110041
20204100407600000103176222971159213530716100252236018724320253022110100101211000010100100001000054397846884880284100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124890165215080152510990150402481504576125042961000710116111003710000293511000010000101001004110041100411004110041
20204100407500000103024922631151217120944100252256220621725253020310100101681000010100100001000054397146884880387100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124850162214880146411003150302481504498125141874700710116111003710000216311000010000101001004110041100411004110041
2020410040750000010374502302122327780212100252168021518737253022610100101381000010100100001000054399446884880785100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124855138216970170210797148122493504578125093463202710116111003710000318911000010000101001004110041100411004110041
2020410040751000010602502186122169663212100252168018021224253021210100102081000010100100001000054399846884880382100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124695148516280169610826150102477504594124962568500710116111003710000149311000010000101001004110041100411004110041
20204100407510000105634422001221624150212100252251020819632253023610100101321000010100100001000054399446884880409100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124695144216550169710816153602477504584124913178200710116111003710000268711000010000101001004110041100411004110041
20204100407510000106565522071221611560212100252147020721924253022110100101231000010100100001000054399146884880490100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124855146116700171110840151202469504651124972464900710116111003710000240111000010000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)1e1f202229373a3c3e3f404446494f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cfd2d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
200441004075101103179522741172018951724100252223020725928253027110010106621000010010100001000054336546884880939001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012487111585159101566109321493024895046341251736674016400316331003710000339301000010000100101004110041100411004110041
200241004075202102697122751172029840724100252236024924326253002010010104321000010010100001000054327746884882023001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012481161538157701598109521497024805045951249644633726400216221003710000281301000010000100101004110041100411004110041
20024100407510010284612282117042174071610025222202332515625300191001010305100001001010000100005433454688488071000100211004010040744637520300102010000100002020000200001004010040111002110910101000010000101248141790154801585109531519024895045701251248706746400216231003710000250401000010000100101004110041100411004110041
200241004075220102039122751170427380716100252230022924426253054810010102571000010010100001000054339746884880678001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012487221678159401580109311524224975046091251541700746400316331003710000257101000010000100101004110041100411004110041
20024100407522210173101226611704222907241002522360234253282530212100101022810000100101000010000543389468848816421010021100401004074463752030010201000010000202000020000100401004011100211091010100001000010124971116201570015561093515522248050453212507517697064003162210037100002721001000010000100101004110041100411004110041
20024100407522010269662268117045366072410025223602752353725302721001010432100001001010000100005433294688488163000100211004010040744637520300102010000100002020000200001004010040111002110910101000010000101247241496157701597109211562124815045601250137575026400316331003710000260601000010000100101004110041100411004110041
2002410040752001028152225311696305227161002522290207241312530026100101001810000100101000010000543341468848807750010021100401004074463752030010201000010000202000020000100401004011100211091010100001000010124871316111570015821094514884246850465112523397180064002163310037100003451501000010000100101004110041100411004110041
200241004076200103506422621169625840724100252216024024825253020410010100121000010010100001000054326146884880036001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012478151597155801577109411531124655046121252151698026400216331003710000218801000010000100101004110041100411004110041
200241004075200103956322741169623511072410025221402222484625303221001010552100001001010000100005433374688488073210100211004010040744637520300102010000100002020000200001004010040111002110910101000010000101249841538154801579109351513024735046441251942774706400216331003710000327401000010000100101004110041100411004110041
200241004075202105158222601170432260724100252237024821235253019310010100171000010010100001000054334146884880907001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012488131577156901571109561539624805045751251244605746400216331003710000247401000010000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  stp d0, d1, [x6], #0x10
  stp d0, d1, [x7], #0x10
  stp d0, d1, [x8], #0x10
  stp d0, d1, [x9], #0x10
  stp d0, d1, [x10], #0x10
  stp d0, d1, [x11], #0x10
  stp d0, d1, [x12], #0x10
  stp d0, d1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5019

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)1e1f202229373a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)6067696d6emap stall dispatch (70)rob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
160224401923026661042542228511712367411264401312236797551462524347480110828418000080112800128000640058318472366431930240134401344020720079062012424013120080016800162001600321600324012340108118020110099100100800008000010082518362029244911248780036149710251651045878251047174714811151170160040196800108000080000801004016840168401174017240142
160204401173015001023354228511496333184884010722528024014925241523801108290080000801168001280007400583184444265142602401204011940154200430620089240140200800168001620016003216003240170401811180201100991001008000080000100825183614702452525028003515270249251045378250939205514011151170160040135800108000080000801004013140167401384016440156
1602044012530044010245722324114645059145204014022317408014625241638801028122880000801008000080000400531184439264882202401414013640120200410320086240100200800008000020016000016000040113401481180201100991001008000080000100825063214892443624988005315360250451047118248624186814400051101161140161800028000080000801004017840128401694018040102
160204402003004401021883226811456304911520401342238676649472524427880102829428000080100801278000040053118432166504080240176401984015120202032010824010020080000800002001600001600004018340111118020110099100100800008000010082509361602246011724878004815058246851045798247437160014000051101161140118800028000080000801004014040190401654024340117
1602044013830140410602592271117043990112644014822517584193625241071801028389880000801008000080000400531184393664356902400834011740158200320320100240100200800008000020016000016000040150401401180201100991001008000080000100825142815782435524818003514950246851046198250525138214000051101171140149800028000080000801004013040137401734013540147
1602044016030040410233622279116722624628840118229047535849252440368010284101800008010080000800004005311844800648521024011940142401552006703201562401002008000080000200160000160000401334012711802011009910010080000800001008249536155124612025038006615005249225447358249751234914000051101171140136800028000080000801004013540160401314015340155
1602044019130150510437572232119602587626440148221253637125252410798010282949800008010080000800004005311844992651333024009540173401302002903201122401002008000080000200160000160000401254011311802011009910010080000800001008249623162424671624758003615130247625445818250144193814000051101171140165800028000080000801004016440142401574015640146
160204401843004401039281223911952141692644008022336273885025242986801028123580000801008000080000400531184590465001702401124014140185200530320087240100200800008000020016000016000040167401531180201100991001008000080000100825102820982447125008003814924248425446898251956271014000051101171140120800028000080000801004017540140401594018940155
1602044016330044410383632232119523356112644008722128034356625241113801028277080000801008000080000400531184624064968202401414017940122202120320119240100200800008000020016000016000040132401361180201100991001008000080000100824823614272457025088004315120249225446108249144160614400051101161140154800028000080000801004013940161401214011940112
1602044012630040010491872246119682789102644010122126896405925243867801028140080000801008000080000400531184336064505202401114013340186200040320109240100200800008000020016000016000040156401331180201100991001008000080000100824982818542432424938004515064249925446308247153192214300051101161140133800028000080000801004014640124401524012440197

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5016

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)191e1f202229373a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)606167696d6emap stall dispatch (70)rob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cfd0d2d5map dispatch bubble (d6)dadbddfetch restart (de)e0? int output thing (e9)? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600444019730133301034744231011704107432040123225166151151252438248001282448800008001080000800004000811843696654649152400754010240096200240320077240010208000080000201600001600004016340138118002110910108000080000108249820117524796249280049152002484510473982505401662140502053151600171740123800028000080000800104012140131401414012840160
16002440137300220010257782310112483215732040071223950762141252444528001280526800008001080000800004000811844032640285152400934011640116200530320130240010208000080000201600001600004013740144118002110910108000080000108252216946246752470800291519024967464606825214514471405020538160091740120800028000080000800104013540122401144014540133
160024401023002000975656234811456104625240125230737635641252401638001283762800008001080000800004000811844728649458152400754013940159200500320105240010208000080000201600001600004013540119118002110910108000080000108251020164824656249080058153242480510455982513331386142502053171600171740132800028000080000800104011340104401384017240143
160024401343003300105034522461197613279264401592219729831412524428380012835108000080010800008000040008118442246487971524027040152401642006503201402400102080000800002016000016000040169401621180021109101080000800001082510361886245192476800341538424922524582825253919521485020538160081740103800028000080000800104016840125401614014840128
160024401153014400105066922391197666682644012722335066106525243273800128049680000800108000080000400081184458464138415240161401404013120056032013924001020800008000020160000160000401544016011800211091010800008000010825143217402450825078005115030246825445868251248187514850205317160017840146800028000080000800104012240108401124013940115
1600244012830033301023346231111632360652964012022616734665925244083800128435980000800108000080000400081184434465055315240100401004014920058032009824001020800008000020160000160000401364016311800211091010800008000010825062716292452142488800411515924845104659825255013481435020539160017840095800028000080000800104013940083401324018240125
16002440126301333010347742274114645326252401282290649605312524079780012800368000080010800008000040008118442726424011524009940168401172005703201042400102080000800002016000016000040154401301180021109101080000800001082513249332478824848002914960248050845838253365153714350205418160091740123800028000080000800104016740125401574013540150
16002440161301300010242492317114564899520401292262523578632524291480012810978000080010800008000040008118435766487391524009240116400992007003201282400102080000800002016000016000040123401551180021109101080000800001082502241687246418248280034150702492508467382512311366143502054171600171740139800028000080000800104017840130401854017740074
160024401073013330103026522461196098252644011622055186336525243566800128243680000800108000080000400081184362465161415240064401144015020066032011624001020800008000020160000160000401514010211800211091010800008000010825062314262480224938005215476246825446258250445162614650205417160018740110800028000080000800104012640114401304013740156
160024401473004400105546222521196843008264401042219691659432524129980012832678000080010800008000040008118436486483511524014640214401552006703201042400102080000800002016000016000040140401291180021109101080000800001082512278932453625008005515200249251045388250946109314050205461600171740185800028000080000800104013840123401294015340125