Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, D)

Test 1: uops

Code:

  stp d0, d1, [x6], #0x10
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)03080b1e1f20223a3e3f40464951inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op ld/st (7d)~map op fp/simd (7e)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst fp/simd store (99)inst ldst (9b)a0a1a2a3a6a7a8aaabacafdcache store miss (c0)cfd5d6ddinst fetch restart (de)e0? int output thing (e9)eald/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
900611669010512025011512410253000100010001000100010001000599910875800011145116611663243000100010002000200011661166118001100010001004024230100012212010302451711167711631000110001000100011671167116711671167
90041166800033010011510112530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140241391000032831014285166167711631000010001000100011671167116711671167
9004116690063141180115141025300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000101204412710000363601028285176167711631000010001000100011671167116711671167
900411669000314110011516012530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140201201000114801014445177167711631000010001000100011671167116711671167
90041166800631411001151101125300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000102202012210002222401022285167165511631000010001000100011671167116711671167
9004116690004141140115114002530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140241141000014801014285177168811631000010001000100011671167116711671167
90041166910124161100115140025300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000100002000100000001000285177167711631000010001000100011671167116711671167
900411669000314110011514212530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010000202141002014801014205157167711631000010001000100011671167116711671167
9004116680003141100115142125300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000103004863010000302401022285166166711631000010001000100011671167116711671167
90041166900631411001151100125300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000101402412210010362031030365177167711631000010001000100011671167116711671167

Test 2: Latency 3->3

Code:

  stp d0, d1, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire (01)cycle (02)0305080b18191e1f202229373a3c3e3f404446494f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
20224100407510100102725322741166496742121002522180189236312530351101001011710000101001000010000543994468848805391001610040100407424374983010020010000100002002000020000100401004011102011009910010010000100001001247714141115380152110968148902493504569125162364000710116111003710000215411000010000101001004110041100411004110041
20204100407500000104974023081167213600932100252228019723618253012210100101711000010100100001000054397446884880449100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124690157215190148410979152602477504582125012874100710116111003710000226311000010000101001004110041100411004110041
20204100407500000103295723001166411121776100252228023219625253023710100102781000010100100001000054400246884880296100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124850166914960150210990151202497504638124972563000710116111003710000395611000010000101001004110041100411004110041
20204100407500000104524422631149620620944100252263025721627253027010100101321000010100100001000054399446884880484100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124970160114950147510978151302453504548125112765700710116111003710000273611000010000101001004110041100411004110041
20204100407600000103176222971159213530716100252236018724320253022110100101211000010100100001000054397846884880284100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124890165215080152510990150402481504576125042961000710116111003710000293511000010000101001004110041100411004110041
20204100407500000103024922631151217120944100252256220621725253020310100101681000010100100001000054397146884880387100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124850162214880146411003150302481504498125141874700710116111003710000216311000010000101001004110041100411004110041
2020410040750000010374502302122327780212100252168021518737253022610100101381000010100100001000054399446884880785100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124855138216970170210797148122493504578125093463202710116111003710000318911000010000101001004110041100411004110041
2020410040751000010602502186122169663212100252168018021224253021210100102081000010100100001000054399846884880382100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124695148516280169610826150102477504594124962568500710116111003710000149311000010000101001004110041100411004110041
20204100407510000105634422001221624150212100252251020819632253023610100101321000010100100001000054399446884880409100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124695144216550169710816153602477504584124913178200710116111003710000268711000010000101001004110041100411004110041
20204100407510000106565522071221611560212100252147020721924253022110100101231000010100100001000054399146884880490100161004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124855146116700171110840151202469504651124972464900710116111003710000240111000010000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire (01)cycle (02)0305080b1e1f202229373a3c3e3f404446494f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2cfd2d5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
200441004075101103179522741172018951724100252223020725928253027110010106621000010010100001000054336546884880939001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012487111585159101566109321493024895046341251736674016400316331003710000339301000010000100101004110041100411004110041
200241004075202102697122751172029840724100252236024924326253002010010104321000010010100001000054327746884882023001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012481161538157701598109521497024805045951249644633726400216221003710000281301000010000100101004110041100411004110041
20024100407510010284612282117042174071610025222202332515625300191001010305100001001010000100005433454688488071000100211004010040744637520300102010000100002020000200001004010040111002110910101000010000101248141790154801585109531519024895045701251248706746400216231003710000250401000010000100101004110041100411004110041
200241004075220102039122751170427380716100252230022924426253054810010102571000010010100001000054339746884880678001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012487221678159401580109311524224975046091251541700746400316331003710000257101000010000100101004110041100411004110041
20024100407522210173101226611704222907241002522360234253282530212100101022810000100101000010000543389468848816421010021100401004074463752030010201000010000202000020000100401004011100211091010100001000010124971116201570015561093515522248050453212507517697064003162210037100002721001000010000100101004110041100411004110041
20024100407522010269662268117045366072410025223602752353725302721001010432100001001010000100005433294688488163000100211004010040744637520300102010000100002020000200001004010040111002110910101000010000101247241496157701597109211562124815045601250137575026400316331003710000260601000010000100101004110041100411004110041
2002410040752001028152225311696305227161002522290207241312530026100101001810000100101000010000543341468848807750010021100401004074463752030010201000010000202000020000100401004011100211091010100001000010124871316111570015821094514884246850465112523397180064002163310037100003451501000010000100101004110041100411004110041
200241004076200103506422621169625840724100252216024024825253020410010100121000010010100001000054326146884880036001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012478151597155801577109411531124655046121252151698026400216331003710000218801000010000100101004110041100411004110041
200241004075200103956322741169623511072410025221402222484625303221001010552100001001010000100005433374688488073210100211004010040744637520300102010000100002020000200001004010040111002110910101000010000101249841538154801579109351513024735046441251942774706400216331003710000327401000010000100101004110041100411004110041
200241004075202105158222601170432260724100252237024821235253019310010100171000010010100001000054334146884880907001002110040100407446375203001020100001000020200002000010040100401110021109101010000100001012488131577156901571109561539624805045751251244605746400216331003710000247401000010000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  stp d0, d1, [x6], #0x10
  stp d0, d1, [x7], #0x10
  stp d0, d1, [x8], #0x10
  stp d0, d1, [x9], #0x10
  stp d0, d1, [x10], #0x10
  stp d0, d1, [x11], #0x10
  stp d0, d1, [x12], #0x10
  stp d0, d1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5019

retire (01)cycle (02)0305080b1e1f202229373a3e3f4046494f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)6067696d6edispatch stall (70)74scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)ld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160224401923026661042542228511712367411264401312236797551462524347480110828418000080112800128000640058318472366431930240134401344020720079062012424013120080016800162001600321600324012340108118020110099100100800008000010082518362029244911248780036149710251651045878251047174714811151170160040196800108000080000801004016840168401174017240142
160204401173015001023354228511496333184884010722528024014925241523801108290080000801168001280007400583184444265142602401204011940154200430620089240140200800168001620016003216003240170401811180201100991001008000080000100825183614702452525028003515270249251045378250939205514011151170160040135800108000080000801004013140167401384016440156
1602044012530044010245722324114645059145204014022317408014625241638801028122880000801008000080000400531184439264882202401414013640120200410320086240100200800008000020016000016000040113401481180201100991001008000080000100825063214892443624988005315360250451047118248624186814400051101161140161800028000080000801004017840128401694018040102
160204402003004401021883226811456304911520401342238676649472524427880102829428000080100801278000040053118432166504080240176401984015120202032010824010020080000800002001600001600004018340111118020110099100100800008000010082509361602246011724878004815058246851045798247437160014000051101161140118800028000080000801004014040190401654024340117
1602044013830140410602592271117043990112644014822517584193625241071801028389880000801008000080000400531184393664356902400834011740158200320320100240100200800008000020016000016000040150401401180201100991001008000080000100825142815782435524818003514950246851046198250525138214000051101171140149800028000080000801004013040137401734013540147
1602044016030040410233622279116722624628840118229047535849252440368010284101800008010080000800004005311844800648521024011940142401552006703201562401002008000080000200160000160000401334012711802011009910010080000800001008249536155124612025038006615005249225447358249751234914000051101171140136800028000080000801004013540160401314015340155
1602044019130150510437572232119602587626440148221253637125252410798010282949800008010080000800004005311844992651333024009540173401302002903201122401002008000080000200160000160000401254011311802011009910010080000800001008249623162424671624758003615130247625445818250144193814000051101171140165800028000080000801004016440142401574015640146
160204401843004401039281223911952141692644008022336273885025242986801028123580000801008000080000400531184590465001702401124014140185200530320087240100200800008000020016000016000040167401531180201100991001008000080000100825102820982447125008003814924248425446898251956271014000051101171140120800028000080000801004017540140401594018940155
1602044016330044410383632232119523356112644008722128034356625241113801028277080000801008000080000400531184624064968202401414017940122202120320119240100200800008000020016000016000040132401361180201100991001008000080000100824823614272457025088004315120249225446108249144160614400051101161140154800028000080000801004013940161401214011940112
1602044012630040010491872246119682789102644010122126896405925243867801028140080000801008000080000400531184336064505202401114013340186200040320109240100200800008000020016000016000040156401331180201100991001008000080000100824982818542432424938004515064249925446308247153192214300051101161140133800028000080000801004014640124401524012440197

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5016

retire (01)cycle (02)0305080b191e1f202229373a3e3f4046494f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)606167696d6edispatch stall (70)74scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2cfd0d2d5d6dadbddinst fetch restart (de)e0? int output thing (e9)ld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600444019730133301034744231011704107432040123225166151151252438248001282448800008001080000800004000811843696654649152400754010240096200240320077240010208000080000201600001600004016340138118002110910108000080000108249820117524796249280049152002484510473982505401662140502053151600171740123800028000080000800104012140131401414012840160
16002440137300220010257782310112483215732040071223950762141252444528001280526800008001080000800004000811844032640285152400934011640116200530320130240010208000080000201600001600004013740144118002110910108000080000108252216946246752470800291519024967464606825214514471405020538160091740120800028000080000800104013540122401144014540133
160024401023002000975656234811456104625240125230737635641252401638001283762800008001080000800004000811844728649458152400754013940159200500320105240010208000080000201600001600004013540119118002110910108000080000108251020164824656249080058153242480510455982513331386142502053171600171740132800028000080000800104011340104401384017240143
160024401343003300105034522461197613279264401592219729831412524428380012835108000080010800008000040008118442246487971524027040152401642006503201402400102080000800002016000016000040169401621180021109101080000800001082510361886245192476800341538424922524582825253919521485020538160081740103800028000080000800104016840125401614014840128
160024401153014400105066922391197666682644012722335066106525243273800128049680000800108000080000400081184458464138415240161401404013120056032013924001020800008000020160000160000401544016011800211091010800008000010825143217402450825078005115030246825445868251248187514850205317160017840146800028000080000800104012240108401124013940115
1600244012830033301023346231111632360652964012022616734665925244083800128435980000800108000080000400081184434465055315240100401004014920058032009824001020800008000020160000160000401364016311800211091010800008000010825062716292452142488800411515924845104659825255013481435020539160017840095800028000080000800104013940083401324018240125
16002440126301333010347742274114645326252401282290649605312524079780012800368000080010800008000040008118442726424011524009940168401172005703201042400102080000800002016000016000040154401301180021109101080000800001082513249332478824848002914960248050845838253365153714350205418160091740123800028000080000800104016740125401574013540150
16002440161301300010242492317114564899520401292262523578632524291480012810978000080010800008000040008118435766487391524009240116400992007003201282400102080000800002016000016000040123401551180021109101080000800001082502241687246418248280034150702492508467382512311366143502054171600171740139800028000080000800104017840130401854017740074
160024401073013330103026522461196098252644011622055186336525243566800128243680000800108000080000400081184362465161415240064401144015020066032011624001020800008000020160000160000401514010211800211091010800008000010825062314262480224938005215476246825446258250445162614650205417160018740110800028000080000800104012640114401304013740156
160024401473004400105546222521196843008264401042219691659432524129980012832678000080010800008000040008118436486483511524014640214401552006703201042400102080000800002016000016000040140401291180021109101080000800001082512278932453625008005515200249251045388250946109314050205461600171740185800028000080000800104013840123401294015340125