Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, S)

Test 1: uops

Code:

  stp s0, s1, [x6], #0x10
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 1e | 1f | 20 | 22 | 3a | 3e | 3f | 40 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd store (99) | inst ldst (9b) | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | dtlb miss (c1) | c2 | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
90061166900004003011510032530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010315520019100901190610140280051202162211631000010001000100011671167116711671167
900411669001043810011510002530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010000160010000000010000160051222162211631000010001000100011671167116711671167
900411668111010003011517002530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010191820013100901160610006206151102162211631000110001000100011671167116711671167
90041166910168141170115110322530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010206283201010011924610346366051202162211631000110001000100011671167116711671167
9004116691010814113411514002530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010077280010060100610006326051101162211631000210001000100011671167116711671167
90041166911109003011510712530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010140204221004002316010190200051202162211631000010001000100011671167116711671167
900411668000022011001151412253000100010001000100010001000599910875800001145116611663243000100010002000200011661166118001100010001012028028100000196010320160051101162211631000010001000100011671167116711671167
9004116690000314110011514412530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010041216001001002512010250400051201162211631000010001000100011671167116711671167
90041166900064120110115110002530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140201191000002314010220240051102162211631000010001000100011671167116711671167
900411669111091411901151100025300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000100003209100300250010000240051202162211631000010001000100011671167116711671167

Test 2: Latency 3->3 (operand 3, the post-indexed base register x6, feeding operand 3 of the next instruction)

Code:

  stp s0, s1, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 19 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2022410040766220103447822111220883302121002521582242252725303251010010117100001010010000100005439514688488042910021100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012491122463152701714109561526224811504612124967484304710116111003710000426131000010000101001004110041100411004110041
20204100407542201036577220412216936021210025216224623861253017910100100751000010100100001000054396646884880297100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124731414431686016931083814856248150458012481846497671011611100371000021711000010000101001004110041100411004110041
202041004075400010599162220312208907021210025213425022433253025310100101061000010100100001000054390346884880563100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124871915261568016731081815193247350462012491427697371011611100371000039121000010000101001004110041100411004110041
202041004075530010422642218422082326021210025216526923542253022710100101121000010100100001000054393146884880467100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124912515691632116001083014693247350457112501457017071011611100371000027771000010000101001004110041100411004110041
202041004075430010287772196122167390212100252180213229992530234101001010010000101001000010000543916468848807021002110040100407424376643010020010000100002002000020000100401004011102011009910010010000100001001246419156916670168010910149832488504558124834878173710116111003710000329121000010000101001004110041100411004110041
20204100407643001023387219712208845021210147215624326651253014610100101581000010100100001010854393846884880254100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124871814531675017021083214993248050457312486507437071011611101411009526251000010000101001004110041100411004110041
2020410040754330104498122041220893602121015021582412243825302461010010170100001010010000100005438884688488030810021100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012485211563165901708108331514624895045671249643170276710116111003710000388131000010000101001004110041100411004110041
202041004075433110458862196122082335080010025215923120230533020410194100871000010100100001000054389046884880469100211004010040742437498304402001000010000200200002000010040100401110201100991001001000010000100124792513681682016741083114946248050454312496447427371011611100371000031621000010000101001004110041100411004110041
2020410040753000102518122031221610711121210025216524922038253017510100101131000010100100001000054395046884880521100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124871914971679016711083515183244150458912485477737371011611100371000025131000010000101001004110041100411004110041
20204100407540001047072221712208296021210025216524625031253018310100100861000010100100001000054395946884880572100211004010040742437498301002001000010000204200002000010162100401110201100991001001000010000100124832215921663017001082015533247350451912494507407071011611100371000044391000010000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200441004076220106326322251196036510046410025218625323334253054210010102031000010010100001000054298146884880021110016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124691216451588015861096815171246950460012497238130640216331003710000313311000010000100101004110041100411004110041
200241004075222105605622321194499046410025219321720752253003310010103031000010010100001000054338546884880663010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124871216451555416001090215126246950451312497337264640316331003710000371911000010000100101004110041100411004110041
20024100407520210503652225119522328046410025218621720643253077310010105411000010010100001000054338546884881285010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124611215741550015761094414940248550452512477237804640316331003710000472211000010000100101004110041100411004110041
2002410040752221061779224611968265604641002521852042283125300171001010229100001001010000100005433934688488070501001610040100407446375203001020100001000020200002000010040101631110021109101010000100001012477916891469015181099015270246150455912496317481640316331003710092224311000010000100101004110041100411004110041
20024100407510010530582232119685448046410025219324121127253003510010102281000010010100001000054328546884880063010016100401004074463752130010201000010000202000020000100401004011100211091010100001000010124871215461554015981091014762246950457612486287254640316321003710000272311000010000100101004110041100411004110041
20024100407520210566522232119525258046410025218620526620253025110010101791000010010100001000054339746884880675010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124711416431525015801091314952247750458912484317820640216331003710000350411000010000100101004110041100411004110041
20024100407620010509742225119606607046410025217923319925253019010010103281000010010100001000054335346884880489110016101671004074463752030010201000010000202000020000100401004011100211091010100001000010124611416541585015671095015314247750461712491257600640216331003710000257411000010000100101004110041100411004110041
20024100407522210599682232119525469046410025218620123616253036610010102131000010010100001000054338946884881161010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124771217001526115771090415070246950447112480318280640316321003710000327411000010000100101004110041100411004110041
2002410040752001072560222511952919246410025218626319433253003310010104311000010010100001000054340546884880447110016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124691215761577015461093314960247750455512486307080640316331003710000245311000010000100101004110041100411004110041
200241004075200105901282232119604888046410025220023325233253024510010102131000010010100001000054338546884880060010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124871216371540015621092614912246950451012496298104640216331003710000299211000010000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  stp s0, s1, [x6], #0x10
  stp s0, s1, [x7], #0x10
  stp s0, s1, [x8], #0x10
  stp s0, s1, [x9], #0x10
  stp s0, s1, [x10], #0x10
  stp s0, s1, [x11], #0x10
  stp s0, s1, [x12], #0x10
  stp s0, s1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5016

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 67 | 69 | 6b | 6d | 6e | dispatch stall (70) | 74 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | c2 | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602244015530000000102122723001148831306208400722293534588122524172680110812898000080112800168000840059918450766441172401200400894010120042062010524014020080016800162001600321600324011440139118020110099100100800008000010082482015892461624748005015610249051044908250441759011151171640158800148000080000801004014140124400984012240157
160204401413010000099756223271149624447156401182266436444382524174780114829428000080112800128000840058318424366504572401270401144014220053062006224013120080016800162001600321600324013340157118020110099100100800008000010082486013002447525018000815340250451045718253540923011151171640118800148000080000801004011440137401644013340176
1602044014230000000999334232511488138952364008222576483673225241693801108115380000801128001680008400599184528764668624007804011540115200090620098240140200800168001620016003216003240143401281180201100991001008000080000100824680211324241248280051153702480976463182524251525011151171640143800108000080000801004008640126401394010640138
16020440168300000001021230234211144108961604011022593713243925241809801148312380000801128001280007400583184360764908624013904012940086200140620091240140200800168001620016003216003240122400941180201100991001008000080000100824940169624763247480024152402474976459282522241823011151171640109800148000080000801004013540142401564013640109
16020440110301000001026362229511496297362284009722474795864125242945801108297980000801168001680008400599184399164869324011804008740154200660620062240130200800168001620016003216003240122401301180201100991001008000080000100824693138424717245480044150002484976468782485311543011151171640143800108000080000801004013140132401314014940144
16020440176300100001004121228811472289431564010323344387726325241356801108124880000801168001280008400583184279165098424008404010240154200630620060240131200800168001620016003216003240123401131180201100991001008000080000100824720159524634247780039153502494976452882527161850011151171640156800108000080000801004012740101401204011740186
16020440154300000009876322327111601467520040135226651361747252413958011082968800008011680016800074005831844135650993240074040111401582006306200942401312008001680016200160032160032401194015111802011009910010080000800001008248601328243512244980030150802504954461982512191556011151171640105800108000080000801004010640102401024015640133
160204401073010000010215352307115043151610440119226132340347252424768011081825800008011280012800074005831844087647568240135040100401662007406200922401402008001680016200160032160032401464012411802011009910010080000800001008246841691246212245380024150602476510464882511271481011151171640133800148000080000801004015340134401394012840126
1602044013130100000996637233911504152442644010123056842574425242998801108150580000801128001280007400583184396264618824006904017440113200440620036240130200800168001620016003216003240134401101180201100991001008000080000100824880161124624248380026152002476510466682508271883011151171640110800148000080000801004014240126401554012640179
16020440119301000001024540229511504150142284010523065855005025241704801108153680000801128001280006400599184173564437424006604014440100200110620101240140200800168001620016003216003240143401791180201100991001008000080000100824960187324493248080034152102488510459982523381258011151171640135800108000080000801004019740173401194013040110

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5025

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600444012330000010032602317212562300548040153227057858848252417368001280640800008001080000800004000811846480649587240264401764015520098320182240010208000080000201600001600004016940264118002110910108000080000108250422245244702468800911517125025164651825656210590502003071604440205800028000080000800104023540117401594011640211
16002440200301100987611823772129621381449240409225862065716425243710802448180480000802428000080000400081186034065209224014040191402302011725201962400102080240800002016000016027640288404861180021109101080000800001082472413232444102485800661512024864904782825828814380502000071606640268800028000080000800104021540174402284011040155
160024402513011001015511123412126428288256402262307741854372524395380012807988000080010800008000040008118449446477722401974018040173201883201472400102080000800002016000016000040216401461180021109101080000800001082470415892444225068007015120247658846758257912213180502000041605540180800028000080000800104014040141402094016640184
16002440202301110102452142368212482974746440169227291048636252414008001281426800008001080000800004000811846048647256240208401944019020118320211240010208000080000201600001600004018040130118002110910108000080000108248921445247602484800571528024855644628825488417411502000061604540134800028000080000800104019040264401774023540292
16002440182301110998712222732124814262476402602296646700292524328280012831338000080010800008000040008118474886427462401754018440203201314620252240010208000080000201600001600004015540384118002110910108000080000108249821666245402477800891518024804964683825597513851502000041606440207800028000080000800104013840136402034014140224
1600244017330110099841282358214801232842040114227174942572252428288001281332800008001080000800004006611844032647968240132402084020920217320181240010208000080000201600001600004020240149118002110910108000080000108247321771239672500801061531025024904789825689910411502000041504440199800028000080000800104012840162401734014540158
1600244013730110199541842296212963022126084023522836367181625241145800128334880000800108000080000400081184868864471824014340186402102024532011224001020800008000020160000160000402044017411800211091010800008000010824872164024901224638006915191246871847258257010718162502000041606540154800028000080000800104021640181401724019440152
16002440276301111101851742299312328598500402972281635783362524331880012831768000080010800008000040008118455206495842400874022840216201113201692400102080000800002016000016000040242402061180021109101080000800001082468216032409192490801401529124675584701825338517600502000031605540220800028000080000800104016540142402004013240164
16002440138301110997512222932125618007352401552292750440192524103580012837258000080010800008000040008118482086435352401124017740216201323202072400102080000800002016000016000040358402621180021109101080000800001082488318492469132477800891505024845164730825807112640502050041605540169800028000080000800104017140200401314009440144
1600244020630200010317233229621272177362164013823045557333725242668800128029880000800108000080000400081184484864868224020240105401692014432018924001020800008000020160000160000401564014111800211091010800008000010824750132524471024948011015340248499247128255911314720502000051604440209800028000080000800104025440290402144015540174