Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, S)

Test 1: uops

Code:

  stp s0, s1, [x6], #0x10
  nop ; nop ; nop ; nop ; nop ; nop ; nop

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 3.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 3a | 3e | 3f | 40 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
90061166900004003011510032530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010315520019100901190610140280051202162211631000010001000100011671167116711671167
900411669001043810011510002530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010000160010000000010000160051222162211631000010001000100011671167116711671167
900411668111010003011517002530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010191820013100901160610006206151102162211631000110001000100011671167116711671167
90041166910168141170115110322530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010206283201010011924610346366051202162211631000110001000100011671167116711671167
9004116691010814113411514002530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010077280010060100610006326051101162211631000210001000100011671167116711671167
90041166911109003011510712530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010140204221004002316010190200051202162211631000010001000100011671167116711671167
900411668000022011001151412253000100010001000100010001000599910875800001145116611663243000100010002000200011661166118001100010001012028028100000196010320160051101162211631000010001000100011671167116711671167
9004116690000314110011514412530001000100010001000100010005999108758000011451166116632430001000100020002000116611661180011000100010041216001001002512010250400051201162211631000010001000100011671167116711671167
90041166900064120110115110002530001000100010001000100010005999108758000111451166116632430001000100020002000116611661180011000100010140201191000002314010220240051102162211631000010001000100011671167116711671167
900411669111091411901151100025300010001000100010001000100059991087580001114511661166324300010001000200020001166116611800110001000100003209100300250010000240051202162211631000010001000100011671167116711671167

Test 2: Latency 3->3

Code:

  stp s0, s1, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2022410040766220103447822111220883302121002521582242252725303251010010117100001010010000100005439514688488042910021100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012491122463152701714109561526224811504612124967484304710116111003710000426131000010000101001004110041100411004110041
20204100407542201036577220412216936021210025216224623861253017910100100751000010100100001000054396646884880297100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124731414431686016931083814856248150458012481846497671011611100371000021711000010000101001004110041100411004110041
202041004075400010599162220312208907021210025213425022433253025310100101061000010100100001000054390346884880563100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124871915261568016731081815193247350462012491427697371011611100371000039121000010000101001004110041100411004110041
202041004075530010422642218422082326021210025216526923542253022710100101121000010100100001000054393146884880467100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124912515691632116001083014693247350457112501457017071011611100371000027771000010000101001004110041100411004110041
202041004075430010287772196122167390212100252180213229992530234101001010010000101001000010000543916468848807021002110040100407424376643010020010000100002002000020000100401004011102011009910010010000100001001246419156916670168010910149832488504558124834878173710116111003710000329121000010000101001004110041100411004110041
20204100407643001023387219712208845021210147215624326651253014610100101581000010100100001010854393846884880254100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124871814531675017021083214993248050457312486507437071011611101411009526251000010000101001004110041100411004110041
2020410040754330104498122041220893602121015021582412243825302461010010170100001010010000100005438884688488030810021100401004074243749830100200100001000020020000200001004010040111020110099100100100001000010012485211563165901708108331514624895045671249643170276710116111003710000388131000010000101001004110041100411004110041
202041004075433110458862196122082335080010025215923120230533020410194100871000010100100001000054389046884880469100211004010040742437498304402001000010000200200002000010040100401110201100991001001000010000100124792513681682016741083114946248050454312496447427371011611100371000031621000010000101001004110041100411004110041
2020410040753000102518122031221610711121210025216524922038253017510100101131000010100100001000054395046884880521100211004010040742437498301002001000010000200200002000010040100401110201100991001001000010000100124871914971679016711083515183244150458912485477737371011611100371000025131000010000101001004110041100411004110041
20204100407540001047072221712208296021210025216524625031253018310100100861000010100100001000054395946884880572100211004010040742437498301002001000010000204200002000010162100401110201100991001001000010000100124832215921663017001082015533247350451912494507407071011611100371000044391000010000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3c | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200441004076220106326322251196036510046410025218625323334253054210010102031000010010100001000054298146884880021110016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124691216451588015861096815171246950460012497238130640216331003710000313311000010000100101004110041100411004110041
200241004075222105605622321194499046410025219321720752253003310010103031000010010100001000054338546884880663010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124871216451555416001090215126246950451312497337264640316331003710000371911000010000100101004110041100411004110041
20024100407520210503652225119522328046410025218621720643253077310010105411000010010100001000054338546884881285010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124611215741550015761094414940248550452512477237804640316331003710000472211000010000100101004110041100411004110041
2002410040752221061779224611968265604641002521852042283125300171001010229100001001010000100005433934688488070501001610040100407446375203001020100001000020200002000010040101631110021109101010000100001012477916891469015181099015270246150455912496317481640316331003710092224311000010000100101004110041100411004110041
20024100407510010530582232119685448046410025219324121127253003510010102281000010010100001000054328546884880063010016100401004074463752130010201000010000202000020000100401004011100211091010100001000010124871215461554015981091014762246950457612486287254640316321003710000272311000010000100101004110041100411004110041
20024100407520210566522232119525258046410025218620526620253025110010101791000010010100001000054339746884880675010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124711416431525015801091314952247750458912484317820640216331003710000350411000010000100101004110041100411004110041
20024100407620010509742225119606607046410025217923319925253019010010103281000010010100001000054335346884880489110016101671004074463752030010201000010000202000020000100401004011100211091010100001000010124611416541585015671095015314247750461712491257600640216331003710000257411000010000100101004110041100411004110041
20024100407522210599682232119525469046410025218620123616253036610010102131000010010100001000054338946884881161010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124771217001526115771090415070246950447112480318280640316321003710000327411000010000100101004110041100411004110041
2002410040752001072560222511952919246410025218626319433253003310010104311000010010100001000054340546884880447110016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124691215761577015461093314960247750455512486307080640316331003710000245311000010000100101004110041100411004110041
200241004075200105901282232119604888046410025220023325233253024510010102131000010010100001000054338546884880060010016100401004074463752030010201000010000202000020000100401004011100211091010100001000010124871216371540015621092614912246950451012496298104640216331003710000299211000010000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  stp s0, s1, [x6], #0x10
  stp s0, s1, [x7], #0x10
  stp s0, s1, [x8], #0x10
  stp s0, s1, [x9], #0x10
  stp s0, s1, [x10], #0x10
  stp s0, s1, [x11], #0x10
  stp s0, s1, [x12], #0x10
  stp s0, s1, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5016

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602244015530000000102122723001148831306208400722293534588122524172680110812898000080112800168000840059918450766441172401200400894010120042062010524014020080016800162001600321600324011440139118020110099100100800008000010082482015892461624748005015610249051044908250441759011151171640158800148000080000801004014140124400984012240157
160204401413010000099756223271149624447156401182266436444382524174780114829428000080112800128000840058318424366504572401270401144014220053062006224013120080016800162001600321600324013340157118020110099100100800008000010082486013002447525018000815340250451045718253540923011151171640118800148000080000801004011440137401644013340176
1602044014230000000999334232511488138952364008222576483673225241693801108115380000801128001680008400599184528764668624007804011540115200090620098240140200800168001620016003216003240143401281180201100991001008000080000100824680211324241248280051153702480976463182524251525011151171640143800108000080000801004008640126401394010640138
16020440168300000001021230234211144108961604011022593713243925241809801148312380000801128001280007400583184360764908624013904012940086200140620091240140200800168001620016003216003240122400941180201100991001008000080000100824940169624763247480024152402474976459282522241823011151171640109800148000080000801004013540142401564013640109
16020440110301000001026362229511496297362284009722474795864125242945801108297980000801168001680008400599184399164869324011804008740154200660620062240130200800168001620016003216003240122401301180201100991001008000080000100824693138424717245480044150002484976468782485311543011151171640143800108000080000801004013140132401314014940144
16020440176300100001004121228811472289431564010323344387726325241356801108124880000801168001280008400583184279165098424008404010240154200630620060240131200800168001620016003216003240123401131180201100991001008000080000100824720159524634247780039153502494976452882527161850011151171640156800108000080000801004012740101401204011740186
16020440154300000009876322327111601467520040135226651361747252413958011082968800008011680016800074005831844135650993240074040111401582006306200942401312008001680016200160032160032401194015111802011009910010080000800001008248601328243512244980030150802504954461982512191556011151171640105800108000080000801004010640102401024015640133
160204401073010000010215352307115043151610440119226132340347252424768011081825800008011280012800074005831844087647568240135040100401662007406200922401402008001680016200160032160032401464012411802011009910010080000800001008246841691246212245380024150602476510464882511271481011151171640133800148000080000801004015340134401394012840126
1602044013130100000996637233911504152442644010123056842574425242998801108150580000801128001280007400583184396264618824006904017440113200440620036240130200800168001620016003216003240134401101180201100991001008000080000100824880161124624248380026152002476510466682508271883011151171640110800148000080000801004014240126401554012640179
16020440119301000001024540229511504150142284010523065855005025241704801108153680000801128001280006400599184173564437424006604014440100200110620101240140200800168001620016003216003240143401791180201100991001008000080000100824960187324493248080034152102488510459982523381258011151171640135800108000080000801004019740173401194013040110

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5025

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 37 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600444012330000010032602317212562300548040153227057858848252417368001280640800008001080000800004000811846480649587240264401764015520098320182240010208000080000201600001600004016940264118002110910108000080000108250422245244702468800911517125025164651825656210590502003071604440205800028000080000800104023540117401594011640211
16002440200301100987611823772129621381449240409225862065716425243710802448180480000802428000080000400081186034065209224014040191402302011725201962400102080240800002016000016027640288404861180021109101080000800001082472413232444102485800661512024864904782825828814380502000071606640268800028000080000800104021540174402284011040155
160024402513011001015511123412126428288256402262307741854372524395380012807988000080010800008000040008118449446477722401974018040173201883201472400102080000800002016000016000040216401461180021109101080000800001082470415892444225068007015120247658846758257912213180502000041605540180800028000080000800104014040141402094016640184
16002440202301110102452142368212482974746440169227291048636252414008001281426800008001080000800004000811846048647256240208401944019020118320211240010208000080000201600001600004018040130118002110910108000080000108248921445247602484800571528024855644628825488417411502000061604540134800028000080000800104019040264401774023540292
16002440182301110998712222732124814262476402602296646700292524328280012831338000080010800008000040008118474886427462401754018440203201314620252240010208000080000201600001600004015540384118002110910108000080000108249821666245402477800891518024804964683825597513851502000041606440207800028000080000800104013840136402034014140224
1600244017330110099841282358214801232842040114227174942572252428288001281332800008001080000800004006611844032647968240132402084020920217320181240010208000080000201600001600004020240149118002110910108000080000108247321771239672500801061531025024904789825689910411502000041504440199800028000080000800104012840162401734014540158
1600244013730110199541842296212963022126084023522836367181625241145800128334880000800108000080000400081184868864471824014340186402102024532011224001020800008000020160000160000402044017411800211091010800008000010824872164024901224638006915191246871847258257010718162502000041606540154800028000080000800104021640181401724019440152
16002440276301111101851742299312328598500402972281635783362524331880012831768000080010800008000040008118455206495842400874022840216201113201692400102080000800002016000016000040242402061180021109101080000800001082468216032409192490801401529124675584701825338517600502000031605540220800028000080000800104016540142402004013240164
16002440138301110997512222932125618007352401552292750440192524103580012837258000080010800008000040008118482086435352401124017740216201323202072400102080000800002016000016000040358402621180021109101080000800001082488318492469132477800891505024845164730825807112640502050041605540169800028000080000800104017140200401314009440144
1600244020630200010317233229621272177362164013823045557333725242668800128029880000800108000080000400081184484864868224020240105401692014432018924001020800008000020160000160000401564014111800211091010800008000010824750132524471024948011015340248499247128255911314720502000051604440209800028000080000800104025440290402144015540174