Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

STR (pre-index, D)

Test 1: uops

Code:

  str d0, [x6, #0x10]!

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01)cycle (02)0305080b1e1f20223a3e3f4046494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op ld/st (7d)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst fp/simd store (99)inst ldst (9b)a0a1a2a3a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)eald/st retires (ed)gpr retires (ef)f5f6f7f8fd
10051040811101232113010257135252000100010001000100050762458241015104010408243898200010002000104010401110011000100010409353221007025016710167277073216221037100001000100010411041104110411041
1004104081110916119010258344252000100010001000100050762458241015104010408243898200010002000104010401110011000100010171915226100800120710007237173216221037100001000100010411041104110411041
1004104081110916119010258314252000100010001000100050754458241015104010408243898200010002000104010401110011000100010209271171007012481010247157273216221037100001000100010411041104110411041
100410407100090012010258225252000100010001000100050754458241015104010408243898200010002000104010401110011000100010349395101010114618710167197273216221037100001000100010411041104110411041
1004104081110916270102583152520001000100010001000507544582410151040104082438982000100020001040104011100110001000102473932610070224161010167237273216221037100001000100010411041104110411041
10041040811109161140102551332520001000100010001000507544582410151040104082438982000100020001040104011100110001000101971102510070126161010167237373216221037100001000100010411041104110411041
100410408110090014010256105252000100010001000100050754458241015104010408243898200010002000104010401110011000100010348233371011011716710007237173216221037100001000100010411041104110411041
100410408111091611301025830325200010001000100010005075445824101510401040824389820001000200010401040111001100010001025823291007012822710187317173216221037100001000100010411041104110411041
1004104071100916114010257334252000100010001000100050754458241015104010408243898200010002000104010401110011000100010238193361007022618710167357273216221037100001000100010411041104110411041
10041040811101016114010258224252000100010001000100050754458241015104010408243898200010002000104010401110011000100010248193171010002418710187437073216221037100001000100010411041104110411041

Test 2: Latency 3->3

Code:

  str d0, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire (01)cycle (02)030508090b18191e1f2022293a3c3e3f404446494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)c2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
102141004075000000105098322561163230732100252244023821525252010010100100001010010000522165468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124770169714820148210994149102461504587125133069007101171110037100003431110000101001004110041100411004110041
102041004075000000104286123081148810944100252240017925140252010010100100001010010000522151468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124690159214910155611008149802505504597125042968107101171110037100002268110000101001004110041100411004110041
102041004075010000102937623011163221944100252284024320628252010010100100001010010000522173468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124730162514870152810977148002497504565125192878707101171110037100001414110000101001004110041100411004110041
102041004075000000104435622701149630840100252240021425227252010010100100001010010000522157468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124730161414630146710996149802485504558125123466707101171110037100003914110000101001004110041100411004110041
102041004075000000103804423011171220780100252267024023932252010010100100001010010000522165468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124850164115210151111017149202497504530125182765407101171110037100001492110000101001004110041100411004110041
102041004075000000102934322631166430944100252227018320514252010010100100001010010000522141468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124850158514970147911052150602481504505125123369337261171110037100001746110000101001004110041100411004110041
102041004075100100106086222681170468724100252243021117937252010010100100001010010000522125468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124644165014960150510960149202473504509125063268917101171110037100002463110000101001004110041100411004110041
102041004075101000104465122611169660712100252229019718623252010010100100001010010000522157468824110017100401004086743874720100200100002002000010040100401110201100991001001000010000100124814160414940151910980150402497504561125022268317101171110037100002797110000101001004110041100411004110041
102041004075110000104105122751172040716100252229020723629252010010100100001010010000522157468824010017100401004086743874720100200100002002000010040100401110201100991001001000010000100124774168114940154011023150702493504638125093161607101171110037100003855110000101001004110041100411004110041
1020410040751001001047640228211712427241002522430215204272520100101001000010100100005221334688240100171004010040867438747201002001000020020000100401004011102011009910010010000100001001248541521147501531110181510024815045981251130648071011711100371000031210110000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire (01)cycle (02)030508090b1e1f2022293a3c3e3f404446494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)eaebld/st retires (ed)gpr retires (ef)f5f6f7f8fd
10034100407622021024570226411488150936100252267027225637252001010010100001001010000521097468824110022100401004086963877020010201000020200001004010040111002110910101000010000101247718160415464157610964154622481504541124904477800640516551003710000421310000100101004110041100411004110041
10024100407512021020070226711616110720100252239025724248252001010010100001001010000521073468824110022100401004086963877020010201000020200001004010040111002110910101000010000101249718170915294157210936152202501504675124953074173640516451003710000293310000100101004110041100411004110041
1002410040753330100868123021149660720100252216025923435252001010010100001001010000521137468824110022100401004086963877020010201000020200001004010040111002110910101000010000101247915162715399156510945150202465504536124873579002640616351003710000285310000100101004110041100411004110041
1002410040752200101559522951154480760100252218022920863252001010010100001001010000521065468824110022100401004086963877020010201000020200001004010040111002110910101000010000101249316166915380156510924150102485504626124963374000640516551003710000299210000100101004110041100411004110041
10024100407510011023068230511504110880100252208025023130252001010010100001001010000520809468824110022100401004086963877020010201000020200001004010040111002110910101000010000101249624153615170154010938152902466504529124944674801640416441003710000195110000100101004110041100411004110041
100241004075201010338982275117121107241002522360244263582520010100101000010010100005211054688241100221004010040869638770200102010000202000010040100401110021109101010000100001012480161510155731567109681536424655045991250045698140640316551003710000352210000100101004110041100411004110041
1002410040753303102756722681169620716100252208021320652252001010010100001001010000521001468824010022100401004086963877020010201000020200001004010040111002110910101000010000101250421159515310153210944151262478504564125165170502640516431003710000272910000100101004110041100411004110041
10024100407520001020976224311704807161002522130187188352520010100101000010010100005210654688241100221004010040869638770200102010000202000010040100401110021109101010000100001012475111540155901560109571520024815045761249334754143640416541003710000206310000100101004110041100411004110041
10024100407533021018253226111712120724100252229024021132252001010010100001001010000521041468824110022100401004086963877020010201000020200001004010040111002110910101000010000101249520155115682160810971154902470504652125003779570640416441003710000221310000100101004110041100411004110041
10024100407522001019468226511704100724100252235023925540252001010010100001001010000521089468824110022100401004086963877020010201000020200001004010040111002110910101000010000101248119158115400158910947154302481504600125053567770640416341003710000251610000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  str d0, [x6, #0x10]!
  str d0, [x7, #0x10]!
  str d0, [x8, #0x10]!
  str d0, [x9, #0x10]!
  str d0, [x10, #0x10]!
  str d0, [x11, #0x10]!
  str d0, [x12, #0x10]!
  str d0, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5018

retire (01)cycle (02)0305080b18191e1f2022293a3e3f4046494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)6067696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2branch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? int output thing (e9)ld/st retires (ed)gpr retires (ef)f5f6f7f8fd
8021440155300300001048540224611960426440126220535048994251601028010280000801008000040053118440120240177401444015930102330116160100200800002001600004014640125118020110099100100800008000010082506321615246972500800381528025082544592825104323441400511021622401098000280000801004019940140401304011040172
8020440112301505001040113522311196810264401512219807724492516010280102800008010080000400531184362802401274015040118300823301081601002008000020016000040135401751180201100991001008000080000100825113617792449182457800541508024842544636825115918001450511021622402018000280000801004023440185402104021240175
8020440176300500001027296224611952926440132221263377357251601028010280000801008000040053118450441240140401604018330082330158160100200800002001600004017840195118020110099100100800008000010082511411851244352477800621510524842524607825097919611400511021622401518000280000801004014040138401384010840174
80204401643016000010431572260119521526440176221987799028226116202381233807108110881080405642189188812412034145441372309724431137162096202812182001616924141541420101802011009910010080000800001008285326251624052612461806011544024982541224983052271684000528429622410138112280000801004135341474415044064941408
8020441467311300101011532969224111696952041566223792610258042191615018123080660812608118840337618661241240503415034145131075443129616230720281218200161452414444157311180201100991001008000080000100831532416912441310245680329155102472921293682758711860000530421622401628000280000801004021540161401434013940143
80204401423013330010383432284116721356840100224986169485251601028010280000801008000040053518463601240147401384017130072330113160100200800002001600004012240161118020110099100100800008000010082487181676246314247880038150302484254469982512531655060511021622401478000280000801004011140184401204014640096
802044011930133300103924722701171295204009822515609175025160102801028000080100800004005351843888124010240161401023003833007616010020080000200160000401634013811802011009910010080000800001008247130211824266248380043152302476254457782513411621000511021622401398000280000801004015740132401914013340133
802044010930130000104973622841171265004017222445737216125160102801028000080100800004005351845472124015840133401523003833008516010020080000200160000400874013611802011009910010080000800001008249235250124348250880042152702507254463982503432291000511021622401548000280000801004017140148401064015940149
802044010130050000105517022391195213264401242212633660532516010280102800008010080000400535184525612401444015240122301913300901601002008000020016000040181401441180201100991001008000080000100824973518292421102485800401516524682524570825054818671400511021622401148000280000801004016940161401904016240135
8020440129300500001045261223911968142644010922188258554825160102801028000080100800004005351843384124016840167402783005733008516010020080000200160000401314010911802011009910010080000800001008250028220724315246080049152242492254461582512461613000511021622401288000280000801004018240123401554014240134

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5024

retire (01)cycle (02)030508090b18191e1f20222324293a3e3f4046494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)5f6067696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd store (99)inst ldst (9b)9fa0a1a2a3a4a6a7a8a9aaabacafbcdcache store miss (c0)dtlb miss (c1)c2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)ld/st retires (ed)gpr retires (ef)f5f6f7f8fd
8003440162300101100100474522971001448510040137225940554333251600128001280000800108000040008718441040124011240096401323012133008716001020800002016000040133401191180021109101080000800001082480013002462524948003515180250498645478250321109300502081742401718000280000800104009140104401244013840109
8002440103301000000100083822891001248722440075227241655257461600128001280000800108010840064718469000124009240148401013009433009016001020800002016000040120401091180021109101080000800001082486019952472424858002715180249884845218252823118200502041747401178000280000800104016240106400794013440150
8002440131300000000972355232920015043220400902276390549372516001280012800008001080000400087184307201240093401094014730057330120160010208000020160000401474010511800211091010800008000010825060101124161524778003115210249492245868250826220501502041724401338000280000800104015040129401294013340077
800244011230001000099661922991001264526040098226853555851251600128001280000800108000040008718470200124010140126401543006733006916001020800002016000040111401501180021109101080000800001083146013062479424808004015160247059868318310228152500502041742401348000280000800104017640208402494021940191
80024403003011100001022718222920101280222040240225656554842251600128001280000800108000040008718500320124028040237403583019533012816001020800002016000040285402151180021109101080000800001082468922642425824708008615360246049447098255367175001502051745402448000280000800104014940163401634010940167
80024401023011101001005053231201012729336402302252469641942516001280012800008001080000400087184813601240151402904022830140330241160010208000020160000401714024711800211091010800008000010824760154424288242780119150502488976474282552119192800502041742402448000280000800104028640185401824023440161
80024402123021110009942187229201015441317640115230859974435251600128001280000800108000040008718760000024012040199401623009533010216001020800002016000040186401502180021109101080000800001082490416962393824828013315170248497646728255164161100502021724402248000280000800104025040188402174014440168
8002440197302020000102212592300010150415260401632273525614542516001280012800008001080000400087184698411240171402254026930069330221160010208000020160000402784025511800211091010800008000010824788199324146246180108152302472478474982548116102400502021624402648000280000800104024640241401644021140131
800244017731311100099759022670011496202684017522745498103225160012800128000080010800004000871847824012402214026640213301913301831600102080000201600004016140091118002110910108000080000108248616238224270247980100151302474756476182590102181004502031653402758000280000800104024540139401984020740261
800244019830120000097231122309001150413212402302241539695562516001280012800008001080000400087185270801240264402484022030174330219160010208000020160000402354017411800211091010800008000010824782024542428525078011315014248897646358255460205900502041742401518000280000800104025640201405174022040219