Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

STR (pre-index, Q)

Test 1: uops

Code:

  str q0, [x6, #0x10]!

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 08 | 0b | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd store (99) | inst ldst (9b) | a0 | a1 | a2 | a3 | a6 | a7 | a8 | aa | ab | ac | af | dcache store miss (c0) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100510408000330116121610258112252000100010001000100050778458241101510401040824389820001000200010401040111001100010001016019243100003622010161973116111037100001000100010411041104110411041
1004104080002161010121025011225200010001000100010005077845824110151040104082438982000100020001040104011100110001000102401519100002430010161173116111037100001000100010411041104110411041
1004104071102161013010258032252000100010001000100050778458241101510401040824389820001000200010401040111001100010001024019124100402822310161573116111037100001000100010411092104110411041
1004104070063181012010258032252000100010001000100050778458241101510401040824389820001000200010401040111001100010001016019328100302418010241573116111037100001000100010411041104110411041
1004104080005241018010258011252000100010001000100050778458241101510401040824389820001000200010401040111001100010001000031001000000010003173116111037100001000100010411041104110411041
1004104080002161011010258211252000100010001000100050778458241101510401040824389820001000200010401040111001100010001028023312100303224010162373116111037100001000100010411041104110411041
1004104080004161012010251615225200010001000100010005077845824110151040104082438982000100020001040104011100110001000101601519100002516010161973116111037100001000100010411041104110411041
100410408000324101201025810225200010001000100010005077845824110151040104082438982000100020001040104011100110001000102402724610030160010241973116111037100001000100010411041104110411041
100410408000218101201025145412520001000100010001000507784582411015104010408243898200010002000104010401110011000100010160232910000160010161973116111037100001000100010411041104110411041
10041040700621610121210250111252000100010001000100050778458241101510401040824389820001000200010401040111001100010001016027128100002430010161573116111037100001000100010411041104110411041

Test 2: Latency 3->3

Code:

  str q0, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 1e | 1f | 20 | 22 | 29 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102141004075550104046723211171222096410025227402022035825201001010010000101001000052212546882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100125175117551394015221108315080247550465512475337647007101171110037100002882110000101001004110041100411004110041
1020410040757701033272226111464181972100252281022518029252010010100100001010010000522007468824100171004010040867438747201002001000020020000100401004011102011009910010010000100001001249843180114340146511024151102494504584124773227467007101171110037100002885110000101001004110041100411004110041
102041004075600103746523041146420096010025226042361923625201001010010000101001000052216546882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100124984617981430014771109915170248250465912487388587607101171110037100002441110000101001004110041100411004110041
102041004075660102907122971165615095610025223301791984325201001010010000101001000052214146882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100125244318571405014461106614926249550458812468378907007101171110037100001802110000101001004110041100411004110041
102041004075600103988623031166419176410025225301881954225201001010010000101001000052214946882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100125004617711394014371107515416248250460212468388357007101171110037100002954110000101001004110041100411004110041
1020410040766601081267229711656220980100252267017618730252010010100100001010010000522093468824100171004010040867438747201002001000020020000100401004011102011009910010010000100001001248146181014260149911073149112247850468512477328007007101171110037100002072110000101001004110041100411004110041
102041004075666103207823051194419178010025226802121953625201001010010000101001000052210946882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100124914817181427014321106415150249150453912479306707007101171110037100003394110000101001004110041100411004110041
102041004075700102997622901198423071210025228202011894525201001010010000101001000052210146882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100125195118201421015311108215227249150452712472369587707101171110037100003835110000101001004110041100411004110041
102041004075707103476723191198425098010025222902091602825201001010010000101001000052211746882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100125044619031424014501104215240249550458612474377357007101171110037100001872110000101001004110041100411004110041
102041004075777105036522811196827194810025226701851843925201001010010000101001000052209346882410017100401004086743874720100200100002002000010040100401110201100991001001000010000100125014617911465014881110215137248750452412468318057007101171110037100001635110000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10034100407500000101049622821166410102410025221402262524025200101001010000100101000052108946882410022100401004086963877020010201000020200001004010040111002110910101000010000101246501593154601579109401508124735045041252639793006403164310037100004917010000100101004110041100411004110041
1002410040750000010149522340112482075210025221502282333425200101001010000100101000052108146882410022100401004086963877020010201000020200001004010040111002110910101000010000101247301606154601551109601506024815045911249834789006404164410037100002693010000100101004110041100411004110041
1002410040751000010224922268116646072410025222202322923025200101001010000100101000052112146882410022100401004086963877020010201000020200001004010040111002110910101000010000101250101683155701559109391545024895046661250346661006404164410037100002434010000100101004110041100411004110041
1002410040750000010035692252114721094810025222302452432925200101001010000100101000052106546882410022100401004086963877020010201000020200001004010040111002110910101000010000101247441625153701586109541524024935046161252946755016403163310037100003686010000100101004110041100411004110041
10024100407500000101797722531144020118810025222602012762825200101001010000100101000052113746882410022100401004086963877020010201000020200001004010040111002110910101000010000101248241578156001574109331501124815045571250434871006404164410037100002542010000100101004110041100411004110041
1002410040751100010215552235117046072410025222102642204225200101001010000100101000052104946882410022100401004086963877020010201000020200001004010040111002110910101000010000101247301546154101580109341519024975045921251929755006403163310037100004041010000100101004110041100411004110041
1002410040761000010341792257116640072410025221602092454125200101001010000100101000052112146882410022100401004086963877020010201000020200001004010040111002110910101000010000101250101763156731585109341533024775046651250939662016725254410115100002943010000100101004110091101921009310091
1002410040750000010563752253117521070410077212502493033346201161003810026101561006952108147101810022101481011386963883520153201000020201601004010191111002110910101000010000101250301587155501579109011471024735046381250240691046404164410037100003725410000100101004110041100411004110041
10024100407500000107045122551162460724100252220021724244252001010010100001001010069519533468824100221004010040869638770200102010160202032010040100401110021109101010000100001012503816081525015771096015280250150458012507377140464041633100371000033111010000100101004110041100411004110041
100241004081120001026662224811704110143210125222202553021311262011810066100761016110208517537473156101411027910343875015877020443201008020204801014110192411002110910101000010000101247971676153901542108991511024535067661250023673006877574410115100272555010000100101019110041100411004110041

Test 3: throughput

Count: 8

Code:

  str q0, [x6, #0x10]!
  str q0, [x7, #0x10]!
  str q0, [x8, #0x10]!
  str q0, [x9, #0x10]!
  str q0, [x10, #0x10]!
  str q0, [x11, #0x10]!
  str q0, [x12, #0x10]!
  str q0, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5021

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8021440182301404102363422461194414264401542219634773542516010280102800008010080000400535184307212401540401484020030084330115160100200800002001600004016440104118020110099100100800008000010082492262147243862478800501526024922544624825015530921400511011611402398000280000801004015540167401754019140143
80204401283015009948662264117041052040147233358988357251601028010280000801008000040053118446601240098040113401893004133019016010020080000200160000401684018811802011009910010080000800001008250634205324131224758010615340248425446388250811423541400511011611401738000280000801004014440200402084020740211
802044015430260099273022461195292644014822125747543625160102801028000080100800004005351843936124013204014040180300293301611601002008000020016000040128401781180201100991001008000080000100824842614072422102471800451557024681164458582490302592000511011611401298000280000801004020640188401174018040193
80204401953014401017063223211968626440132219850772252251601028010280000801008000040053518442480240110040140401453006533013716010020080000200160000401654014511802011009910010080000800001008247914171824226246480049154602484254467082505182151030511011611401858000280000801004016040311402264021840197
802044014830133310143402239119447264401092191446676702516010280102800008010080000400535184424802401570401504015530088330064160100200800002001600004013740182118020110099100100800008000010082467141859244615246580043152002452254456182489342228000511011611401558000280000801004020140122401074013540188
802044014930040010083462278117121152040144223772779657251601028010280000801008000040053518454000240142040174401563008233013016010020080000200160000401404013311802011009910010080000800001008249214183124367247980049148802492254467382488271384000511011611401818000280000801004016840124401994017840165
8020440170300404100687022641172010520401322251630695472516010280102800008010080000400535184295202401000401534018430070330128160100200800002001600004014140172118020110099100100800008000010082458201219242622461800401495324761074462582510291673000511011611401558000280000801004018840127401504014540163
80204401233005069981372250117041252040163225167362851251601028010280000801008000040053518452800240097040111401373004233012816010020080000200160000401514015311802011009910010080000800001008248414218224323249180027150262492254464182555652325030511011611401528000280000801004021440155401614017040129
802044013130133098824122571169615520401082230759626652516010280102800008010080000400535184780002400880401164018230050330100160100200800002001600004014740173118020110099100100800008000010082492222147240911249280032150582500254464582488352643000511011611401248000280000801004015040165401474024740150
802044015430140410287472232119761526440173219171577769251601028010280000801008000040053518450880240146040128401683005633014716010020080000200160272401354012211802011009910010080000800001008247630266224254244680049155452476254459682489292649000511011611401978000280000801004021540164402134016340180

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5015

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 61 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd store (99) | inst ldst (9b) | 9f | a0 | a1 | a2 | a3 | a4 | a6 | a7 | a8 | a9 | aa | ab | ac | af | bc | dcache store miss (c0) | c2 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800344010230000000010191412299101488214840123225737448153251600128001280000800108000040008718424720240098401134011830068330068160010208000020160000401024012911800221091010800008000010825050112724623248480015152802502974459182516411701205020081261702524400928000280000800104011740129400994017640094
800244009630000000010152242299101296425240110231134729918251600128001280000800108000040008718424240240102401084011630068330092160010208000020160000401264014411800211091010800008000010824940129424803247980036155302505796458782530309340050200110264901527403948000280000800104011040149401464014640108
800244012330000000010272732305101600426040134236042749024251600128001280000800108000040008718424960240123401984009730065330136160010208000020160000401014010011800211091010800008000010824860992246412249780048151702494976449282526371080005020080101702515400738000280000800104012340120400854014140124
800244009930100000010065342346101480311240071230742354378251600128001280000800108000040008718434560240099401294010830051330134160010208000020160000400904006911800211091010800008000010824870110124680249980020152702478884462382519271668005020070221702626401028000280000800104011740104401144011240110
80024401243000000001028751232610151262324008023106084673425160012800128000080010800004000871841608024004140101401093001433020016001020800002016000040094400901180021109101080000800001082478019012495122492800381519025079864439825303613370050200100251711728400968022680000800104012540103401304011640173
800244015330000000010233422348101488316840098229649532027251600128001280000800108000040008718444400240088400974011130037330294160010208000020160000401524010011800211091010800008000010824640130824727250180041153402494756466482521201055005020070161701527401118000280000800104014540098401354009140091
800244011130100000010122402302101384332440081227352036431251600128001280000800108000040008718433360240133401084012230062330150160010208000020160000401274010511800211091010800008000010824780141724712250680038149202477984467582521451659005020090274901729401528000280000800104010840109401154018040089
8002440102300000010999048230810146451884007122784553663251600128001280000800108000040008718442480240073401164013930036330067160010208000020160000401094011311800211091010800008000010825000110924654247680032149102507846457682511201056005020060231702827401318000280000800104009640098401064010640096
80024400963010000001024551233810152052804012122985615114251600128001280000800108000040008718418000240103400944008530022330084160010208000020160000401044010211800211091010800008000010824940151024867250080028147602507792456982500171331005020060231702725400948000280000800104013140143401064009940106
8002440105300000000104402923001014964212400822293706372302516001280012800008001080000400087184444002400874009840131300663300911600102080000201600004012040105118002110910108000080000108248902122246382476800181490024869864517825052216390050200100251702717401568000280000800104012040146401394014140090