Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (post-index, D)

Test 1: uops

Code:

  str d0, [x6], #0x10

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 23 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10051040711101511360005121025034225200010001000100010005075445824110151040104082438982000100020001040104011100110001000102895781510100101213102476472731161110371000001000100010411041104110411041
1004104081000011501081720102521115252000100010001000100050754458241101510401040824389820001000200010401040111001100010001062957745100701362424103675272731161110371000201000100010411041104110411041
1004104071201011000020102500152520001000100010001000507624582411015104010408243898200010002000104010401110011000100010508324371013000014102994872731161110371000001000100010411041104110411041
1004104081110012151001716102516434252000100010001000100050762458241101510401040824389820001000200010401040111001100010001024765443101200531213104874870731161110371000301000100010411041104110411041
1004104081000011410015010251216113252000100010001000100050754458241101510401040824389820001000200010401040111001100010001033844723101200301620102175270731161110371000001000100010411041104110411041
10041040811300111200020102500053320001000100010001000507624582411015104010408243898200010002000104010401110011000100010207651136101210481610105974071731161110371000001000100010411041104110411041
100410407101061140100201610252451325200010001000100010005075445824110151040104082438982000100020001040104011100110001000102776174610110136024103074071731161110371000001000100010411041104110411041
1004104071011012440001941025890525200010001000100010005075445824110151040104082438982000100020001040104011100110001000100784451810070040010104674472731161110371000001000100010411041104110411041
1004104071001010120002410252230325200010001000100010005075445824110151040104082438982000100020001040104011100110001000105484493210150040011105372872731161110371000001000100010411041104110411041
1004104081012011120001820102530002252000100010001000100050754458241101510401040824389820001000200010401040111001100010001034848936101500361210103674470731161110371000001000100010411041104110411041

Test 2: Latency 3->3

Code:

  str d0, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 20 | 22 | 29 | 3a | 3c | 3e | 3f | 40 | 44 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10214100407510110413672304114726075210025226502101833725201001010010000101001000052218146882401001610040100408674387472010020010000200200001004010040111020110099100100100001000010012501101658145101526110601440024855047161250531625707101171110037100002672110000101001004110041100411004110041
10204100407510010473532275114484097610025226002241983525201001010010000101001000052201746882401001610040100408674387472010020010000200200001004010040111020110099100100100001000010012479171665142901527110021453424855046681251429746007101171110037100003261110000101001004110041100411004110041
10204100407520010266512319114726011881002522240197208382520100101001000010100100005222034688240100161004010040867438747201002001000020020000100401004011102011009910010010000100001001250581512141001507110561455624685046051251131713707101171110037100001603110000101001004110041100411004110041
10204100407510110422562354114486176410025223302031942825201001010010000101001000052214946882401001610040100408674387472010020010000200200001004010040111020110099100100100001000010012502151663143901450110801463424855045531250426761017101171110037100001893110000101001004110041100411004110041
1020410040751011034184227311416208401002522270193208252520100101001000010100100005221634688240100161004010040867438747201002001000020020000100401004011102011009910010010000100001001248991661145901517110161468024965046361250428732707101171110037100003391110000101001004110041100411004110041
10204100407510110410332308114725076010025222902032101825201001010010000101001000052217146882401001610040100408674387472010020010000200200001004010040111020110099100100100001000010012505151622144301518110021439424895046211250616680007101171110037100003714110000101001004110041100411004110041
102041004075220102484222551160860104410025231002191962925201001010010000101001000052219546882401001610040100408674387472010020010000200200001004010040111020110099100100100001000010012509101716145901511110381446424615046371251923670007101171110037100002000110000101001004110041100411004110041
1020410040751101029960233611688809681002522460213229312520100101001000010100100005221954688240100161004010040867438747201002001000020020000100401004011102011009910010010000100001001252081600145101478109791466624935045091249928762017101171110037100002361110000101001004110041100411004110041
1020410040762001031749231911688607641002522390173212252520100101001000010100100005221714688240100161004010040867438747201002001000020020000100401004011102011009910010010000100001001250591676149601454110541478024845046791252132668727101171110037100002440110000101001004110041100411004110041
10204100407510010452522282116807095610025223102442201925201001010010000101001000052214746882401001610040100408674387472010020010000200200001004010040111020110099100100100001000010012483141679148101520110411452024775046151249722712017101171110037100003234110000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1003410040752000010110682305115281084410025223827425747252001010010100001001010000521041468824010021100401004086963877020010201000020200001004010040111002110910101000010000101248415168515680156910960139582480504608125073882274640216331003710000263610000100101004110041100411004110041
10024100407522000988276229011544119361002522362582514025200101001010000100101000052107346882401002110192100408696387702001020100002020000100401004011100211091010100001000010124839160215310155710938143142477504645125094269570640316321003710000230510000100101004110041100411004110041
100241004075220201029670226711528684410025225323223431252001010010100001001010000521057468824010021100401004086961387702001020100002020000100401004011100211091010100001000010124798160215430156310969142622485504636125204077270640316331003710000125110000100101004110041100411004110041
10024100407511000101677222641166468041002522602342093425200101001010000100101000052105746882401002110040100408696387702001020100002020000100401004011100211091010100001000010124847149715210158810963145762464506616125114272770640216331003710000219210000100101004110041100411004110041
1002410040762000099157322931170458441002522612032623325200101001010000100101000052102546882401002110040100408696387702001020100002020000100401004011100211091010100001000010124877157915240157510938143202464504618125114472570640316221003710000335310000100101004110041100411004110041
10024100407510000101195622921149647241002522152412104325200101001010000100101000052105746882401002110040100408696387702001020100002020000100401004011100211091010100001000010124757165015710158910962142542484504488125143962770640316331003710000264410000100101004110041100411004110041
10024100407510000100928323051167248441002522521892693625200101001010000100101000052107346882401002110040100408696387702001020100002020000100401004011100211091010100001000010124759165915670159210975142822485504610125174372670640325221003710000236310000100101004110041100411004110041
10024100407522000102246722671170427321002522401872185325200101001010000100101000052104146882401002110040100408696387702001020100002020000100401004011100211091010100001000010124849164315280158910923144802492504572125233870472640316331003710000394510000100101004110041100411004110041
1002410040752000099156623021171247241002522292592433625200101001010000100101000052101746882401002110040100408696387702001020100002020000100401004011100211091010100001000010124928158115590158110953146002473504585125023965970640316231003710000187510000100101004110041100411004110041
10024100407511100101646622901164068081002522312902752825200101001010000100101000052104146882401002110040100408696387702001020100002020000100401004011100211091010100001000010124817171515640157810978142802472504610125014165670640316221003710000189310000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  str d0, [x6], #0x10
  str d0, [x7], #0x10
  str d0, [x8], #0x10
  str d0, [x9], #0x10
  str d0, [x10], #0x10
  str d0, [x11], #0x10
  str d0, [x12], #0x10
  str d0, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5026

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80214402203015550997514322982012005264401552263642734562516010280102800008010080000400531184886002401994023940252301403301291601002008000020016000040221402111180201100991001008000080000100824644523592405202464800571494024669564686824865623641450511021622402158000280000801004018140163402354016940182
8020440187301400098617623152012241226440175227893577367251601028010280000801008000040053118470600240184401904017330114330174160100200800002001600004022040226118020110099100100800008000010082456412803242832444800821492524659564638824707826691400511021622401948000280000801004021740208401544015340229
802044021230155509912682290201216102644019422981045756672516010280102800008010080000400531184845212401734020040239301723301511603202008000020016000040180401831180201100991001008000080000100824872419012413224678007415423246212124655824925320291400511021622402398000280000801004019140233401524018440216
802044017830150009882812297201224152644017822779138881022516010280102800008010080000400531184418002401554028040204301173301641601002008000020016000040171402411180201100991001008000080000100824762830382385142454800601525024504724597824866023761400511021622401858000280000801004018340176401994024840190
80204401923014000996980230520120810264401672270682809100251601028010280000801008000040053118476361240177402114017430084330138160100200800002001600004022940182118020110099100100800008000010082471362354239582462800671491024659584597825047528442800511021622401598000280000801004020740185401844025340249
80204401583015050972088231220124012264401772264822795742516010280102800008010080000400531184751702401374020740197301353302241601002008000020016000040183401771180201100991001008000080000100824633026232392102446800641512024499584535824857025711440511021622402008000280000801004019940187402284023740211
80204402293015000993971231220121612264401842292680935972516010280102800008010080000400531184559602401614022940162300943301191601002008000020016000040203402151180201100991001008000080000100824762826152395192446800681515024429564684824805920671400511021622402008000280000801004019040202401744021740181
80204401743014040981611123052012241326440247238197793080251601028010280000801008000040053118480920240184405734018730066330133160100200800002001600004021740214118020110099100100800008000010082483362081242211243380099150702458163446118248472285227100511021622402328000280000801004015740204401864022840186
8020440171301500099511002298201232926440209229175465691251601028010280000802348000040053118476361240152401714019130088330173160100200800002001600004022640178118020110099100100800008000010082464322365240162487800621517024659564726824945928311440511021622402088000280000801004026140156402394018740159
80204401513014000100207922902012009264402002376775824100251601028010280000801008000040053118457400240176401904018530104330177160100200800002001600004018440200118020110099100100800008000010082459282144239442457800681510024419584627824886023721400511021622401558000280000801004020140220402194017540167

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5022

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 3a | 3e | 3f | 40 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800344020937111000010155132230710014886544401362309437312312516001280012800008001080000400083185082800240146401734022230088330182160010208000020160000402224016211800211091010800008000010825041411992498324978003615340248675446948255670178414050200151709124020780002080000800104019840144401624021640207
8002440165372100000102698323201001496742040163223282573843251600128001280000800108000040008318479480024014540180401023022933021416001020800002016000040873403141180021109101080000800001082648141768240940247380321146702464774463282559491765005020010170974013180002080000800104030840100401854014740186
8002440174348000000999612922521001424911640151226554767650251600128001280000800108000040008718437680024029840260402683014533009616001020800002016000040139402791180021109101080000800001082515147682447224808010215042248881647078254284169814050200717011104019780002080000800104028040140401664012340177
80024401583471111001005914222931001256926040199228875534832251600128001280000800108000040008318439160024017740161401903004733014916001020800002016000040161402551180021109101080000800001082510241116245720246380080152302474848466282557731454140502001017010104009880002080000800104012840192403024025940143
80024401153481100009924141226800199284524017722795916244725160012800128000080010800004000831849004002401664017440122301763301371600102080000201600004020640285118002110910108000080000108253222138924662247480082154602496486473282550781670140502001017010104016680002080000800104019240140401764021640184
800244018532310220010356190234600114168292402092246382688104251600128001280000800108000040008718486640024011040155402253008833011316001020800002016000040096401141180021109101080000800001082496010122398124468005515450251099646848254483142200502001117011114021080002080000800104017740201400764013340196
800244018232300000010329156232400112485500402242262880574362516001280012800008001080000400087184691200240146402794015330056330163160010208000020160000401054023111800211091010800008000010825100149523881250880055149602489101046908256548130800502001017011104016280002080000800104010540123401904014440129
8002440139310000000102571012292001147225164010022727063856325160012800128000080010800004000871847968002401864020740177300553301721600102080000201600004020740143118002110910108000080000108248409922457724978009315490248812304710825448015540050200917010114010380002080000800104019640149402384017540237
8002440132311000000102272102316001122453844015122586367069251600128001280000800108000040008718431920024020940196401123010533012116001020800002016000040104401281180021109101080000800001082468014202467224428005115320250084846488255589112501502001217010104017780002080000800104017540144401424015240138
800244019830600000010134156234800112324116402002289249571372516001280012800008001080000400087184660000240181402394017730084330113160010208000020160000402414021011800211091010800008000010825020178924801249080079152502488756463682556639680050200111708114019780002080000800104018240228402114016640109