Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (post-index, S)

Test 1: uops

Code:

  str s0, [x6], #0x10

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)181e1f202223293a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst simd store (99)inst ldst (9b)l1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)aaabacafbcl1d cache miss st nonspec (c0)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
10051040800012612100150102536112252000100010001000100050762458241101510401040824389820001000200010401040111001100010001036052891007001651038440732162210371000001000100010411041104110411041
10041040700009120002010253400325200010001000100010005077045824110151040104082438982000100020001040104011100110001000102404453210051243271060044732162210371000001000100010411041104110411041
1004104070006924102415010251410425200010001000100010005077045824010151040104082438982000100020001040104011100110001000103604879100902820101044440732162210371000001000100010411041104110411041
100410407000125400000102500162520001000100010001000507704582411015104010408243898200010002000104010401110011000100010160203610071322441036428732162210371000001000100010411041104110411092
100410407001063810019010252211325200010001000100010005077845824110151040104082438982000100020001040104011100110001000102804046100800091028432732162210371000101000100010411041104110411041
100410408010073210018010252800525200010001000100010005076245824110151040104082438982000100020001040104011100110001000102403642210260383241052056732162210371000101000100010411041104110411041
1004104080001263810023010252031425200010001000100010005076245824110151040104082438982000100020001040104011100110001000103805242410041383041036424732162210371000001000100010411041104110411041
10041040800008241002341025001425200010001000100010005077045824010151040104082438982000100020001040104011100110001000102603622410041304071038424732162210371000001000100010411041104110411041
100410408000125161001441025160022520001000100010001000507704582401015104010408243898200010002000104010401110011000100010000360241006324831004428732162210371000001000100010411041104110411041
1004104080001294010022010251801325200010001000100010005076245824110151040104082438982000100020001040104011100110001000104802432610030120101036440732162210371000001000100010411041104110411041

Test 2: Latency 3->3

Code:

  str s0, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0040

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)18191e1f2022293a3c3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1021410040750010001040446226811720147241002522292192332625201001010010000101001000052218746882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124694168815470150510983148302489504678124982665700007101171110037100003367110000101001004110041100411004110041
102041004075110100103834122611170462724100252229205210282520100101001000010100100005221874688241001610040100408674387472010020010000200200001004010040111020110099100100100001000010012489515871489015271100315101248910245821250332807010071011711100371000029712110000101001004110041100411004110041
1020410040751101001042250227011688607241002522292132073025201001010010000101001000052220346882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124834152214599152111013151702489504530125162566600007101171110037100003804110000101001004110041100411004110041
1020410040761000001037771228211712417241002522292561983225201001010010000101001000052217346882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124815162514810151611012150512489504584124943369902007101171110037100002535110000101001004110041100411004110041
1020410040751101001041650226811680507241002522382212192525201001010010000101001000052217146882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124735164614880152910997152912465504515125102570001007101171110037100002025110000101001004110041100411004110041
1020410040751001001035945226811712607241002522222292142625201001010010000101001000052217946882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124744160914750153910985147702489504476124952865200007101171110037100002793110000101001004110041100411004110041
1020410040751100001055462226811712617241002522361942201625201001010010000101001000052215546882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124844168715100152210968150312480504604124982979400007101171110037100002223110000101001004110041100411004110041
1020410040751100001048578226311696517121002522222222102225201001010010000101001000052216346882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124904169715100151310989154222489504615124893262900007101171110037100002394110000101001004110041100411004110041
10204100407510000010479522282117042071210025222920636620252010010100100001010010000522187468824100161004010040867438747201002001000020020000100401004011102011009910010010000100001001247315166415040152510981151222489504545125072969500007101171110037100003451110000101001004110041100411004110041
1020410040751101001054546226911696407121002522291792122225201001010010000101001000052217946882410016100401004086743874720100200100002002000010040100401110201100991001001000010000100124907162714680152410997152522489504528125212761503007101171110037100002324110000101001004110041100411004110041

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0040

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)1e1f2022293a3b3c3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cfd0d2d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
100341004075101102994423391170440071210025226922219830252001010010100001001010000521097468824101001610040100408696387702001020100002020000100401004011100211091010100001000010125018163514780152711068154802496504576125162065900640003163310037100002503110000100101004110041100411004110041
1002410040750001032343230911472800748100252244195204232520010100101000010010100005211294688241010016100401004086963877020010201000020200001004010040111002110910101000010000101249911172915030148211040151602481504614125042560870640003163310037100002467110000100101004110041100411004110041
1002410040752021014957237411440900760100252260183222442520010100101000010010100005211214688240010016100401004086963877020010201000020200001004010040111002110910101000010000101250615166714700150110967149712488504578125032369074640002163210037100001562110000100101004110041100411004110041
100241004075201106235223211166411001004100252242203189212520010100101000010010100005211054688240010016100401004086963877020010201000020200001004010040111002110910101000010000101250815163615310147611046152902500504622125153558470640003162310037100001795110000100101004110041100411004110041
1002410040762001011648233011472800760100252273190235202520010100101000010010100005210974688241010016100401004086963877020010201000020200001004010040111002110910101000010000101249615154514820149211040151822480504531125204170370640003163310037100001662110000100101004110041100411004110041
100241004075200104074723451169630295610025223518819343252001010010100001001010000521121468824001001610040100408696387702001020100002020000100401004011100211091010100001000010125158167014920145811046150422493504635125063067470640003162310037100002743110000100101004110041100411004110041
1002410040752001041051231511640400804100252243194156172520010100101000010010100005210734688240010016100401004086963877020010201000020200001004010040111002110910101000010000101249214168015060148211054151702488504461125103373170640002163310037100002453110000100101004110041100411004110041
10024100407520010404372299116322001204100252243194212342520010100101000010010100005211054688240010016100401004086963877020010201000020200001004010040111002110910101000010000101248715162514870153211049150322480504657125142964772640003163310037100001555110000100101004110041100411004110041
1002410040752221045865230911464600123610025227121025317252001010010100001001010000521097468824001001610040100408696387702001020100002020000100401004011100211091010100001000010124939166515310154811020153402493504642125162859500640003163310037100002099110000100101004110041100411004110041
100241004075100102634423271148010176410025228422223127252001010010100001001010000521129468824001001610040100408696387702001020100002020000100401004011100211091010100001000010125018166414730147311019151102476504547124962857100640002332310037100002893110000100101004110041100411004110041

Test 3: throughput

Count: 8

Code:

  str s0, [x6], #0x10
  str s0, [x7], #0x10
  str s0, [x8], #0x10
  str s0, [x9], #0x10
  str s0, [x10], #0x10
  str s0, [x11], #0x10
  str s0, [x12], #0x10
  str s0, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5028

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)18191e1f20222324293a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? ldst retires (ed)? int retires (ef)f5f6f7f8fd
8021440140300702009963532361100126413268401162302713514502516010280102800008010080000400535184434402400994019140110300053300591601002008000020016000040097401101180201100991001008000080000100824719132724560248880050151402494822465882507231762000511011611401208000280000801004012240156401224013740140
80204401203004220099424923201001456114804012222596225274025160102801028000080100800004005351844584124011340113401703003633008516010020080000200160000401804012711802011009910010080000800001008247716183124714250780042154962482898456282507201591000511011611401308000280000801004014340181401344021140124
80204401433014200010143672317100148092684010222797316383425160102801028000080100800004005351843720024011340109401503005133007716010020080000200160000401684011011802011009910010080000800001008249016190524702248380046149622518754463482506311787000511011611401618000280000801004020140155401484012640112
80204401293003030099997923521001400121124015123107436174825160102801028000080100800004005351843576024011640146401163005333015316010020080000200160000401194010711802011009910010080000800001008249418163224673249980026148502508974456482500501847060511011611401468000280000801004014540172401164013140121
80204401173013110010041432339100148881164012922915125764725160102801028000080100800004005351842520124012140150401343005033009416010020080000200160000401294012111802011009910010080000800001008251021201624700248880046152322488994458582497261253000511011611401428000280000801004010740148400754011740106
8020440082301222001026932234510012561149640104226857760693827816218281566807808158381296407382189738412413124066341308310542931167162088202809662021626404133641432101802011009910010080000800001008306216209624213022476806531535024679861175883121331429000530816511413188102480000801004131641256414654146041099
8020441473311201111010764866235610016641116841424231177545481723816200281326804808123081296406196189277602412774148741469309714431316162528206812062061624244153940955111802011009910010080000800001008310581476243931024548064115330248834612051830972713901300525411611401858000280000801004008740102401254014740155
8020440177301211009939342337100149682524011023034595854025160102801028000080100800004005351841752124011240130401263005533010316010020080000200160000401384009411802011009910010080000800001008248616180424699250080033153742500896455682512261706000511011611401158000280000801004009340117401084008840133
802044011430010100997845240510010327204400752279518691462516010280102800008010080000400535184516012401124013740198300293301021601002008000020016000040146401311180201100991001008000080000100824881618432448132479800361514225001034460382505292206000511011611401288000280000801004012240138401244016540129
80204401143012220010248462377100125614220401342257607875732516010280102800008010080000400535184583202401284017440112300163300721601002008000020016000040144401251180201100991001008000080000100824748196024848249680029153202492748453982511411545000511011611401358000280000801004013740114401774012640147

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5015

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f2022293a3e3f4046494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)606167696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cfd0d2d5map dispatch bubble (d6)daddfetch restart (de)e0? int output thing (e9)? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80034401263016020000100413823441124015152401252283575595232516001280012800008001080000400087184309610240082401094011530046330100160010208000020160000401424012011800211091010800008000010825022019182474112506800361514025161028468382504372385030502000617035401358000280000800104011540127401664013440137
80024400863014000000100083023801116810276401032307809705472516001280012800008001080000400087184376800240107401724011230050330084160010208000020160000401334009911800211091010800008000010825082721532452142498800161520025061226468282504221377000502000516065401098000280000800104018540147401614014340112
8002440126301400020010047262367114401151640124228479165435251600128001280000800108000040008718421360024010040134401053003733010416001020800002016000040080401051180021109101080000800001082502101608245722485800251489024981068453582487441713040502000317053401168000280000800104018840106400894013040152
8002440093300503030010029452365412561512840091234855464121251600128001280000800108000040008718435280024014240151401283006133013216001020800002016000040177401151180021109101080000800001082493331712244216248680044150802502804455682498252224040502000516035401478000280000800104014340162401314010640133
8002440139301600040010212502351112561610040135230366186952251600128001280000800108000040008718468160024015340149401633011333013816001020800002016000040112401411180021109101080000800001082502271462246382491800371537025001002464782506372202090502000516035401338000280000800104017240184401464019540140
8002440120301713000010086502342114401823640075230064976353251600128001280000800108000040008718458320024009440147401223007433010516001020800002016000040132401281180021109101080000800001082496201305246632500800201505024821182462282483421443000502000316053401398000280000800104014340125401264012840158
8002440094301502000010119362387114801135240078229463769933251600128001280000800108000040008718451600024012540100401593006433008816001020800002016000040124401581180021109101080000800001082483161956244912503800371546225001222459082480181136000502000616035400978000280000800104012040157401284018540161
80024401253004020200100742723371124811212400822317743697352516001280012800008001080000400087184480010240087401354008330084330121160010208000020160000401064012411800211091010800008000010824989141424708249580030148502500766466582514381096020502000617035401408000280000800104010340151401574009840139
800244017530130101001006245234211424517240101224455647351251600128001280000800108000040008718428321024011040096401033009433009016001020800002016000040149401901180021109101080000800001082490919872483112493800191525024981002455682507221484000502000317054401338000280000800104009640117401204011140114
800244010330030000001033262231611264103964013122995114386525160012800128000080010800004000871843912002400794010940124300243301581600102080000201600004012040129118002110910108000080000108249091545247220249280036155712512814459082515341859000502000517035400978000280000800104010140076401044011240096