Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEOR (64-bit)

Test 1: uops

Code:

  steor x0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005342943016101220041002200077701052130001000200020004000100320001000
73004340923003100320001000200077701052130001000200020004000100320001000
73004340723003100320001000200077701052130001000200020004000100320001000
73004340783003100320001000200077701052130001000200020004000100320001000
73004340803003100320001000200077701052130001000200020004000100320001000
73004340813003100320001000200077701052130001000200020004000100320001000
73004340803003100320001000200077701052130001000200020004000100320001000
73004340693003100320001000200077701052130001000200020004000100320001000
73004340753003100320001000200077701052130001000200020004000100320001000
73004340773003100320001000200077701052130001000200020004000100320001000

Test 2: throughput

Code:

  steor x0, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
402113118640653204912016220323200071166351076474011420207200073021140013200102000020100
402043006340110201082000220104200041167501075824010820204200043020640008200082000020100
402043006340110201082000220104200041167541075924010820204200043020640008200082000020100
402043006340110201082000220104200041167491075844010820204200043020640008200082000020100
402043006340110201082000220104200381119001189084017620238200383020640008200082000020100
402043006340110201082000220104200041167471075764010820204200043020640008200082000020100
402043006340110201082000220104200041167461075764010820204200043020640008200082000020100
402043006340110201082000220104200381112411197414017620238200383020640008200082000020100
402043006540111201092000220104200041169961076974010820204200043020640008200082000020100
402043006340110201082000220104200041166531074104010820204200043020640008200102000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
400293081240389202952009420164200041166161076414001820024200043003140013200102000020010
400243006640017200172000020010200001166061076344001020020200003002040000200072000020010
400243006640017200172000020010200001166031076284001020020200003002040000200072000020010
400243006640017200172000020010200001165971076164001020020200003002040000200072000020010
400243006640017200172000020010200001166151076524001020020200003002040000200072000020010
400243006640017200172000020010200001165981076184001020020200003002040000200072000020010
400243006640017200172000020010200001166151076524001020020200003002040000200072000020010
400243006640017200172000020010200381135081183334008620058200383002040000200072000020010
400243006640017200172000020010200001166131076484001020020200003002040000200072000020010
400243006640017200172000020010200001165451075114001020020200003002040000200072000020010

Test 3: throughput

Code:

  steor x0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
302051304864145221414020038101300200002454321231748330100102002000020200400002131020000010100
302041297614141021410020000101000200002449943231180730100102002000020200400002131020000010100
302041297544140921409020000101000200002449943231180730100102002000020200400002130920000010100
302041297544140921409020000101000200002449943231180730100102002000020200400002130920000010100
302041297544140921409020000101000200002449837231175330100102002000020200400002130920000010100
302041297924139621396020000101000200002450250231211330100102002000020200400002130920000010100
302041297544140921409020000101000200002450048231193330100102002000020200400002130920000010100
302041297614141021410020000101000200002449943231180730100102002000020200400002130920000010100
302041297614141021410020000101000200002449943231180730100102002000020200400002131020000010100
302051272744116021114020046101300200002450048231193330100102002000020200400002130920000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9761

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3002513015541291212550200361004002020024232232285182303241013420226200624008502128520000010010
3002412640341104211040200001001002000024562482313083300101002020000200204000002127320000010010
3002412976141283212830200001001002000024562482313083300101002020000200204000002127320000010010
3002412976141283212830200001001002000024562482313083300101002020000200764011002104220000010010
3002412972641283212830200001001002000024562482313083300101002020000200204000002127320000010010
3002412976641281212810200001001002019823913902256879303211013320224200204000002127320000010010
3002512773541096210500200461003802000024548732311895300101002020000200624008502128220000010010
3002412976141283212830200001001002000024562482313083300101002020000200204000002127320000010010
3002412976141283212830200001001002000024562482313083300101002020000200204000002127320000010010
3002412976141283212830200001001002019624437292302509303191013320223200204000002127920000010010