Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADD (32-bit)

Test 1: uops

Code:

  stadd w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

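retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
73005 | 34877 | 3019 | 1015 | 0 | 2004 | 1002 | 0 | 2000 | 7769 | 10520 | 0 | 3000 | 1000 | 2000 | 0 | 2002 | 4004 | 0 | 1004 | 2000 | 0 | 1000
73004 | 34572 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 34764 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2002 | 4004 | 0 | 1004 | 2000 | 0 | 1000
73005 | 35353 | 3012 | 1006 | 0 | 2006 | 1003 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 35609 | 3006 | 1006 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 34176 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 34146 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 34213 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 34201 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000
73004 | 34173 | 3003 | 1003 | 0 | 2000 | 1000 | 0 | 2000 | 7770 | 10521 | 0 | 3000 | 1000 | 2000 | 0 | 2000 | 4000 | 0 | 1003 | 2000 | 0 | 1000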

Test 2: throughput

Code:

  stadd w0, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207 | 30639 | 40382 | 20319 | 20063 | 20201 | 20007 | 115973 | 106407 | 40114 | 20207 | 20007 | 30211 | 40013 | 20010 | 20000 | 20100
40204 | 30059 | 40115 | 20110 | 20005 | 20107 | 20004 | 116085 | 106316 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116066 | 106278 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116081 | 106308 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116085 | 106316 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116080 | 106306 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116070 | 106286 | 40108 | 20204 | 20004 | 30206 | 40008 | 20013 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116082 | 106310 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116074 | 106294 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30059 | 40114 | 20112 | 20002 | 20104 | 20004 | 116069 | 106284 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027 | 30632 | 40254 | 20184 | 20070 | 20104 | 20004 | 115887 | 106318 | 40018 | 20024 | 20004 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30065 | 40018 | 20018 | 20000 | 20010 | 20000 | 115672 | 106049 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010
40024 | 30056 | 40017 | 20017 | 20000 | 20010 | 20000 | 115731 | 106164 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115731 | 106155 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115730 | 106164 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30056 | 40017 | 20017 | 20000 | 20010 | 20000 | 115717 | 106136 | 40010 | 20020 | 20000 | 30077 | 40074 | 20048 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 115737 | 106178 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40017 | 20017 | 20000 | 20010 | 20000 | 115735 | 106170 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115711 | 106126 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115706 | 106112 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010

Test 3: throughput

Code:

  stadd w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9761

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30205 | 130156 | 41368 | 21330 | 20038 | 10131 | 20000 | 2450044 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21310 | 20000 | 10100
30204 | 129761 | 41410 | 21410 | 20000 | 10100 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21310 | 20000 | 10100
30204 | 129761 | 41410 | 21410 | 20000 | 10100 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21310 | 20000 | 10100
30204 | 129761 | 41410 | 21410 | 20000 | 10100 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20252 | 40094 | 21354 | 20000 | 10100
30204 | 129752 | 41406 | 21406 | 20000 | 10100 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21310 | 20000 | 10100
30204 | 129766 | 41410 | 21410 | 20000 | 10100 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129751 | 41359 | 21359 | 20000 | 10100 | 20000 | 2450028 | 2311915 | 30100 | 10200 | 20000 | 20200 | 40000 | 21310 | 20000 | 10100
30204 | 129761 | 41410 | 21410 | 20000 | 10100 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21310 | 20000 | 10100
30204 | 129761 | 41410 | 21410 | 20000 | 10100 | 20051 | 2403929 | 2271429 | 30181 | 10230 | 20058 | 20200 | 40000 | 21274 | 20000 | 10100
30204 | 129530 | 41374 | 21374 | 20000 | 10100 | 20000 | 2473338 | 2333811 | 30100 | 10200 | 20000 | 20240 | 40080 | 21415 | 20000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9738

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30025 | 134804 | 41775 | 21739 | 20036 | 10041 | 20000 | 2467601 | 2323517 | 30010 | 10020 | 20000 | 20020 | 40000 | 21361 | 20000 | 10010
30024 | 130350 | 41372 | 21371 | 20001 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21252 | 20000 | 10010
30024 | 126476 | 41112 | 21112 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126469 | 41111 | 21111 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126469 | 41111 | 21111 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126469 | 41111 | 21111 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126469 | 41111 | 21111 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20026 | 40009 | 21080 | 20000 | 10010
30024 | 126472 | 41111 | 21111 | 20000 | 10010 | 20304 | 2350831 | 2221749 | 30488 | 10194 | 20344 | 20020 | 40000 | 21656 | 20000 | 10010
30024 | 126469 | 41111 | 21111 | 20000 | 10010 | 20050 | 2357645 | 2226511 | 30088 | 10048 | 20055 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 125512 | 40976 | 20930 | 20046 | 10038 | 20048 | 2401128 | 2264818 | 30086 | 10048 | 20055 | 20020 | 40000 | 21148 | 20000 | 10010