Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADDH

Test 1: uops

Code:

  staddh w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
730053528230191015200410022000777310527300010002000200040001004200001000
730043415430031003200010002000776210513300010002000200040001003200001000
730043419730021002200010002000777010529300010002000200040001002200001000
730043500130031003200010002000776210513300010002000200040001002200001000
730043415130021002200010002000776210513300010002000200040001002200001000
730043414330021002200010002000776210513300010002000200040001002200001000
730043415930021002200010002000776210513300010002000200040001002200001000
730043420930021002200010002000776210513300010002000200040001002200001000
730043417730021002200010002000776210513300010002000200040001002200001000
730043420130021002200010002000776210513300010002000200040001002200001000

Test 2: throughput

Code:

  staddh w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
402063036440258202152004320169200071158701060724011420207200073021140013200112000020100
402043006640111201092000220104200041159581059294010820204200043020640008200092000020100
402043006640116201112000520107200041159121058864010820204200043020640008200092000020100
402043006640111201092000220104200041159591059314010820204200043020640008200092000020100
402043006640111201092000220104200041159701059534010820204200043020640008200092000020100
402053013240186201502003620138200041159401059424010820204200043020640008200092000020100
402043006640111201092000220104200041159661059454010820204200043020640008200092000020100
402043006640111201092000220104200041159591059334010820204200043020640008200092000020100
402043006640111201092000220104200041159151058884010820204200043020640008200092000020100
402043006640111201092000220104200041159061058744010820204200043020640008200092000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
400273073640270202002007020104200041156771058914001820024200043002040000200072000020010
400243006340017200172000020010200001154731056504001020020200003002040000200062000020010
400243005640016200162000020010200001154841056704001020020200003002040000200062000020010
400243005640016200162000020010200001154611056264001020020200003002040000200072000020010
400243005640016200162000020010200001154951056834001020020200003002040000200072000020010
400243005640016200162000020010200001154741056504001020020200003002040000200072000020010
400243005640016200162000020010200001154821056694001020020200003002040000200062000020010
400243005640017200172000020010200001154971056854001020020200003002040000200062000020010
400243005640016200162000020010200001155221057024001020020200003002040000200062000020010
400243005640017200172000020010200001154771056584001020020200003002040000200062000020010

Test 3: throughput

Code:

  staddh w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.6476

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30206128717412912121020081101582000023823092252220301001020020000202004000002110220000010100
30204126476412022120220000101002000023825412252434301001020020000202004000002110220000010100
30204126476412022120220000101002000023825412252434301001020020000202004000002110220000010100
30204126476412022120220000101002000023825412252434301001020020000203004019202083820000010100
30204126476412022120220000101002005024415682305015301781022820056202704013702131120000010100
30204126449411992119920000101002000023825412252434301001020020000202624011902122720000010100
30204121277407932079320000101002000023413292217081301001020020000202004000002085020000010100
30204124509409512095120000101002000023413292217081301001020020000202004000002085120000010100
30204124509409512095120000101002000023825412252434301001020020000202004000002110220000010100
30204126476412022120220000101002000023825412252434301001020020000202004000002110220000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.6469

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300251303164132521272200531004520008227509021539673002410027200152002040000211022000010010
300241264764111221112200001001020000238882122536343001010020200002002040000211022000010010
300251263704114321097200461004020000238843822528733001010020200002002040000211022000010010
300241264694111121111200001001020000238882122536343001010020200002002040000211022000010010
300241264694111121111200001001020000238882122536343001010020200002002040000211022000010010
300241264694111121111200001001020000238882122536343001010020200002002040000211022000010010
300241264694111121111200001001020000238882122536343001010020200002007640109210832000010010
300241264764111121111200001001020000238882122536343001010020200002002040000211022000010010
300241264694111121111200001001020000238882122536343001010020200002002040000211022000010010
300241264694111121111200001001020008227508521539743002410027200152002040000211012000010010