Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADD (64-bit)

Test 1: uops

Code:

  stadd x0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005 | 34634 | 3018 | 1014 | 2004 | 1002 | 2000 | 7769 | 10520 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34260 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34278 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34233 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2002 | 4004 | 1003 | 2000 | 1000
73004 | 34622 | 3003 | 1003 | 2000 | 1000 | 2000 | 7791 | 10544 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34204 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34206 | 3003 | 1003 | 2000 | 1000 | 2000 | 7782 | 10533 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34181 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34219 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34324 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1004 | 2000 | 1000

Test 2: throughput

Code:

  stadd x0, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40209 | 30867 | 40499 | 20400 | 20099 | 20261 | 20007 | 116608 | 107574 | 40114 | 20207 | 20007 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40115 | 20110 | 20005 | 20107 | 20004 | 116585 | 107421 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116587 | 107425 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116594 | 107449 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116593 | 107439 | 40108 | 20204 | 20004 | 30206 | 40008 | 20006 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116575 | 107405 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116583 | 107411 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30058 | 40112 | 20110 | 20002 | 20104 | 20004 | 116597 | 107441 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116582 | 107419 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100
40204 | 30056 | 40107 | 20105 | 20002 | 20104 | 20004 | 116592 | 107429 | 40108 | 20204 | 20004 | 30206 | 40008 | 20005 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40029 | 30903 | 40419 | 20320 | 20099 | 20164 | 20004 | 116486 | 107495 | 40018 | 20024 | 20004 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116477 | 107491 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116497 | 107531 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116502 | 107541 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116483 | 107503 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116488 | 107513 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116472 | 107481 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116477 | 107491 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20036 | 113689 | 111919 | 40082 | 20056 | 20036 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116480 | 107497 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010

Test 3: throughput

Code:

  stadd x0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.6469

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30205 | 123725 | 40879 | 20858 | 20021 | 10117 | 20000 | 2450048 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 21270 | 20000 | 10100
30204 | 128490 | 41307 | 21295 | 20012 | 10115 | 20000 | 2382433 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126418 | 41196 | 21196 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.6476

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30025 | 138094 | 41791 | 21756 | 20035 | 10039 | 20000 | 2456245 | 2313083 | 30010 | 10020 | 20000 | 20020 | 40000 | 21267 | 20000 | 10010
30024 | 129761 | 41283 | 21283 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129761 | 41283 | 21283 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129811 | 41277 | 21277 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30025 | 129853 | 41338 | 21295 | 20043 | 10038 | 20000 | 2456250 | 2312621 | 30010 | 10020 | 20000 | 20020 | 40000 | 20366 | 20000 | 10010
30024 | 126467 | 41111 | 21111 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126476 | 41112 | 21112 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126476 | 41112 | 21112 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30024 | 126476 | 41112 | 21112 | 20000 | 10010 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010
30025 | 125220 | 40893 | 20847 | 20046 | 10037 | 20000 | 2388716 | 2253508 | 30010 | 10020 | 20000 | 20020 | 40000 | 21101 | 20000 | 10010