Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STCLR (32-bit)

Test 1: uops

Code:

  stclr w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005 | 34375 | 3019 | 1015 | 2004 | 1002 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34102 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34095 | 3002 | 1002 | 2000 | 1000 | 2000 | 7762 | 10513 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34119 | 3002 | 1002 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34122 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34108 | 3002 | 1002 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34107 | 3003 | 1003 | 2000 | 1000 | 2000 | 7762 | 10513 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34108 | 3002 | 1002 | 2000 | 1000 | 2000 | 7762 | 10513 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34119 | 3002 | 1002 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34119 | 3002 | 1002 | 2000 | 1000 | 2000 | 7762 | 10513 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000

Test 2: throughput

Code:

  stclr w0, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207 | 30647 | 40390 | 20324 | 20066 | 20201 | 20007 | 116011 | 106424 | 40114 | 20207 | 20007 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30063 | 40116 | 20111 | 20005 | 20107 | 20007 | 116064 | 106526 | 40114 | 20207 | 20007 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 116089 | 106317 | 40108 | 20204 | 20004 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30068 | 40112 | 20110 | 20002 | 20104 | 20004 | 116102 | 106326 | 40108 | 20204 | 20004 | 30206 | 40008 | 20010 | 20000 | 20100
40204 | 30063 | 40111 | 20109 | 20002 | 20104 | 20004 | 116141 | 106356 | 40108 | 20204 | 20004 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30063 | 40111 | 20109 | 20002 | 20104 | 20004 | 116118 | 106354 | 40108 | 20204 | 20004 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30063 | 40111 | 20109 | 20002 | 20104 | 20036 | 110420 | 114970 | 40172 | 20236 | 20036 | 30206 | 40008 | 20008 | 20000 | 20100
40205 | 30141 | 40185 | 20152 | 20033 | 20137 | 20004 | 116090 | 106294 | 40108 | 20204 | 20004 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30063 | 40111 | 20109 | 20002 | 20104 | 20004 | 116068 | 106272 | 40108 | 20204 | 20004 | 30206 | 40008 | 20009 | 20000 | 20100
40204 | 30063 | 40111 | 20109 | 20002 | 20104 | 20004 | 116099 | 106325 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027 | 30514 | 40225 | 20166 | 20059 | 20104 | 20004 | 115698 | 106084 | 40018 | 20024 | 20004 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30058 | 40018 | 20018 | 20000 | 20010 | 20000 | 115728 | 106158 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115752 | 106204 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115733 | 106168 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30059 | 40018 | 20018 | 20000 | 20010 | 20000 | 115709 | 106120 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30056 | 40016 | 20016 | 20000 | 20010 | 20000 | 115705 | 106120 | 40010 | 20020 | 20000 | 30020 | 40000 | 20006 | 20000 | 20010
40024 | 30058 | 40017 | 20017 | 20000 | 20010 | 20004 | 115500 | 107857 | 40018 | 20024 | 20004 | 30020 | 40000 | 20028 | 20000 | 20010
40024 | 30059 | 40037 | 20037 | 20000 | 20010 | 20000 | 115974 | 106311 | 40010 | 20020 | 20000 | 30020 | 40000 | 20023 | 20000 | 20010
40024 | 30059 | 40037 | 20037 | 20000 | 20010 | 20000 | 115999 | 106375 | 40010 | 20020 | 20000 | 30020 | 40000 | 20027 | 20000 | 20010
40024 | 30059 | 40037 | 20037 | 20000 | 20010 | 20000 | 115998 | 106373 | 40010 | 20020 | 20000 | 30074 | 40072 | 20104 | 20000 | 20010

Test 3: throughput

Code:

  stclr w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.6476

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30205 | 127077 | 41237 | 21200 | 20037 | 10131 | 20000 | 2400951 | 2270397 | 30100 | 10200 | 20000 | 20200 | 40000 | 21102 | 20000 | 0 | 10100
30204 | 126476 | 41202 | 21202 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30204 | 126476 | 41202 | 21202 | 20000 | 10100 | 20051 | 2363121 | 2235503 | 30181 | 10230 | 20058 | 20200 | 40000 | 21097 | 20000 | 0 | 10100
30204 | 126476 | 41202 | 21202 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30204 | 126476 | 41202 | 21202 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30205 | 122873 | 40889 | 20847 | 20042 | 10128 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30204 | 126476 | 41202 | 21202 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30205 | 125041 | 41000 | 20953 | 20047 | 10130 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30204 | 126476 | 41202 | 21202 | 20000 | 10100 | 20000 | 2382436 | 2252308 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100
30204 | 126317 | 41171 | 21171 | 20000 | 10100 | 20000 | 2381283 | 2251320 | 30100 | 10200 | 20000 | 20200 | 40000 | 21101 | 20000 | 0 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30025 | 134184 | 41662 | 21626 | 20036 | 10041 | 20000 | 2520726 | 2370481 | 30010 | 10020 | 20000 | 20020 | 40000 | 21519 | 20000 | 10010
30024 | 132965 | 41529 | 21529 | 20000 | 10010 | 20000 | 2520730 | 2370481 | 30010 | 10020 | 20000 | 20020 | 40000 | 21519 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21259 | 20000 | 10010
30024 | 130466 | 41231 | 21231 | 20000 | 10010 | 20000 | 2470374 | 2325224 | 30010 | 10020 | 20000 | 20020 | 40000 | 21220 | 20000 | 10010
30024 | 130454 | 41230 | 21230 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21102 | 20000 | 10010
30024 | 129764 | 41278 | 21278 | 20000 | 10010 | 20000 | 2455686 | 2312561 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20050 | 2392361 | 2257216 | 30088 | 10048 | 20056 | 20076 | 40110 | 21208 | 20000 | 10010
30024 | 129761 | 41263 | 21263 | 20000 | 10010 | 20000 | 2455279 | 2312541 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010