Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STCLR (64-bit)

Test 1: uops

Code:

  stclr x0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005 | 34447 | 3019 | 1015 | 2004 | 1002 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34124 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34415 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34386 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34154 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34118 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34113 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34113 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34153 | 3003 | 1003 | 2000 | 1000 | 2002 | 7775 | 10530 | 3003 | 1001 | 2002 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34123 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000

Test 2: throughput

Code:

  stclr x0, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40209 | 31055 | 40531 | 20430 | 20101 | 20261 | 20007 | 116768 | 107794 | 40114 | 20207 | 20007 | 30211 | 40013 | 20010 | 20000 | 20100
40204 | 30066 | 40115 | 20110 | 20005 | 20107 | 20004 | 116837 | 107646 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116839 | 107648 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116814 | 107628 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116839 | 107650 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116821 | 107638 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116832 | 107648 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116828 | 107646 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116803 | 107606 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30066 | 40110 | 20108 | 20002 | 20104 | 20004 | 116810 | 107620 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40029 | 30829 | 40402 | 20305 | 20097 | 20164 | 20004 | 116587 | 107585 | 40018 | 20024 | 20004 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30066 | 40017 | 20017 | 20000 | 20010 | 20000 | 116578 | 107579 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30066 | 40017 | 20017 | 20000 | 20010 | 20000 | 116679 | 107776 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30089 | 40020 | 20018 | 20002 | 20014 | 20000 | 116473 | 107483 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116487 | 107511 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116483 | 107503 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116483 | 107503 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116401 | 107350 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116472 | 107481 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30063 | 40017 | 20017 | 20000 | 20010 | 20000 | 116472 | 107481 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010

Test 3: throughput

Code:

  stclr x0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30205 | 130291 | 41394 | 21342 | 20052 | 10134 | 20000 | 2450042 | 2311933 | 30100 | 10200 | 20000 | 20260 | 40114 | 20679 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30205 | 129828 | 41453 | 21415 | 20038 | 10121 | 20000 | 2449838 | 2311764 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21309 | 20000 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 21102 | 20000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30025 | 146993 | 42775 | 22745 | 20030 | 10035 | 20000 | 2456245 | 2313083 | 30010 | 10020 | 20000 | 20020 | 40000 | 21273 | 20000 | 10010
30024 | 129746 | 41245 | 21245 | 20000 | 10010 | 20000 | 2456245 | 2313047 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30025 | 125167 | 40879 | 20833 | 20046 | 10040 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20050 | 2456199 | 2313105 | 30088 | 10048 | 20056 | 20020 | 40000 | 21243 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456143 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21272 | 20000 | 10010