Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STCLRB

Test 1: uops

Code:

  stclrb w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005 | 34773 | 3018 | 1014 | 2004 | 1002 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34693 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34531 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34754 | 3003 | 1003 | 2000 | 1000 | 2000 | 7769 | 10520 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34095 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34092 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34544 | 3003 | 1003 | 2000 | 1000 | 2000 | 7770 | 10521 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34319 | 3006 | 1006 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34127 | 3003 | 1003 | 2000 | 1000 | 2002 | 7772 | 10526 | 3003 | 1001 | 2002 | 2000 | 4000 | 1003 | 2000 | 1000
73004 | 34463 | 3003 | 1003 | 2000 | 1000 | 2000 | 7767 | 10518 | 3000 | 1000 | 2000 | 2000 | 4000 | 1003 | 2000 | 1000

Test 2: throughput

Code:

  stclrb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206 | 30410 | 40261 | 20216 | 20045 | 20169 | 20007 | 115761 | 105933 | 40114 | 20207 | 20007 | 30211 | 40013 | 20010 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115909 | 105918 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30063 | 40114 | 20112 | 20002 | 20104 | 20038 | 114658 | 108749 | 40176 | 20238 | 20038 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115875 | 105844 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30063 | 40114 | 20112 | 20002 | 20104 | 20036 | 112670 | 110218 | 40172 | 20236 | 20036 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115887 | 105878 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115879 | 105862 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115895 | 105892 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115895 | 105894 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100
40204 | 30063 | 40114 | 20112 | 20002 | 20104 | 20004 | 115915 | 105932 | 40108 | 20204 | 20004 | 30206 | 40008 | 20012 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40026 | 30378 | 40163 | 20121 | 20042 | 20074 | 20004 | 115743 | 105956 | 40018 | 20024 | 20004 | 30020 | 40000 | 20008 | 20000 | 20010
40024 | 30066 | 40018 | 20018 | 20000 | 20010 | 20000 | 115700 | 105875 | 40010 | 20020 | 20000 | 30020 | 40000 | 20008 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115695 | 105865 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115707 | 105889 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115697 | 105873 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115718 | 105911 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115703 | 105880 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115712 | 105899 | 40010 | 20020 | 20000 | 30020 | 40000 | 20008 | 20000 | 20010
40025 | 30131 | 40097 | 20061 | 20036 | 20048 | 20000 | 115718 | 105918 | 40010 | 20020 | 20000 | 30020 | 40000 | 20008 | 20000 | 20010
40024 | 30066 | 40019 | 20019 | 20000 | 20010 | 20000 | 115670 | 105821 | 40010 | 20020 | 20000 | 30020 | 40000 | 20009 | 20000 | 20010

Test 3: throughput

Code:

  stclrb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9752

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30205 | 131942 | 41623 | 21565 | 20058 | 10132 | 20000 | 2365416 | 2238834 | 30100 | 10200 | 20000 | 20200 | 40000 | 21044 | 20000 | 10100
30204 | 125680 | 41149 | 21144 | 20005 | 10100 | 20000 | 2365424 | 2238835 | 30100 | 10200 | 20000 | 20200 | 40000 | 21044 | 20000 | 10100
30204 | 125673 | 41148 | 21143 | 20005 | 10100 | 20000 | 2365424 | 2238835 | 30100 | 10200 | 20000 | 20200 | 40000 | 21044 | 20000 | 10100
30204 | 125673 | 41148 | 21143 | 20005 | 10100 | 20000 | 2365424 | 2238835 | 30100 | 10200 | 20000 | 20200 | 40000 | 21044 | 20000 | 10100
30204 | 125673 | 41148 | 21143 | 20005 | 10100 | 20000 | 2395030 | 2264844 | 30100 | 10200 | 20000 | 20200 | 40000 | 21209 | 20000 | 10100
30204 | 125693 | 41146 | 21141 | 20005 | 10100 | 20185 | 2442646 | 2308178 | 30391 | 10306 | 20206 | 20200 | 40000 | 21044 | 20000 | 10100
30204 | 125676 | 41148 | 21143 | 20005 | 10100 | 20000 | 2365424 | 2238835 | 30100 | 10200 | 20000 | 20200 | 40000 | 21102 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382541 | 2252434 | 30100 | 10200 | 20000 | 20200 | 40000 | 21102 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382142 | 2252074 | 30100 | 10200 | 20000 | 20200 | 40000 | 21102 | 20000 | 10100
30204 | 126469 | 41201 | 21201 | 20000 | 10100 | 20000 | 2382541 | 2252434 | 30100 | 10200 | 20000 | 20200 | 40000 | 21102 | 20000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30024 | 129325 | 41249 | 21244 | 20005 | 10015 | 20000 | 2513781 | 2363347 | 30010 | 10020 | 20000 | 20118 | 40194 | 21068 | 20000 | 10010
30024 | 129754 | 41288 | 21288 | 20000 | 10011 | 20000 | 2456030 | 2312867 | 30010 | 10020 | 20000 | 20078 | 40110 | 20932 | 20000 | 10010
30024 | 129767 | 41234 | 21234 | 20000 | 10010 | 20000 | 2456183 | 2312993 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010
30024 | 129754 | 41281 | 21281 | 20000 | 10010 | 20051 | 2382433 | 2248789 | 30091 | 10050 | 20058 | 20020 | 40000 | 21257 | 20000 | 10010
30024 | 129754 | 41281 | 21281 | 20000 | 10010 | 20000 | 2456145 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010
30024 | 129754 | 41281 | 21281 | 20000 | 10010 | 20000 | 2456145 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010
30024 | 129757 | 41279 | 21279 | 20000 | 10010 | 20000 | 2456093 | 2312921 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010
30024 | 129754 | 41281 | 21281 | 20000 | 10010 | 20000 | 2456145 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010
30025 | 124042 | 40687 | 20642 | 20045 | 10038 | 20000 | 2456145 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010
30024 | 129754 | 41281 | 21281 | 20000 | 10010 | 20000 | 2456145 | 2312957 | 30010 | 10020 | 20000 | 20020 | 40000 | 21271 | 20000 | 10010