Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEORB

Test 1: uops

Code:

  steorb w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005345543018101420041002200077701052130001000200020004000100320001000
73004342263003100320001000200077701052130001000200020004000100320001000
73004342423003100320001000200077701052130001000200020004000100320001000
73004342983003100320001000200077731052730001000200020004000100320001000
73004342203003100320001000200077701052130001000200020004000100320001000
73004342153003100320001000200077701052130001000200020004000100320001000
73004342003003100320001000200077701052130001000200020004000100320001000
73004342573003100320001000200077701052130001000200020004000100320001000
73005343983006100420021001200077701052130001000200020004000100320001000
73004342193003100320001000200077701052130001000200020024004100420001000

Test 2: throughput

Code:

  steorb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
402063036540254202112004320168200041157061061114010820204200043021140013200132000020100
402043006340114201122000220104200041158421058314010820204200043020640008200122000020100
402043006340114201122000220104200071158521060824011420207200073025440072200512000020100
402043006340114201122000220104200041158981058924010820204200043020640008200122000020100
402043028540113201112000220104200071158661060574011420207200073020640008200092000020100
402043006640111201092000220104200041159291059264010820204200043020640008200092000020100
402043006640111201092000220104200041159371059424010820204200043020640008200092000020100
402043006640111201092000220104200041159381059444010820204200043020640008200092000020100
402043006640111201092000220104200041159501059684010820204200043020640008200092000020100
402043006640111201092000220104200041159351059384010820204200043020640008200092000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
400273048740218201482007020104200041161361063784001820024200043002040000200082000020010
400243006640018200182000020010200001157231058874001020020200003002040000200082000020010
400243006640018200182000020010200001157061058954001020020200003002040000200082000020010
400243006640018200182000020010200001157351059114001020020200003002040000200082000020010
400243006640018200182000020010200001157331059054001020020200003002040000200082000020010
400253013240096200602003620048200001157141059114001020020200003002040000200082000020010
400243006640018200182000020010200001157031058454001020020200003002040000200082000020010
400243006640018200182000020010200001157251058894001020020200003002040000200082000020010
400243006640018200182000020010200001157301059014001020020200003002040000200082000020010
400243006640018200182000020010200001157371059154001020020200003002040000200082000020010

Test 3: throughput

Code:

  steorb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30206126776411892112020069101452000023688012242338030100102002000002025640109211502000010100
30204129822414272141920008101002000023688002242338030100102002000002020040000209832000010100
30204129754414092140920000101002000024499432311807030100102002000002020040000213092000010100
30204129754414092140920000101002000024499432311807030100102002000002020040000213092000010100
30204129754414092140920000101002000024499432311807030100102002000002020040000213092000010100
30204129754414092140920000101002004824791522338895030176102282005602020040000212732000010100
30204129826413822137320009101002000024490532312542030100102002000002020040000212732000010100
30204129826413822137320009101002000024490532312542030100102002000002020040000212732000010100
30204129754414092140920000101002000024499432311807030100102002000002020040000213092000010100
30204129754414092140920000101002000024499432311807030100102002000002020040000213092000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30026130460413292124620083100602000024561392312957030010100202000002002040000212722000010010
30024129749412812128120000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002006240085212792000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010