Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEOR (32-bit)

Test 1: uops

Code:

  steor w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73006346303039102520141007200077701052130001000200020004000100320001000
73004341723003100320001000200077621051330001000200020004000100220001000
73004340973002100220001000200077621051330001000200020004000100220001000
73004341113002100220001000200077621051330001000200020004000100220001000
73004341133002100220001000200077621051330001000200020004000100220001000
73004341153002100220001000200077621051330001000200020004000100220001000
73004340983002100220001000200077621051330001000200020004000100220001000
73004341003002100220001000200077621051330001000200020004000100220001000
73004340963002100220001000200077621051330001000200020004000100220001000
73004340953002100220001000200077621051330001000200020004000100220001000

Test 2: throughput

Code:

  steor w0, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
402083073340382202942008820233200071159931063944011420207200073021140013200102000020100
402043006340110201082000220104200041160971063274010820204200043021140013200102000020100
402043006340110201082000220104200041160881063094010820204200043020640008200082000020100
402043006340110201082000220104200041160951063214010820204200043020640008200082000020100
402043006340110201082000220104200041160981063294010820204200043020640008200082000020100
402043006340110201082000220104200041161001063334010820204200043020640008200082000020100
402043006340110201082000220104200041160871063094010820204200043020640008200092000020100
402043006340110201082000220104200041160941063214010820204200043020640008200082000020100
402043006340110201082000220104200041160901063114010820204200043020640008200082000020100
402043006340110201082000220104200041160851063034010820204200043020640008200082000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0066

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
400283059840305202142009120134200041158951063304001820024200043002040000200072000020010
400243006340017200172000020010200001158891063274001020020200003002040000200072000020010
400243006340017200172000020010200001158861063324001020020200003002040000200072000020010
400243006340017200172000020010200001158821063244001020020200003002040000200072000020010
400243006340017200172000020010200001158691062964001020020200003002040000200072000020010
400243006340017200172000020010200001158891063404001020020200003002040000200072000020010
400243006340017200172000020010200001158901063404001020020200003002040000200072000020010
400243006340017200172000020010200001158861063324001020020200003002040000200082000020010
400243006340018200182000020010200001158351062404001020020200003002040000200082000020010
400243006340018200182000020010200381164421088644008620058200383002040000200082000020010

Test 3: throughput

Code:

  steor w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9761

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30206131806416842161520069101452000024665512327626301001020020000202004000002140820000010100
30204130606415132150820005101002000023825412252434301001020020000202004000002110220000010100
30204126476412022120220000101002000023826672252542301001020020000202004000002110220000010100
30204126489412022120220000101002000023828752252722301001020020000202584011002074320000010100
30204126476412022120220000101002000023825412252434301001020020000202004000002109820000010100
30204126476412022120220000101002000023827242252596301001020020000202004000002110220000010100
30204126708415332117820355103162000022178602118061301001020020000202004000002110320000010100
30204126245411832117220011101002000023821212252057301001020020000202004000002110120000010100
30204126476412022120220000101002000023825412252434301001020020000202004000002110220000010100
30204126476412022120220000101002000023825412252434301001020020000202004000002110220000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30025132252415342149720037100412000024538572310541030010100202000002002040000213412000010010
30024129613413512135120000100102000024537552310415030010100202000002002040000213412000010010
30024129613413512135120000100102000024537552310415030010100202000002002040000213412000010010
30024129613413512135120000100102005424629442318491030095100512006102002040000212732000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102005123792382245505030091100502005802002040000212732000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002007640110211952000010010
30024129761412832128320000100102000024549272311979030010100202000002002040000212722000010010
30024129754412822128220000100102000024561432312957030010100202000002002040000212722000010010