Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STXP (64-bit)

Test 1: uops

Code:

  stxp w0, x1, x2, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

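| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 71005 | 34146 | 1003 | 1 | 1002 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 33883 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 33860 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 33863 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 33857 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 33859 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 33876 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 34055 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 34119 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |
| 71004 | 34099 | 1001 | 1 | 1000 | 1000 | 4000 | 1000 | 1000 | 3000 | 1 | 1000 |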

Test 2: throughput

Code:

  stxp w0, x1, x2, [x6]
  add x6, x6, 16

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.5087

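| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20214 | 26915 | 20560 | 10380 | 10180 | 10379 | 10004 | 35473 | 274714 | 20108 | 10204 | 10004 | 10203 | 30009 | 10004 | 10000 | 10100 |
| 20204 | 25089 | 20101 | 10101 | 10000 | 10102 | 10002 | 35467 | 274300 | 20104 | 10202 | 10002 | 10202 | 30006 | 10003 | 10000 | 10100 |
| 20204 | 25173 | 20103 | 10103 | 10000 | 10102 | 10004 | 35473 | 275535 | 20108 | 10204 | 10004 | 10202 | 30006 | 10003 | 10000 | 10100 |
| 20204 | 25193 | 20103 | 10103 | 10000 | 10104 | 10002 | 35467 | 274852 | 20104 | 10202 | 10002 | 10202 | 30006 | 10001 | 10000 | 10100 |
| 20204 | 25146 | 20103 | 10103 | 10000 | 10102 | 10002 | 35467 | 274599 | 20104 | 10202 | 10002 | 10202 | 30006 | 10003 | 10000 | 10100 |
| 20204 | 25198 | 20101 | 10101 | 10000 | 10102 | 10002 | 35467 | 274847 | 20104 | 10202 | 10002 | 10202 | 30006 | 10003 | 10000 | 10100 |
| 20204 | 25015 | 20103 | 10103 | 10000 | 10102 | 10002 | 35467 | 275296 | 20104 | 10202 | 10002 | 10204 | 30012 | 10003 | 10000 | 10100 |
| 20204 | 25064 | 20103 | 10103 | 10000 | 10102 | 10004 | 35473 | 274183 | 20108 | 10204 | 10004 | 10204 | 30012 | 10003 | 10000 | 10100 |
| 20204 | 25061 | 20103 | 10103 | 10000 | 10104 | 10004 | 35473 | 273675 | 20108 | 10204 | 10004 | 10204 | 30012 | 10003 | 10000 | 10100 |
| 20204 | 25105 | 20103 | 10103 | 10000 | 10102 | 10002 | 35467 | 273465 | 20104 | 10202 | 10002 | 10202 | 30006 | 10003 | 10000 | 10100 |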

1000 unrolls and 10 iterations

Result (median cycles for code): 2.5592

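| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20034 | 26774 | 20467 | 10287 | 10180 | 10288 | 10003 | 35252 | 280955 | 20016 | 10023 | 10003 | 10023 | 30009 | 10004 | 10000 | 10010 |
| 20024 | 25600 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 280746 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25552 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 280346 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25573 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 279996 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25590 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 280081 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25594 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 280088 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25633 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 279891 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25529 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 279310 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25583 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 279575 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010 |
| 20024 | 25593 | 20011 | 10011 | 10000 | 10010 | 10000 | 35235 | 279833 | 20010 | 10020 | 10000 | 10024 | 30012 | 10003 | 10000 | 10010 |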

Test 3: throughput

Code:

  stxp w0, x1, x2, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0047

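| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10205 | 20157 | 10119 | 101 | 10018 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30144 | 1 | 10000 | 100 |
| 10204 | 30049 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30047 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30047 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30047 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30047 | 10101 | 101 | 10000 | 100 | 10036 | 300 | 532363 | 10136 | 200 | 10050 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30058 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30047 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 529647 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |
| 10204 | 30065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528873 | 10100 | 200 | 10004 | 200 | 30018 | 1 | 10000 | 100 |
| 10204 | 30047 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 528855 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100 |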

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0047

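| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10025 | 20174 | 10029 | 11 | 10018 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10004 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30047 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30047 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30047 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30047 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30047 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10025 | 30087 | 10029 | 11 | 10018 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30052 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528963 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30062 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528927 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |
| 10024 | 30047 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 528855 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10 |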