Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STNP (32-bit)

Test 1: uops

Code:

  stnp w0, w1, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

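retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005 | 1547 | 1019 | 1 | 1018 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 1061 | 1001 | 1 | 1000 | 1000 | 17279 | 1000 | 1000 | 3000 | 1 | 1000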

Test 2: throughput

Code:

  stnp w0, w1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0038

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20209 | 11234 | 20404 | 10314 | 10090 | 10315 | 10003 | 120713 | 169980 | 20110 | 10209 | 10009 | 10207 | 30021 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100
20204 | 10038 | 20101 | 10101 | 10000 | 10108 | 10003 | 155395 | 169931 | 20111 | 10208 | 10008 | 10208 | 30024 | 10001 | 10000 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
20029 | 10880 | 20313 | 10223 | 10090 | 10224 | 10003 | 62294 | 170417 | 20020 | 10027 | 10008 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10056 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10059 | 30117 | 10032 | 10000 | 10010
20024 | 10047 | 20011 | 10011 | 10000 | 10010 | 10000 | 92917 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010
20024 | 10042 | 20011 | 10011 | 10000 | 10010 | 10000 | 102186 | 170083 | 20010 | 10020 | 10000 | 10020 | 30000 | 10001 | 10000 | 10010

Test 3: throughput

Code:

  stnp w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0401

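retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10205 | 10149 | 10119 | 101 | 10018 | 100 | 10001 | 300 | 176544 | 10101 | 200 | 10008 | 202 | 30126 | 2 | 10000 | 100
10204 | 10408 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176366 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176544 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176402 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10408 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176544 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176402 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176402 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 180146 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176402 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100
10204 | 10401 | 10101 | 101 | 10000 | 100 | 10001 | 300 | 176402 | 10101 | 200 | 10008 | 200 | 30024 | 1 | 10000 | 100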

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0408

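retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10025 | 10146 | 10029 | 11 | 10018 | 10 | 10001 | 30 | 176508 | 10011 | 20 | 10008 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30024 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10
10024 | 10408 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 176541 | 10010 | 20 | 10000 | 20 | 30000 | 1 | 10000 | 10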