Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STNP (signed offset, Q)

Test 1: uops

Code:

  stnp q0, q1, [x6, #0x10]
  nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

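| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 9005 | 4129 | 2019 | 1 | 2018 | 2000 | 8781 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2389 | 2001 | 1 | 2000 | 2000 | 9211 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2302 | 2001 | 1 | 2000 | 2000 | 9197 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2302 | 2001 | 1 | 2000 | 2000 | 9197 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2309 | 2001 | 1 | 2000 | 2000 | 9197 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2302 | 2001 | 1 | 2000 | 2000 | 9197 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2302 | 2001 | 1 | 2000 | 2000 | 9197 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2302 | 2001 | 1 | 2000 | 2000 | 8950 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2310 | 2001 | 1 | 2000 | 2000 | 9182 | 2000 | 2000 | 4000 | 1 | 2000 |
| 9004 | 2303 | 2001 | 1 | 2000 | 2000 | 9243 | 2000 | 2000 | 4000 | 1 | 2000 |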

Test 2: throughput

Count: 8

Code:

  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  stnp q0, q1, [x6, #0x10]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0747

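| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 160205 | 160146 | 160119 | 101 | 0 | 160018 | 100 | 0 | 160001 | 300 | 2827668 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 166028 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160001 | 300 | 2827704 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160206 | 166080 | 160135 | 101 | 0 | 160034 | 100 | 0 | 160001 | 300 | 2827740 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 166028 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160001 | 300 | 2827704 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160205 | 166049 | 160118 | 101 | 0 | 160017 | 100 | 0 | 160001 | 300 | 2826804 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 165978 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160001 | 300 | 2826804 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 165978 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160001 | 300 | 2826804 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 165978 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160034 | 300 | 2826968 | 0 | 160134 | 200 | 160048 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 165978 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160001 | 300 | 2826804 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |
| 160204 | 165978 | 160101 | 101 | 0 | 160000 | 100 | 0 | 160001 | 300 | 2826804 | 0 | 160101 | 200 | 160008 | 0 | 200 | 320016 | 0 | 1 | 160000 | 0 | 100 |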

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0753

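| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 160025 | 160154 | 160029 | 11 | 0 | 160018 | 10 | 0 | 160001 | 30 | 2827562 | 0 | 160011 | 20 | 160008 | 0 | 20 | 320016 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827559 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320000 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160034 | 30 | 2827812 | 0 | 160044 | 20 | 160048 | 0 | 20 | 320080 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827499 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320000 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827559 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320000 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827559 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320096 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827559 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320096 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827559 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320000 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160000 | 30 | 2827559 | 0 | 160010 | 20 | 160000 | 0 | 20 | 320000 | 1 | 160000 | 0 | 10 |
| 160024 | 166021 | 160011 | 11 | 0 | 160000 | 10 | 0 | 160034 | 30 | 2827546 | 0 | 160044 | 20 | 160048 | 0 | 20 | 320000 | 1 | 160000 | 0 | 10 |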