Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, 32-bit)

Test 1: uops

Code:

  stp w0, w1, [x6], #8

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005 | 1640 | 2059 | 1041 | 1018 | 1040 | 1000 | 4761 | 17497 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1102 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 17479 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1066 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 17497 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1065 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 17407 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1075 | 2001 | 1001 | 1000 | 1000 | 1000 | 4801 | 17461 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1074 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 18091 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1065 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 17569 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1068 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 17569 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1068 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 17821 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1068 | 2001 | 1001 | 1000 | 1000 | 1000 | 4761 | 18037 | 2000 | 1000 | 3000 | 1001 | 1000

Test 2: Latency 3->3

Code:

  stp w0, w1, [x6], #8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0107

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10209 | 11211 | 20400 | 10310 | 10090 | 10310 | 10003 | 68367 | 171009 | 20109 | 200 | 10010 | 200 | 30030 | 10005 | 10000 | 100
10204 | 10105 | 20105 | 10105 | 10000 | 10106 | 10002 | 43550 | 171169 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10106 | 20104 | 10104 | 10000 | 10104 | 10002 | 43550 | 171151 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10205 | 10195 | 20155 | 10138 | 10017 | 10143 | 10002 | 43536 | 171641 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10121 | 20104 | 10104 | 10000 | 10104 | 10002 | 43554 | 171475 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10108 | 20104 | 10104 | 10000 | 10104 | 10002 | 43550 | 171187 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10107 | 20104 | 10104 | 10000 | 10104 | 10002 | 43550 | 171151 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10107 | 20104 | 10104 | 10000 | 10104 | 10002 | 43550 | 171151 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10107 | 20104 | 10104 | 10000 | 10104 | 10002 | 43550 | 171169 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10107 | 20104 | 10104 | 10000 | 10104 | 10002 | 43550 | 171151 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0090

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10029 | 11556 | 20304 | 10214 | 10090 | 10214 | 10002 | 101257 | 170760 | 20016 | 20 | 10008 | 20 | 30030 | 10002 | 10000 | 10
10024 | 10102 | 20014 | 10014 | 10000 | 10014 | 10000 | 43091 | 171235 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10110 | 20011 | 10011 | 10000 | 10010 | 10000 | 43095 | 171577 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10118 | 20011 | 10011 | 10000 | 10010 | 10000 | 43092 | 171163 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10101 | 20011 | 10011 | 10000 | 10010 | 10000 | 43087 | 171145 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10101 | 20011 | 10011 | 10000 | 10010 | 10000 | 43092 | 170875 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10101 | 20011 | 10011 | 10000 | 10010 | 10000 | 43093 | 171019 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10102 | 20011 | 10011 | 10000 | 10010 | 10000 | 43092 | 171217 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10081 | 20011 | 10011 | 10000 | 10010 | 10000 | 43099 | 170677 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10082 | 20011 | 10011 | 10000 | 10010 | 10000 | 43097 | 170749 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10

Test 3: throughput

Count: 8

Code:

  stp w0, w1, [x6], #8
  stp w0, w1, [x7], #8
  stp w0, w1, [x8], #8
  stp w0, w1, [x9], #8
  stp w0, w1, [x10], #8
  stp w0, w1, [x11], #8
  stp w0, w1, [x12], #8
  stp w0, w1, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209 | 80901 | 160401 | 80311 | 80090 | 80311 | 80002 | 240312 | 1360108 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80205 | 80108 | 160154 | 80137 | 80017 | 80140 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80205 | 80108 | 160154 | 80137 | 80017 | 80140 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029 | 81146 | 160305 | 80215 | 80090 | 80214 | 80002 | 240042 | 1360121 | 160016 | 20 | 80008 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80078 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360205 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360205 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360223 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360205 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360205 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360205 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80025 | 80112 | 160065 | 80048 | 80017 | 80052 | 80000 | 240030 | 1360187 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360205 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 80056 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360115 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10