Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STP (post-index, 64-bit)

Test 1: uops

Code:

  stp x0, x1, [x6], #8

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005 | 1507 | 2059 | 1041 | 1018 | 1040 | 1000 | 4657 | 19081 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1135 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 19081 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1156 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 18116 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1151 | 2001 | 1001 | 1000 | 1000 | 1000 | 4661 | 18451 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1156 | 2001 | 1001 | 1000 | 1000 | 1000 | 4661 | 18937 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1158 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 19063 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1154 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 18055 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1155 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 19027 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1158 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 19009 | 2000 | 1000 | 3000 | 1001 | 1000
1004 | 1181 | 2001 | 1001 | 1000 | 1000 | 1000 | 4657 | 19063 | 2000 | 1000 | 3000 | 1001 | 1000

Test 2: Latency 3->3

Code:

  stp x0, x1, [x6], #8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0600

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10209 | 11871 | 20404 | 10314 | 10090 | 10314 | 10003 | 44673 | 180469 | 20109 | 200 | 10010 | 200 | 30030 | 10005 | 10000 | 100
10204 | 10601 | 20104 | 10104 | 10000 | 10104 | 10002 | 43518 | 180641 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10644 | 20104 | 10104 | 10000 | 10104 | 10002 | 43516 | 181307 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10645 | 20104 | 10104 | 10000 | 10104 | 10002 | 43518 | 180641 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10619 | 20104 | 10104 | 10000 | 10104 | 10002 | 43518 | 180767 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10639 | 20104 | 10104 | 10000 | 10104 | 10002 | 43517 | 180803 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10619 | 20104 | 10104 | 10000 | 10104 | 10002 | 43517 | 180335 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10650 | 20104 | 10104 | 10000 | 10104 | 10002 | 43519 | 180713 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10647 | 20104 | 10104 | 10000 | 10104 | 10002 | 43518 | 180965 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100
10204 | 10630 | 20104 | 10104 | 10000 | 10104 | 10002 | 43517 | 180821 | 20106 | 200 | 10008 | 200 | 30024 | 10004 | 10000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0647

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10029 | 11855 | 20307 | 10217 | 10090 | 10218 | 10002 | 42985 | 180857 | 20016 | 20 | 10008 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10635 | 20011 | 10011 | 10000 | 10010 | 10000 | 42969 | 180923 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10642 | 20011 | 10011 | 10000 | 10010 | 10000 | 42973 | 181013 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10635 | 20011 | 10011 | 10000 | 10010 | 10000 | 42966 | 180941 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10654 | 20011 | 10011 | 10000 | 10010 | 10000 | 42968 | 180635 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10675 | 20011 | 10011 | 10000 | 10010 | 10000 | 42978 | 181247 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10659 | 20011 | 10011 | 10000 | 10010 | 10000 | 42978 | 180401 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10623 | 20011 | 10011 | 10000 | 10010 | 10000 | 42977 | 181103 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10
10024 | 10663 | 20011 | 10011 | 10000 | 10010 | 10000 | 42982 | 180721 | 20010 | 20 | 10000 | 20 | 30030 | 10005 | 10000 | 10
10024 | 10661 | 20014 | 10014 | 10000 | 10014 | 10000 | 42933 | 180941 | 20010 | 20 | 10000 | 20 | 30000 | 10001 | 10000 | 10

Test 3: throughput

Count: 8

Code:

  stp x0, x1, [x6], #8
  stp x0, x1, [x7], #8
  stp x0, x1, [x8], #8
  stp x0, x1, [x9], #8
  stp x0, x1, [x10], #8
  stp x0, x1, [x11], #8
  stp x0, x1, [x12], #8
  stp x0, x1, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0139

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209 | 81964 | 160401 | 80311 | 80090 | 80311 | 80003 | 240318 | 1378616 | 160109 | 200 | 80010 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81085 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1378704 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81073 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1378501 | 160106 | 200 | 80008 | 200 | 240144 | 80037 | 80000 | 100
80204 | 81076 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1378501 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81076 | 160105 | 80105 | 80000 | 80104 | 80035 | 240419 | 1379120 | 160175 | 200 | 80048 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81073 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1378501 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81073 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1378501 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81073 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1378501 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81073 | 160105 | 80105 | 80000 | 80104 | 80035 | 240419 | 1379133 | 160175 | 200 | 80048 | 200 | 240024 | 80005 | 80000 | 100
80204 | 81145 | 160105 | 80105 | 80000 | 80104 | 80002 | 240312 | 1379208 | 160106 | 200 | 80008 | 200 | 240024 | 80005 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0139

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029 | 82037 | 160305 | 80215 | 80090 | 80214 | 80002 | 240042 | 1378661 | 160016 | 20 | 80008 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80035 | 240149 | 1379151 | 160085 | 20 | 80048 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10
80024 | 81073 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1378495 | 160010 | 20 | 80000 | 20 | 240000 | 80001 | 80000 | 10