Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (post-index, S)

Test 1: uops

Code:

  str s0, [x6], #0x10

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
100514082059104110181040100052131817620001000200010011000
100411672001100110001000100048131881120001000200010011000
100411452001100110001000100048131877520001000200010011000
100411122001100110001000100048131852320001000200010011000
100411332001100110001000100048131888320001000200010011000
100411622001100110001000100048131926120001000200010011000
100411662001100110001000100048131891920001000200010011000
100411842001100110001000100048051888320001000200010011000
100412192001100110001000100047851875720001000200010011000
100411192001100110001000100047851902720001000200010011000

Test 2: Latency 3->3

Code:

  str s0, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1328

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10214146272069810518101801051810005444111922542011420010010200200161000410000100
10204112762010410104100001010410001436221921962010520010008200200161000410000100
10204112562010510105100001010810001436341915302010520010008200200161000410000100
10204112972010310103100001010410002436191926252010620010008200200241000510000100
10204112982010410104100001010410001436431913502010520010008200200241000510000100
10204112522010410104100001010410002436501925232010620010008200200161000410000100
10204112522010410104100001010410004436481907182011220010012200200241000510000100
10204112662010410104100001010410001436211929462010520010008200200081000110000100
10204113282010410104100001010410001436341920702010520010008200200161000310000100
10204115322021910183100361018510002436401906252010620010008200200161000410000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1209

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10034153212061110431101801043210003441801924122001920100102020000100011000010
10024111512001110011100001001010000431021921432001020100002020000100011000010
10024112042001110011100001001010000431051910992001020100002020000100011000010
10024111772001110011100001001010000431061907212001020100002020000100011000010
10024111962001110011100001001010000431061899652001020100002020000100011000010
10024111612001110011100001001010000431061900552001020100002020000100011000010
10024111592001110011100001001010000431041907032001020100002020000100011000010
10024111472001110011100001001010000431021914232001020100002020000100011000010
10024111662001110011100001001010000431021912972001020100002020000100011000010
10024112962001110011100001001010000431041903972001020100002020000100011000010

Test 3: throughput

Count: 8

Code:

  str s0, [x6], #0x10
  str s0, [x7], #0x10
  str s0, [x8], #0x10
  str s0, [x9], #0x10
  str s0, [x10], #0x10
  str s0, [x11], #0x10
  str s0, [x12], #0x10
  str s0, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8021482075160692805128018080511800022403121360211160106200800082001600168000580000100
8020480056160105801058000080104800022403121360211160106200800082001600168000580000100
8020580138160155801388001780142800022403121360211160106200800082001600168000580000100
8020480056160105801058000080104800022403121360211160106200800082001600968003780000100
8020480056160105801058000080104800022403121360211160106200800082001600168000580000100
8020480056160105801058000080104800022403121360211160106200800082001600168000580000100
8020480056160105801058000080104800022403121360391160106200800082001600168000580000100
8020480056160105801058000080104800022403121360211160106200800082001600168000580000100
8020480056160105801058000080104800022403121360211160106200800082001600968004080000100
8020480152160163801458001880144800022403121360211160106200800082001600168000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8003482318160612804328018080432800022400421360157160016208000820160016800058000010
8002480053160015800158000080014800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002480045160011800118000080010800002400301359991160010208000020160000800018000010
8002580110160064800478001780050800002400301360151160010208000020160000800018000010