Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (pre-index, S)

Test 1: uops

Code:

  str s0, [x6, #0x10]!

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
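1005 | 1694 | 2059 | 1041 | 1018 | 1040 | 1000 | 4637 | 18253 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1116 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18487 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1113 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18145 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1105 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18019 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1110 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18721 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1137 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18433 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1104 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18391 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1115 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18469 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1135 | 2001 | 1001 | 1000 | 1000 | 1000 | 4637 | 18037 | 2000 | 1000 | 2000 | 1001 | 1000
1004 | 1128 | 2001 | 1001 | 1000 | 1000 | 1000 | 4641 | 19404 | 2000 | 1000 | 2000 | 1001 | 1000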

Test 2: Latency 3->3

Code:

  str s0, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1386

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
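10214 | 16823 | 20703 | 10523 | 0 | 10180 | 10524 | 0 | 10003 | 44433 | 192994 | 20109 | 200 | 10010 | 200 | 20020 | 10005 | 10000 | 100
10204 | 11394 | 20104 | 10104 | 0 | 10000 | 10104 | 0 | 10003 | 43704 | 194233 | 20109 | 200 | 10010 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11438 | 20103 | 10103 | 0 | 10000 | 10104 | 0 | 10001 | 43489 | 193953 | 20105 | 200 | 10008 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11375 | 20104 | 10104 | 0 | 10000 | 10104 | 0 | 10001 | 43479 | 194270 | 20105 | 200 | 10008 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11402 | 20104 | 10104 | 0 | 10000 | 10104 | 0 | 10001 | 43491 | 194404 | 20105 | 200 | 10008 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11349 | 20104 | 10104 | 0 | 10000 | 10104 | 0 | 10001 | 43510 | 193062 | 20105 | 200 | 10008 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11547 | 20161 | 10143 | 0 | 10018 | 10146 | 0 | 10000 | 43473 | 193973 | 20100 | 200 | 10004 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11411 | 20104 | 10104 | 0 | 10000 | 10104 | 0 | 10002 | 43492 | 194246 | 20106 | 200 | 10008 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11386 | 20101 | 10101 | 0 | 10000 | 10103 | 0 | 10002 | 43504 | 194120 | 20106 | 200 | 10008 | 200 | 20016 | 10004 | 10000 | 100
10204 | 11390 | 20104 | 10104 | 0 | 10000 | 10104 | 0 | 10002 | 43465 | 193597 | 20106 | 200 | 10008 | 200 | 20016 | 10003 | 10000 | 100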

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1244

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
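10034 | 16065 | 20611 | 10431 | 10180 | 10432 | 10003 | 44596 | 192322 | 20019 | 20 | 10010 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11257 | 20011 | 10011 | 10000 | 10010 | 10000 | 43096 | 193285 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11270 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 191809 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11212 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 192115 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11221 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 192457 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11212 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 191683 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11263 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 191539 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11258 | 20011 | 10011 | 10000 | 10010 | 10000 | 43095 | 192565 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11207 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 191701 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10
10024 | 11198 | 20011 | 10011 | 10000 | 10010 | 10000 | 43094 | 191989 | 20010 | 20 | 10000 | 20 | 20000 | 10001 | 10000 | 10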

Test 3: throughput

Count: 8

Code:

  str s0, [x6, #0x10]!
  str s0, [x7, #0x10]!
  str s0, [x8, #0x10]!
  str s0, [x9, #0x10]!
  str s0, [x10, #0x10]!
  str s0, [x11, #0x10]!
  str s0, [x12, #0x10]!
  str s0, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
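80214 | 82040 | 160692 | 80512 | 0 | 80180 | 80511 | 0 | 80002 | 240312 | 1360108 | 160106 | 200 | 80008 | 200 | 160096 | 80037 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
84864 | 110245 | 164054 | 82528 | 47 | 81479 | 82346 | 33 | 80002 | 240312 | 1360105 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
80204 | 80065 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160016 | 80005 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80035 | 240419 | 1360985 | 160175 | 200 | 80048 | 200 | 160100 | 80038 | 80000 | 100
80204 | 80053 | 160105 | 80105 | 0 | 80000 | 80104 | 0 | 80002 | 240312 | 1360157 | 160106 | 200 | 80008 | 200 | 160096 | 80037 | 80000 | 100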

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
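80034 | 82071 | 160612 | 80432 | 80180 | 80432 | 80003 | 240048 | 1360652 | 160019 | 20 | 80010 | 20 | 160016 | 80005 | 80000 | 0 | 10
80024 | 80083 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360691 | 160010 | 20 | 80000 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80083 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360691 | 160010 | 20 | 80000 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80083 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360691 | 160010 | 20 | 80000 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80083 | 160011 | 80011 | 80000 | 80010 | 80002 | 240042 | 1363163 | 160016 | 20 | 80008 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80197 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1362743 | 160010 | 20 | 80000 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80197 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1362743 | 160010 | 20 | 80000 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80197 | 160011 | 80011 | 80000 | 80010 | 80002 | 240042 | 1360697 | 160016 | 20 | 80008 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80083 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360691 | 160010 | 20 | 80000 | 20 | 160000 | 80001 | 80000 | 0 | 10
80024 | 80083 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 1360691 | 160010 | 20 | 80000 | 20 | 160096 | 80037 | 80000 | 0 | 10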