Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (post-index, Q)

Test 1: uops

Code:

  str q0, [x6], #0x10

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
100514102059104110181040100048011820520001000200010011000
100411272001100110001000100046611845120001000200010011000
100411232001100110001000100048131850520001000200010011000
100411862001100110001000100048171944120001000200010011000
100411202001100110001000100048131931520001000200010011000
100411182001100110001000100048131837920001000200010011000
100411502001100110001000100048171947720001000200010011000
100411332001100110001000100048131859520001000200010011000
100411422001100110001000100048171926120001000200010011000
100411182001100110001000100048171872120001000200010011000

Test 2: Latency 3->3

Code:

  str q0, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1427

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10214153162069310513101801051210002443311951302010820010010200200201000310000100
10204114232010410104100001010410001434711944022010520010008200200161000410000100
10204114302010410104100001010410001434911947802010520010008200200161000410000100
10204114412010410104100001010410001434931950142010520010008200200161000410000100
10204114252010410104100001010410001434921942942010520010008200200161000410000100
10204114232010410104100001010410001434871945822010520010008200200161000410000100
10204114332010410104100001010410001434721944562010520010008200200161000410000100
10204114402010410104100001010410001434971947982010520010008200200161000410000100
10204114342010410104100001010410001434911949782010520010008200200161000410000100
10204114132010410104100001010410001434891947982010520010008200200161000410000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1378

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10034154042061110431101801043210002446531940502001620100082020000100011000010
10024114002001110011100001001010000431251939892001020100002020000100011000010
10024113802001110011100001001010000431161945632001020100002020000100011000010
10024113682001110011100001001010000431231940232001020100002020000100011000010
10024113732001110011100001001010000431241937892001020100002020000100011000010
10024113692001110011100001001010000431241938792001020100002020000100011000010
10024113712001110011100001001010000431241936452001020100002020000100011000010
10024113622001110011100001001010000431221945632001020100002020000100011000010
10024113722001110011100001001010000431241940952001020100002020000100011000010
10024113812001110011100001001010000431241936992001020100002020000100011000010

Test 3: throughput

Count: 8

Code:

  str q0, [x6], #0x10
  str q0, [x7], #0x10
  str q0, [x8], #0x10
  str q0, [x9], #0x10
  str q0, [x10], #0x10
  str q0, [x11], #0x10
  str q0, [x12], #0x10
  str q0, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8021482451160692805128018080511800022403121360162160106200800082001600168000580000100
8020480056160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600968003780000100
8020480063160105801058000080104800022403121359979160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020580103160154801378001780143800022403121360051160106200800082001600968003780000100
8020480056160105801058000080104800022403121362879160106200800082001600168000580000100
8020480048160105801058000080104800022403121360002160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0012

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8003482262160612804328018080432800022400421360147160016208000820160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160016800058000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360187160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010