Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STLLRB

Test 1: uops

Code:

  stllrb w0, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

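| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1005 | 6159 | 1019 | 1 | 1018 | 1000 | 104730 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6065 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |
| 1004 | 6058 | 1001 | 1 | 1000 | 1000 | 104588 | 1000 | 1000 | 2000 | 1 | 1000 |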

Test 2: throughput

Code:

  stllrb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0076

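| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 20206 | 60316 | 20212 | 10176 | 10036 | 10175 | 10000 | 35662 | 1068022 | 20101 | 10203 | 10003 | 10203 | 20006 | 10002 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35662 | 1067896 | 20101 | 10203 | 10003 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35658 | 1067896 | 20100 | 10202 | 10002 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35658 | 1067896 | 20100 | 10202 | 10002 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35658 | 1067896 | 20100 | 10202 | 10002 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10031 | 36189 | 1069140 | 20163 | 10234 | 10034 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35658 | 1067896 | 20100 | 10202 | 10002 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35658 | 1067896 | 20100 | 10202 | 10002 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20205 | 60146 | 20151 | 10133 | 10018 | 10132 | 10000 | 35490 | 1067821 | 20101 | 10203 | 10004 | 10202 | 20004 | 10001 | 10000 | 10100 |
| 20204 | 60072 | 20101 | 10101 | 10000 | 10100 | 10000 | 35658 | 1067896 | 20100 | 10202 | 10002 | 10202 | 20004 | 10001 | 10000 | 10100 |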

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0072

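| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 20026 | 60315 | 20120 | 10084 | 10036 | 10083 | 10000 | 35433 | 1068022 | 20010 | 10022 | 10002 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60079 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20025 | 60146 | 20061 | 10043 | 10018 | 10042 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067896 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |
| 20024 | 60072 | 20011 | 10011 | 10000 | 10010 | 10000 | 35433 | 1067932 | 20010 | 10020 | 10000 | 10020 | 20000 | 10001 | 10000 | 10010 |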

Test 3: throughput

Code:

  stllrb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0063

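| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 10205 | 60152 | 10119 | 101 | 10018 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20104 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |
| 10204 | 60065 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 1067730 | 10100 | 200 | 10004 | 200 | 20008 | 1 | 10000 | 100 |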

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0063

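| retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 10024 | 60065 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067730 | 10010 | 20 | 10004 | 20 | 20008 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60064 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |
| 10024 | 60063 | 10011 | 11 | 10000 | 10 | 10000 | 30 | 1067694 | 10010 | 20 | 10000 | 20 | 20000 | 1 | 10000 | 10 |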