Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STRB (post-index)

Test 1: uops

Code:

  strb w0, [x6], #8

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
100512762059104110181040100046411767720001000200010011000
100410812001100110001000100046411755120001000200010011000
100411102001100110001000100046411758720001000200010011000
100410802001100110001000100046411818120001000200010011000
100410822001100110001000100046411756920001000200010011000
100410732001100110001000100046411756920001000200010011000
100410772001100110001000100046411821720001000200010011000
100410772001100110001000100046371771320001000200010011000
100410782001100110001000100046411753320001000200010011000
100410852001100110001000100046411765920001000200010011000

Test 2: Latency 2->2

Code:

  strb w0, [x6], #8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0125

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10209115552040010310100901031010003722511710572010920010010200200201000510000100
10205101982015510137100181014010002435461718352010620010008200200161000410000100
10204101272010410104100001010410002436181714032010620010008200200161000310000100
10204101862010410104100001010410003477371720222010920010010200200201000510000100
10204101562010410104100001010410002435891719252010620010008200200161000410000100
10204101252010410104100001010410002436511714932010620010008200200161000410000100
10204101252010410104100001010410002436511714932010620010008200200161000410000100
10204101252010410104100001010410002436511714932010620010008200200161000410000100
10204101252010410104100001010410002436511714932010620010008200200161000410000100
10204101252010410104100001010410002436511714932010620010008200200161000410000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0111

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10029114082030410214100901021410002430051712412001620100082020020100051000010
10024101192001510015100001001610000429901713072001020100002020000100011000010
10024101182001110011100001001010000429921715232001020100002020000100011000010
10024101162001110011100001001010000429911712532001020100002020000100011000010
10024101062001110011100001001010000429671711812001020100002020000100011000010
10024101162001110011100001001010000429891714872001020100002020000100011000010
10024101112001110011100001001010000429911712712001020100002020000100011000010
10024101082001110011100001001010000429931711812001020100002020000100011000010
10024101402001110011100001001010000429731717392001020100002020000100011000010
10024101302001110011100001001010000429921718832001020100002020000100011000010

Test 3: throughput

Count: 8

Code:

  strb w0, [x6], #8
  strb w0, [x7], #8
  strb w0, [x8], #8
  strb w0, [x9], #8
  strb w0, [x10], #8
  strb w0, [x11], #8
  strb w0, [x12], #8
  strb w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
802098092216040180311800908031180003240318136006316010920080010200160016080005800000100
802048005616010580105800008010480002240312136070716010620080008200160016080005800000100
802048005316010580105800008010480002240312136015716010620080008200160016080005800000100
802048005316010580105800008010480002240312136015716010620080008200160016080005800000100
802048005316010580105800008010480002240312136015716010620080008200160340080166800000100
802048005316010580105800008010480002240312136015716010620080008200160016080005800000100
802048005316010580105800008010480002240312136015716010620080008200160016080005800000100
802048005316010580105800008010480002240312136015716010620080008200160016080005800000100
802048005316010580105800008010480002240312136015716010620080008200160016080005800000100
802048044116033780265800728026480035240419136082516017520080048200160016080005800000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8002980995160305802158009080214800022400421360319160016208000820160000800018000010
8002480056160011800118000080010800002400301360045160010208000020160000800018000010
8002580103160065800488001780052800002400301359996160010208000020160000800018000010
8002480048160011800118000080010800002400301360045160010208000020160000800018000010
8002480048160011800118000080010800002400301360045160010208000020160000800018000010
8002480048160011800118000080010800002400301360045160010208000020160000800018000010
8002480048160011800118000080010800002400301360045160010208000020160000800018000010
8002480048160011800118000080010800002400301360045160010208000020160096800378000010
8002480048160011800118000080010800352401491360665160085208004820160000800018000010
8002580108160064800478001780050800002400301360045160010208000020160000800018000010