Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STRH (post-index)

Test 1: uops

Code:

  strh w0, [x6], #8

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
100513012059104110181040100046211776720001000200010011000
100410682001100110001000100047611749720001000200010011000
100410952001100110001000100047651738920001000200010011000
100410712001100110001000100047691798320001000200010011000
100410652001100110001000100047651755120001000200010011000
100412012001100110001000100046771753320001000200010011000
100410692001100110001000100046771893720001000200010011000
100411032001100110001000100046771832520001000200010011000
100411262001100110001000100046811796520001000200010011000
100410962001100110001000100046811809120001000200010011000

Test 2: Latency 2->2

Code:

  strh w0, [x6], #8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0095

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10209112182040010310100901031010003578781710882010920010010200200161000410000100
10204100942010510105100001010610003633671709082010920010010200200161000410000100
10204100922010410104100001010410002436251719972010620010008200200161000410000100
10204101112010410104100001010410002436461715112010620010008200200161000410000100
10204101192010410104100001010410002436311709532010620010008200200161000410000100
10204100962010410104100001010410002436311709532010620010008200200161000410000100
10204101182010410104100001010410004435371724382011120010011200200201000710000100
10205101992015210135100171014010000436221709832010420010008200200161000410000100
10204101092010410104100001010410002436331711512010620010008200200161000410000100
10204100912010410104100001010410002435831713892010620010008200200161000410000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0144

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10029112162030310213100901021310002438181719662001620100082020020100051000010
10024101502001410014100001001410000430751717032001020100002020000100011000010
10024101422001110011100001001010000430751717572001020100002020000100011000010
10024101472001110011100001001010000430711718292001020100002020000100011000010
10024101472001110011100001001010000430711718832001020100002020000100011000010
10024101862001110011100001001010000430711718292001020100002020000100011000010
10024101562001110011100001001010000430711719192001020100002020000100011000010
10024101552001110011100001001010000430751721352001020100002020000100011000010
10024101562001110011100001001010000430731718832001020100002020000100011000010
10024101522001110011100001001010000430711717932001020100002020000100011000010

Test 3: throughput

Count: 8

Code:

  strh w0, [x6], #8
  strh w0, [x7], #8
  strh w0, [x8], #8
  strh w0, [x9], #8
  strh w0, [x10], #8
  strh w0, [x11], #8
  strh w0, [x12], #8
  strh w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8020981214160403803138009080313800022403121360113160106200800082001600168000580000100
8020480056160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800352404191360971160175200800482001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100
8020480048160105801058000080104800022403121360051160106200800082001600168000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8002980941160305802158009080214800022400421360265160016208000820160000800018000010
8002480433160243801718007280170800002400301360099160010208000020160000800018000010
8002480045160011800118000080010800002400301360099160010208000020160096800378000010
8002480045160011800118000080010800352401491360701160085208004820160416801978000010
8002480051160011800118000080010800002400301360099160010208000020160000800018000010
8002480051160011800118000080010800002400301359991160010208000020160000800018000010
8002480051160011800118000080010801442405101364617160314208016020160000800018000010
8002480045160011800118000080010800002400301360099160010208000020160000800018000010
8002480437160243801718007280170800002400301360117160010208000020160000800018000010
8002480051160011800118000080010800352401491360665160085208004820160000800018000010