Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (pre-index, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005122120351020101510381000211621734220001000100010001000100110001000
2004106020011001100010001000213061754320001000100010001000100110001000
2004107420011001100010001000214811753920001000100010001000100110001000
2004107420011001100010001000211031753820001000100010001000100110001000
2004109820011001100010001000214011762620001000100010001000100110001000
2004107320011001100010001000212871757920001000100010001000100110001000
2004107320011001100010001000213811753820001000100010001000100110001000
2004107320011001100010001000216011755620001000100010001000100110001000
2004107320011001100010001000211411808020001000100010001000100110001000
2004107320011001100010001000215281756020001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0110

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
502097123350161401561000540247100021850258534710501084021110003702211000404000410000040100
502047009250103401031000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100131852289535116501534025310015702211000404000310000040100
502047010850104401041000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100031850172534716501094021210004702211000404000410000040100
502047009950104401041000040106100031850172534716501094021210004702211000404000410000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0138

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5003071440500794007210007401901000318508835352825001940032100047002010000400041000040010
5002470111500144001410000400101000018504445351335001040020100007002010000400031000040010
5002470126500134001310000400101000018509845353095001040020100007002010000400031000040010
5002470126500134001310000400101000018504175351245001040020100007002010000400031000040010
5002470118500134001310000400101000018508495352685001040020100007002010000400031000040010
5002470121500134001310000400101001218551645365175006240071100147002010000400031000040010
5002470154500134001310000400101000018509035352865001040020100007002010000400031000040010
5002470092500134001310000400101001218521115356585006240071100137002010000400041000040010
5002470135500134001310000400101000018506875352145001040020100007002010000400031000040010
5002470107500134001310000400101000018509035352865001040020100007002010000400031000040010

Test 3: throughput

Count: 8

Code:

  ldrsh x0, [x6, #8]!
  ldrsh x0, [x7, #8]!
  ldrsh x0, [x8, #8]!
  ldrsh x0, [x9, #8]!
  ldrsh x0, [x10, #8]!
  ldrsh x0, [x11, #8]!
  ldrsh x0, [x12, #8]!
  ldrsh x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209442101604188031380105803168000924061364381216012280213800138021280012800098000080100
160204432361601098010980000801128001024061064399316012280212800128021280012800098000080100
160204432311601108010980001801128001124062363947916012380212800128021080010800078000080100
160204432301601098010980000801128001024061063725416012280212800128021280012800098000080100
160204432311601098010980000801128001124061064378116012380212800128021280012800098000080100
160204432291601058010580000801088001124061064540316012380212800128021280012800098000080100
160204432301601098010980000801128001124061064308416012380212800128021280012800098000080100
160204432301601098010980000801128000924061065029616012180212800128021280012800098000080100
160204432381601138010980004801128005424102762489916020880254800548021280012800098000080100
160204432301601098010980000801128001124061063508616012380212800128021080010800078000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029443931603378022380114802268000024030464470216001080020800008002080000800018000080010
160024432301600118001180000800108000024022364458616001080020800008002080000800018000080010
160024432231600118001180000800108000024022363939816001080020800008002080000800018000080010
160024432231600118001180000800108000024022364589716001080020800008002080000800018000080010
160024432221600118001180000800108000024022364468016001080020800008002080000800018000080010
160024432211600118001180000800108000024022364518016001080020800008002080000800018000080010
160024432231600118001180000800108000024022364503816001080020800008002080000800018000080010
160024432231600118001180000800108000024022364375516001080020800008002080000800018000080010
160024432231600118001180000800108000024022362536316001080020800008002080000800018000080010
160024432231600118001180000800108000024022364604816001080020800008002080000800018000080010