Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (post-index, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005123520331020101310361000208231739720001000100010001000100110001000
2004106320011001100010001000213041766120001000100010001000100110001000
2004106420011001100010001000214421766620001000100010001000100110001000
2004106720011001100010001000214471759320001000100010001000100110001000
2004107720011001100010001000213041766520001000100010001000100110001000
2004107420011001100010001000213401766720001000100010001000100110001000
2004107720011001100010001000211651810120001000100010001000100110001000
2004106420011001100010001000217571788220001000100010001000100110001000
2004109020011001100010001000213881766120001000100010001000100110001000
2004109120011001100010001000211631789020001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0105

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971301501614015610005402471000218502045346485010840211100037022110004400041000040100
5020470142501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470105501044010410000401061000318502805347125010940212100047028910013400151000040100
5020470107501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470105501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470105501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470105501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470105501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470103501044010410000401061000318503345347305010940212100047022110004400041000040100
5020470105501044010410000401061000318502805347125010940212100047022110004400041000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0103

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971282500704006510005401561000318504245350925001940032100047002010000400041000040010
5002470103500144001410000400101000018502825350275001040020100007002010000400041000040010
5002470101500144001410000400101000018502825350275001040020100007002010000400041000040010
5002470101500144001410000400101001318522735356255006340073100157002010000400041000040010
5002470101500144001410000400101000018502825350275001040020100007002010000400041000040010
5002470101500144001410000400101000018502825350275001040020100007002010000400041000040010
5002470101500144001410000400101000018503635350545001040020100007002010000400041000040010
5002470101500144001410000400101000018502825350275001040020100007002010000400041000040010
5002470101500144001410000400101000018502825350275001040020100007002010000400041000040010
5002470101500144001410000400101000018502825350275001040020100007002010000400041000040010

Test 3: throughput

Count: 8

Code:

  ldrsh x0, [x6], #8
  ldrsh x0, [x7], #8
  ldrsh x0, [x8], #8
  ldrsh x0, [x9], #8
  ldrsh x0, [x10], #8
  ldrsh x0, [x11], #8
  ldrsh x0, [x12], #8
  ldrsh x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209442611604288031380115803168001124061364224316012480213800138021280012800098000080100
160204432291601098010980000801128001224052964412116012480212800128021280012800098000080100
160204432271601098010980000801128001124052964068816012380212800128021280012800098000080100
160204432231601098010980000801128001224052964257216012480212800128021280012800098000080100
160204432251601098010980000801128001024071463156116012280212800128025280052800498000080100
160204432441601098010980000801128001124065364121016012380212800128021280012800098000080100
160204432241601108010980001801128001124052964409916012380212800128021280012800098000080100
160204432241601108010980001801128001124052963925816012380212800128021080010800078000080100
160205434481601828015180031801548001224064164106816012480212800128021280012800098000080100
160204432231601098010980000801128001124052964447116012380212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029443641603268022380103802268001024027264183316003280032800128002080000800018000080010
160024432231600118001180000800108000024022364210216001080020800008002080000800018000080010
160024432231600118001180000800108000024022364212316001080020800008002080000800018000080010
160024432231600118001180000800108000024022363770116001080020800008002080000800018000080010
160024432231600118001180000800108000024022363885516001080020800008002080000800018000080010
160024432291600118001180000800108000024022364352716001080020800008002080000800018000080010
160024432271600118001180000800108000024033864357716001080020800008002080000800018000080010
160025432971600948006180033800648000024022364316616001080020800008002080000800018000080010
160024432231600118001180000800108000024022364605816001080020800008002080000800018000080010
160024432231600118001180000800108000024022364496016001080020800008002080000800018000080010