Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (pre-index, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005139720731043103010421000208211754720001000100010001000100110001000
2004106420011001100010001000212081754020001000100010001000100110001000
2004109320011001100010001000210671757320001000100010001000100110001000
2004106320011001100010001000210731764120001000100010001000100110001000
2004107320011001100010001000211821752920001000100010001000100110001000
2004106420011001100010001000211661734220001000100010001000100110001000
2004106120011001100010001000208081757420001000100010001000100110001000
2004108020011001100010001000212901747220001000100010001000100110001000
2004108920011001100010001000210791743420001000100010001000100110001000
2004107920011001100010001000215551790920001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0285

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971290501624015710005402471000218520455352495010840211100037022110004400071000040100
5020470166501074010710000401061000318519275352765010940212100047022110004400071000040100
5020470205501074010710000401061000318530615356525010940212100047022110004400071000040100
5020470226501074010710000401061000318522785353955010940212100047022110004400071000040100
5020470164501074010710000401061000318519275352765010940212100047022110004400071000040100
5020470164501074010710000401061000318519275352765010940212100047022110004400071000040100
5020470164501074010710000401061000318519275352765010940212100047022110004400071000040100
5020470164501074010710000401061000318519275352765010940212100047022110004400071000040100
5020470164501074010710000401061000318534125357695010940212100047022110004400071000040100
5020470164501074010710000401061000318519275352765010940212100047022110004400071000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0123

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971248500694006410005401561000318519955355805001940032100047002010000400041000040010
5002470127500144001410000400101000318515315354515001940032100047002010000400041000040010
5002470123500144001410000400101000018508765352675001040020100007002010000400041000040010
5002470121500144001410000400101000018508225352495001040020100007002010000400041000040010
5002470121500144001410000400101000018508765352675001040020100007002010000400051000040010
5002470123500144001410000400101000018508225352495001040020100007002010000400041000040010
5002470123500144001410000400101000018508765352675001040020100007002010000400041000040010
5002470123500144001410000400101000018508225352495001040020100007002010000400041000040010
5002570222500284002610002400501000018510925353415001040020100007010910013400151000040010
5002470138500144001410000400101000018510655353185001040020100007002010000400041000040010

Test 3: throughput

Count: 8

Code:

  ldrsh w0, [x6, #8]!
  ldrsh w0, [x7, #8]!
  ldrsh w0, [x8, #8]!
  ldrsh w0, [x9, #8]!
  ldrsh w0, [x10, #8]!
  ldrsh w0, [x11, #8]!
  ldrsh w0, [x12, #8]!
  ldrsh w0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020944204160411803138009880316800122405816412571601258021380013802128001208000980000080100
16020443228160109801098000080112800092404856342991601218021280012802128001208000980000080100
16020443216160109801098000080112800112404856460111601238021280012802128001208000980000080100
16020443217160109801098000080112800092404856428901601218021280012802128001208000980000080100
16020443217160110801098000180112800092404796439011601198021080010802128001208000980000080100
16020443217160110801098000180112800122404856392881601248021280012802128001208000980000080100
16020443217160109801098000080112800122404856442291601248021280012802128001208000980000080100
1602044323416011380109800048011280008240485644676160120802128001283095826506681544813513882330
16020443237160109801098000080112800112404856360911601238021280012802128001208000980000080100
16020443217160109801098000080112800112404856465681601238021280012802128001208000980000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029444421603418022280119802258001124034064526516003380032800128002080000800018000080010
160024432301600118001180000800108000024022364395916001080020800008002080000800018000080010
160024432231600118001180000800108000024022364124416001080020800008002080000800018000080010
160024432231600118001180000800108005424062762118916011880074800548002080000800018000080010
160024432231600118001180000800108000024022364667316001080020800008002080000800018000080010
160024432231600118001180000800108000024022364540816001080020800008002080000800018000080010
160024432231600118001180000800108000024022364127616001080020800008002080000800018000080010
160024432271600118001180000800108000024022363905616001080020800008002080000800018000080010
160024432231600118001180000800108000024022364112316001080020800008002080000800018000080010
160024432241600118001180000800108000024022364579716001080020800008002080000800018000080010