Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (pre-index, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

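retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005 | 1228 | 2033 | 1020 | 1013 | 1036 | 1000 | 20606 | 17363 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1075 | 2001 | 1001 | 1000 | 1000 | 1000 | 21278 | 17395 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1066 | 2001 | 1001 | 1000 | 1000 | 1000 | 21230 | 17413 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1122 | 2001 | 1001 | 1000 | 1000 | 1000 | 20665 | 18216 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1079 | 2001 | 1001 | 1000 | 1000 | 1000 | 21526 | 17647 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1078 | 2001 | 1001 | 1000 | 1000 | 1000 | 21224 | 17625 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1119 | 2001 | 1001 | 1000 | 1000 | 1000 | 21376 | 17631 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1118 | 2001 | 1001 | 1000 | 1000 | 1000 | 21300 | 17986 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1098 | 2001 | 1001 | 1000 | 1000 | 1000 | 21404 | 17648 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000
2004 | 1101 | 2001 | 1001 | 1000 | 1000 | 1000 | 20856 | 17710 | 2000 | 1000 | 1000 | 1000 | 1000 | 1001 | 1000 | 1000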

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0101

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50209 | 71253 | 50161 | 40156 | 10005 | 40247 | 10002 | 1850911 | 534900 | 50108 | 40211 | 10003 | 70292 | 10015 | 40015 | 10000 | 40100
50204 | 70141 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850901 | 534963 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70164 | 50104 | 40104 | 10000 | 40106 | 10003 | 1851630 | 535206 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70146 | 50104 | 40104 | 10000 | 40106 | 10003 | 1851279 | 535088 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70139 | 50104 | 40104 | 10000 | 40106 | 10003 | 1851846 | 535272 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70207 | 50105 | 40105 | 10000 | 40106 | 10003 | 1852720 | 535501 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70101 | 50104 | 40104 | 10000 | 40106 | 10012 | 1853530 | 535703 | 50152 | 40251 | 10013 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70129 | 50104 | 40104 | 10000 | 40106 | 10003 | 1852116 | 535361 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70121 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850982 | 534986 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100
50204 | 70101 | 50104 | 40104 | 10000 | 40106 | 10003 | 1850469 | 534815 | 50109 | 40212 | 10004 | 70221 | 10004 | 40004 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0248

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50029 | 71433 | 50076 | 40071 | 10005 | 40156 | 10003 | 1852260 | 535713 | 50019 | 40032 | 10004 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70186 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852037 | 535621 | 50010 | 40020 | 10000 | 70112 | 10015 | 40019 | 10000 | 40010
50024 | 70184 | 50018 | 40018 | 10000 | 40010 | 10000 | 1851875 | 535567 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70160 | 50017 | 40017 | 10000 | 40010 | 10000 | 1851875 | 535567 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70160 | 50017 | 40017 | 10000 | 40010 | 10009 | 1853697 | 535813 | 50053 | 40059 | 10010 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70202 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852253 | 535693 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70160 | 50017 | 40017 | 10000 | 40010 | 10000 | 1851875 | 535567 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70202 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852496 | 535774 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70185 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852469 | 535763 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010
50024 | 70177 | 50017 | 40017 | 10000 | 40010 | 10000 | 1852199 | 535675 | 50010 | 40020 | 10000 | 70020 | 10000 | 40007 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ldrsb x0, [x6, #8]!
  ldrsb x0, [x7, #8]!
  ldrsb x0, [x8, #8]!
  ldrsb x0, [x9, #8]!
  ldrsb x0, [x10, #8]!
  ldrsb x0, [x11, #8]!
  ldrsb x0, [x12, #8]!
  ldrsb x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209 | 44238 | 160423 | 80312 | 80111 | 80315 | 80011 | 240578 | 638086 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43228 | 160111 | 80109 | 80002 | 80112 | 80011 | 240485 | 644989 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 646411 | 160123 | 80212 | 80012 | 80254 | 80054 | 80051 | 80000 | 80100
160204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80009 | 240485 | 644459 | 160121 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43218 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 642143 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43216 | 160109 | 80109 | 80000 | 80112 | 80010 | 240485 | 640554 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43222 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 642721 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80011 | 240485 | 647931 | 160123 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43219 | 160110 | 80109 | 80001 | 80112 | 80010 | 240485 | 635198 | 160122 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100
160204 | 43219 | 160110 | 80109 | 80001 | 80112 | 80012 | 240626 | 640324 | 160124 | 80212 | 80012 | 80212 | 80012 | 80009 | 80000 | 80100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029 | 44329 | 160335 | 80222 | 80113 | 80225 | 80053 | 240689 | 641231 | 160117 | 80074 | 80054 | 80032 | 80012 | 80009 | 80000 | 80010
160024 | 43231 | 160011 | 80011 | 80000 | 80010 | 80000 | 240272 | 641282 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43222 | 160011 | 80011 | 80000 | 80010 | 80000 | 240272 | 634707 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43223 | 160011 | 80011 | 80000 | 80010 | 80000 | 240272 | 637580 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43224 | 160011 | 80011 | 80000 | 80010 | 80000 | 240272 | 639337 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43223 | 160011 | 80011 | 80000 | 80010 | 80000 | 240272 | 640266 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160025 | 43621 | 160091 | 80061 | 80030 | 80064 | 80054 | 240840 | 584379 | 160118 | 80074 | 80054 | 80030 | 80010 | 80007 | 80000 | 80010
160024 | 43229 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 646451 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43230 | 160011 | 80011 | 80000 | 80010 | 80000 | 240331 | 644028 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010
160024 | 43231 | 160011 | 80011 | 80000 | 80010 | 80000 | 240304 | 643644 | 160010 | 80020 | 80000 | 80020 | 80000 | 80001 | 80000 | 80010