Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (post-index, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
2005122720401021101910421000209541764520001000100010001000100110001000
2004107920011001100010001000208451803420001000100010001000100110001000
2004108320011001100010001000209451762720001000100010001000100110001000
2004108420011001100010001000211331770020001000100010001000100110001000
2004110820011001100010001000209271788020001000100010001000100110001000
2004112220011001100010001000212981789020001000100010001000100110001000
2004108420011001100010001000202761857220001000100010001000100110001000
2004106820011001100010001000212221782820001000100010001000100110001000
2004106720011001100010001000210531760920001000100010001000100110001000
2004106720011001100010001000207791801520001000100010001000100110001000

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0119

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020971246501594015410005402471000218505065347225010840211100037022110004400041000040100
5020470117501044010410000401061000318506585348405010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348405010940212100047028910013400151000040100
5020470117501044010410000401061000318506585348405010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348405010940212100047022110004400041000040100
5020470117501044010410000401061000318506045348245010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348405010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348405010940212100047022110004400041000040100
5020470117501044010410000401061000318506585348405010940212100047022110004400041000040100
5020470117501044010410000401061000318506045348245010940212100047022110004400041000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0147

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002971218500704006510005401561000318507485352025001940032100047002010000400041000040010
5002470119500144001410000400101000018506065351375001040020100007002010000400041000040010
5002470113500144001410000400101000018505525351215001040020100007011210014400151000040010
5002470115500144001410000400101000018506065351375001040020100007002010000400041000040010
5002470113500144001410000400101000018506065351375001040020100007002010000400041000040010
5002470113500144001410000400101000018506065351375001040020100007002010000400041000040010
5002470113500144001410000400101000018506065351375001040020100007002010000400041000040010
5002470113500144001410000400101000018506065351375001040020100007002010000400041000040010
5002570268500274002510002400491000018510385352745001040020100007010910013400161000040010
5002470137500144001410000400101000018514975354165001040020100007002010000400041000040010

Test 3: throughput

Count: 8

Code:

  ldrsb x0, [x6], #8
  ldrsb x0, [x7], #8
  ldrsb x0, [x8], #8
  ldrsb x0, [x9], #8
  ldrsb x0, [x10], #8
  ldrsb x0, [x11], #8
  ldrsb x0, [x12], #8
  ldrsb x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160209441971604168031380103803168001124061064190316012380212800128021280012800098000080100
160204432341601098010980000801128001024052364522816012080210800108021280012800098000080100
160204432241601098010980000801128001224052964072216012480212800128021080010800078000080100
160204432271601098010980000801128001124052964312216012380212800128021280012800098000080100
160204432231601098010980000801128000924052964449516012180212800128025480054800518000080100
160204432311601098010980000801128001124052963897016012380212800128025480054800518000080100
160204432321601098010980000801128001124052964533516012380212800128021280012800098000080100
160204432221601078010780000801088001024052964154216012280212800128021280012800098000080100
160204432231601098010980000801128001224115963244116012480212800128021280012800098000080100
160204432651601098010980000801128000824092064188816012080212800128021280012800098000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
160029442971603208022080100802218001124030864086416003380032800128002080000800018000080010
160024432221600118001180000800108000024017964600816001080020800008002080000800018000080010
160024432171600118001180000800108000024017964264316001080020800008002080000800018000080010
160024432171600118001180000800108000024017963923516001080020800008002080000800018000080010
160024432191600118001180000800108000024017964279316001080020800008002080000800018000080010
160024432151600118001180000800108000024017964474516001080020800008002080000800018000080010
160024432171600118001180000800108000024017964524816001080020800008002080000800018000080010
160024432171600118001180000800108000024017963923516001080020800008002080000800018000080010
160024432171600118001180000800108000024017964404116001080020800008002080000800018000080010
160024432171600118001180000800108000024017963648616001080020800008002080000800018000080010