Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, D)

Test 1: uops

Code:

  ldr d0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005122920761046103010521000179771696320001000100010011000
1004103920011001100010001000180651681520001000100010011000
1004103320011001100010001000181891714120001000100010011000
1004103720011001100010001000183051681520001000100010011000
1004103820011001100010001000184131681920001000100010011000
1004105020011001100010001000179491690320001000100010011000
1004103720011001100010001000181211716620001000100010011000
1004103320011001100010001000181691681420001000100010011000
1004103320011001100010001000184731681320001000100010011000
1004106520011001100010001000179971681420001000100010011000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr d0, [x6], #8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0061

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020910133270163501071005110005402451004510003266036176699379453460110302091000410003602181000410003500011000040100
5020410007670104501011000310000401041000310003265998776688479442260110302091000410003602181000410003500011000040100
5020410006770103501011000210000401041000310003266004276693279446760110302091000410003602181000410003500011000040100
5020410006770103501011000210000401041000310003266004276693279446760110302091000410003602181000410003500011000040100
5020410006770103501011000210000401041000310003266004276693279446760110302091000410003602181000410003500011000040100
5020410006770103501011000210000401041000310003266004276693279446760110302091000410003602181000410003500011000040100
5020410006770103501011000210000401041000310012266160376733379492360158302391001210013602181000410003500011000040100
5020410006770103501011000210000401041000310003266004276693279446760110302091000410003602761001310013500081000040100
5020410007670104501011000310000401041000310003266009676695079448560110302091000410003602181000410003500011000040100
5020410006770103501011000210000401041000310003266004276693279446760110302091000410003602181000410003500011000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0074

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50030101216700825002310053100064018510052100032660105767412794952600203002910004100036003810004100035000110000040010
50024100066700145001110003100004001010000100082661316767721795274600583004910009100106002010000100005000110000040010
50024100073700135001110002100004001010000100002660376767495795012600103002010000100004957315305654037566103063932005
50024100145700155001110004100004001010000100002659917767342794869600103002010000100006009810014100145000810000040010
50024100072700145001110003100004001010000100002659890767333794860600103002010000100006002010000100005000110000040010
50024100059700135001110002100004001010000100002659890767333794860600103002010000100006002010000100005000110000040010
50025100180700265001810007100014004510011100112662336768025795586600683005810013100136002010000100005000110000040010
50024100059700135001110002100004001010000100002659890767333794860600103002010000100006002010000100005000110000040010
50024100065700135001110002100004001010000100002660106767405794931600103002010000100006002010000100005000110000040010
50025100134700265001810007100014004510011100002659971767360794887600103002010000100006002010000100005000110000040010

Test 3: throughput

Count: 8

Code:

  ldr d0, [x6], #8
  ldr d0, [x7], #8
  ldr d0, [x8], #8
  ldr d0, [x9], #8
  ldr d0, [x10], #8
  ldr d0, [x11], #8
  ldr d0, [x12], #8
  ldr d0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209449791605258037580150803758000724033069657716011720080014200800168000980000100
80204432161601098010980000801128000724033669641416011920080016200800168000980000100
80204432171601098010980000801128000624033669709616011820080016200800168000980000100
80204432191601098010980000801128005524049269290816021920080072200800168000980000100
80204432171601098010980000801128000624033669706016011820080016200800168000980000100
80204432171601098010980000801128000624033669706016011820080016200800168000980000100
80204432151601098010980000801128000624033669706016011820080016200800168000980000100
80204432181601098010980000801128000624033669713216011820080016200800168000980000100
80204432221601098010980000801128000624033669715016011820080016200800168000980000100
80204432171601098010980000801128000624033669709616011820080016200800168000980000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029441881604298027980150802828001124006669732016003320800162080000800018000010
80024432291600118001180000800108000024003069711016001020800002080000800018000010
80024432191600118001180000800108000024003069711016001020800002080000800018000010
80024432191600118001180000800108000024003069711016001020800002080000800018000010
80024432191600118001180000800108000024003069711016001020800002080000800018000010
80024432281600118001180000800108000024003069711016001020800002080000800018000010
80024432191600118001180000800108005824022269267716013220800722080000800018000010
80024432211600118001180000800108000024003069711016001020800002080000800018000010
80024432191600118001180000800108000024003069712716001020800002080000800018000010
80024432201600118001180000800108000024003069711016001020800002080000800018000010