Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, S)

Test 1: uops

Code:

  ldr s0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005124620761046103010521000180531690420001000100010011000
1004104520011001100010001000183171703220001000100010011000
1004108120011001100010001000181131751920001000100010011000
1004105120011001100010001000184931699720001000100010011000
1004104620011001100010001000184291697420001000100010011000
1004104120011001100010001000177331764520001000100010011000
1004106220011001100010001000173211708720001000100010011000
1004106620011001100010001000178691762220001000100010011000
1004104320011001100010001000181531703020001000100010011000
1004108520011001100010001000187651710620001000100010011000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr s0, [x6, #8]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0106

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020910111770161501071004910005402451004510011266193076746679500660158302381001310013602181000410003500011000040100
5020410006670104501011000310000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020510012870118501101000710001401371001310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100
5020410005970103501011000210000401041000310003265982676687879441060110302091000410003602181000410003500011000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0086

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50029101415700685001610047100054015410043100032659943767382794914600203002910004100036002010000100005000110000040010
50024100075700135001110002100004001010000100382666419769331796898602283013910038100406008010010100115000810000040010
50024100060700135001110002100004001010000100002659890767343794869600103002010000100006002010000100005000110000040010
50024100084700135001110002100004001010000100002660268767463794989600103002010000100006002010000100005000110000040010
50024100059700135001110002100004001010000100002659890767343794869600103002010000100006002010000100005000110000040010
50024100425700735005110014100084015010041100002660025767386794912600103002010000100006002010000100005000110000040010
50024100059700135001110002100004001010000100002659890767343794869600103002010000100006002010000100005000110000040010
50024100471700745005210013100094014710040100002659917767352794878600103002010000100006009810014100145000810000040010
50024100060700135001110002100004001010000100002659890767343794869600103002010000100006002010000100005000110000040010
50024100060700135001110002100004001010000100002660457767522795048600103002010000100006009810012100135000810000040010

Test 3: throughput

Count: 8

Code:

  ldr s0, [x6, #8]!
  ldr s0, [x7, #8]!
  ldr s0, [x8, #8]!
  ldr s0, [x9, #8]!
  ldr s0, [x10, #8]!
  ldr s0, [x11, #8]!
  ldr s0, [x12, #8]!
  ldr s0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5404

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209444851605228037280150803748000724032469681816011520080012200800128000980000100
80204435991601068010680000801098000724032469651616011520080012200800128000580000100
80204432291601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100
80204432241601058010580000801088000624032469728016011420080012200800128000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029446861604298027980150802788000724005469747416002520800122080000800018000010
80024432221600118001180000800108000024003069712616001020800002080000800018000010
80024432301600118001180000800108000024003069743216001020800002080000800018000010
80024432191600118001180000800108000024003069712616001020800002080000800018000010
80024432191600118001180000800108000024003069727716001020800002080000800018000010
80024432181600118001180000800108000024003069712616001020800002080000800018000010
80024432191600118001180000800108000024003068254716001020800002080000800018000010
80024432191600118001180000800108000024003069712616001020800002080000800018000010
80024432201600118001180000800108000024003069746816001020800002080000800018000010
80024432281600118001180000800108000024003069711516001020800002080000800018000010