Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single, D)

Test 1: uops

Code:

  ld1 { v0.d }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

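retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
62005 | 30560 | 2010 | 1 | 1007 | 1002 | 1002 | 1000 | 0 | 3002 | 7775 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29985 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29681 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3003 | 7773 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29761 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7772 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29432 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29644 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29843 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29288 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29299 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29298 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 0 | 3000 | 7770 | 2000 | 0 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000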

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.d }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0042

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60205 | 120525 | 70114 | 40101 | 20012 | 10001 | 30130 | 20031 | 10004 | 3208694 | 1257437 | 2575390 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120040 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120040 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60205 | 120073 | 70112 | 40106 | 20005 | 10001 | 30133 | 20031 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60294 | 10016 | 30049 | 40006 | 10000 | 10000 | 40100
60204 | 120045 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120040 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120040 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120040 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60300 | 10016 | 30049 | 40006 | 10000 | 10000 | 40100
60204 | 120055 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208828 | 1257628 | 2575734 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120040 | 70103 | 40101 | 20002 | 10000 | 30103 | 20007 | 10004 | 3208774 | 1257606 | 2575690 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0040

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60025 | 123428 | 70024 | 40011 | 20012 | 10001 | 30040 | 20031 | 10004 | 3209053 | 1259303 | 2578730 | 60024 | 30032 | 10004 | 20008 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120041 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3209060 | 1259259 | 2578631 | 60010 | 30020 | 10000 | 20000 | 60318 | 10049 | 30147 | 40030 | 10000 | 10000 | 40010
60024 | 120040 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208844 | 1259165 | 2578449 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120040 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208844 | 1259165 | 2578449 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120040 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208844 | 1259165 | 2578449 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120040 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208844 | 1259165 | 2578449 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120231 | 70041 | 40027 | 20010 | 10004 | 30076 | 20048 | 10000 | 3209033 | 1259243 | 2578604 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120040 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208844 | 1259165 | 2578449 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120040 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208844 | 1259165 | 2578449 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120426 | 70069 | 40043 | 20018 | 10008 | 30142 | 20098 | 10000 | 3208952 | 1259210 | 2578538 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160205 | 160158 | 160139 | 101 | 80006 | 80032 | 100 | 80026 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80044 | 160088 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80040 | 300 | 240340 | 4080242 | 160174 | 200 | 80044 | 80044 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80040 | 300 | 240395 | 4080242 | 160174 | 200 | 80044 | 80044 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160205 | 160090 | 160143 | 101 | 80010 | 80032 | 100 | 80034 | 80002 | 300 | 240834 | 4080416 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160025 | 160162 | 160049 | 11 | 80006 | 80032 | 10 | 80026 | 80002 | 30 | 240188 | 4079792 | 160012 | 20 | 80004 | 80004 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160025 | 160088 | 160053 | 11 | 80010 | 80032 | 10 | 80034 | 80000 | 30 | 240186 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160053 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80040 | 30 | 240408 | 4080190 | 160084 | 20 | 80044 | 80044 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160057 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80000 | 30 | 240188 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160053 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80000 | 30 | 240182 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160053 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80000 | 30 | 240182 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160025 | 160088 | 160053 | 11 | 80010 | 80032 | 10 | 80034 | 80000 | 30 | 240182 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160053 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80000 | 30 | 240182 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80044 | 160088 | 1 | 80000 | 80000 | 10
160024 | 160053 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80000 | 30 | 240182 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160053 | 160013 | 11 | 80002 | 80000 | 10 | 80000 | 80000 | 30 | 240182 | 4079792 | 160010 | 20 | 80000 | 80000 | 20 | 80044 | 160088 | 1 | 80000 | 80000 | 10