Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single, S)

Test 1: uops

Code:

  ld1 { v0.s }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.001

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
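62005 | 29715 | 2011 | 1 | 1008 | 1002 | 1002 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29359 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29355 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29376 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29376 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29376 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29542 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29570 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29590 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7768 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000
62004 | 29744 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 2000 | 1 | 1000 | 1000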

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.s }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0048

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
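60205 | 120154 | 70114 | 40101 | 20012 | 10001 | 30130 | 20031 | 10004 | 3208883 | 1257514 | 2575551 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3209098 | 1257740 | 2575963 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 30012 | 40001 | 10000 | 10000 | 40100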

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0042

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
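60025 | 120385 | 70024 | 40011 | 20012 | 10001 | 30040 | 20031 | 10000 | 3209087 | 1259264 | 2578656 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120049 | 70015 | 40011 | 20004 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60118 | 10017 | 30049 | 40006 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60118 | 10016 | 30049 | 40006 | 10000 | 10000 | 40010
60024 | 120050 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60114 | 10016 | 30049 | 40006 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60120 | 10016 | 30049 | 40006 | 10000 | 10000 | 40010
60024 | 120044 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208952 | 1259209 | 2578539 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 30000 | 40001 | 10000 | 10000 | 40010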

Test 3: throughput

Count: 8

Code:

  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
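160205 | 160158 | 160139 | 101 | 80006 | 80032 | 100 | 80026 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240198 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160205 | 160090 | 160143 | 101 | 80010 | 80032 | 100 | 80034 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240176 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240188 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80040 | 300 | 240662 | 4080242 | 160174 | 200 | 80044 | 80044 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100
160204 | 160055 | 160105 | 101 | 80002 | 80002 | 100 | 80000 | 80002 | 300 | 240170 | 4079844 | 160102 | 200 | 80004 | 80004 | 200 | 80004 | 160008 | 1 | 80000 | 80000 | 100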

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
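160025 | 160159 | 160049 | 11 | 80006 | 80032 | 10 | 80026 | 80002 | 30 | 240200 | 4079792 | 160012 | 20 | 80004 | 80004 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160045 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80000 | 30 | 240110 | 4079610 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160045 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80000 | 30 | 240052 | 4079662 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160045 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80002 | 30 | 240170 | 4079844 | 160012 | 20 | 80004 | 80004 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160047 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80000 | 30 | 240074 | 4079636 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160047 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80000 | 30 | 240074 | 4079636 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160047 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80040 | 30 | 240304 | 4080226 | 160084 | 20 | 80044 | 80044 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160047 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80000 | 30 | 240074 | 4079636 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160047 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80040 | 30 | 240304 | 4080226 | 160084 | 20 | 80044 | 80044 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10
160024 | 160047 | 160012 | 11 | 80001 | 80000 | 10 | 80000 | 80000 | 30 | 240104 | 4079636 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 160000 | 1 | 80000 | 80000 | 10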