Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single, H)

Test 1: uops

Code:

  ld1 { v0.h }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.001

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
62005298722011110081002100210003000777020001000100010002000110001000
62004294532002110011000100010003000776720001000100010002000110001000
62005295902002110011000100010003006776820001000100010002000110001000
62004295862002110011000100010003000776220001000100010002000110001000
62004296432002110011000100010003000776220001000100010002000110001000
62004295242002110011000100010003001781020001000100010002000110001000
62004298912002110011000100010003000776020001000100010002000110001000
62005294882002110011000100010003000776020001000100010002000110001000
62004297112002110011000100010003000776020001000100010002000110001000
62004295062002110011000100010003000776720001000100010002000110001000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.h }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60205120225701144010120012100013013020031100043209095125760325757276011430212100042000860224100043001240001100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860294100163004940006100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100
60204120050701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860302100173004940006100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100
60204120047701054010120004100003010320007100043208963125768325758516011430212100042000860224100043001240001100001000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0042

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60025120156700244001120012100013004020031100043209019125922125785866002430032100042000860020100003000040001100001000040010
60024120052700134001120002100003001020000100003208844125916525784496001030020100002000060020100003000040001100001000040010
60024120045700134001120002100003001020000100483215466126277525848166028730175100482010060020100003000040001100001000040010
60024120040700134001120002100003001020000100283209585125947325790716015630110100282005860120100163004940007100001000040010
60024120047700134001120002100003001020000100003208844125916525784496001030020100002000060020100003000040001100001000040010
60024120040700134001120002100003001020000100003208844125916525784496001030020100002000060020100003000040001100001000040010
60024120040700134001120002100003001020000100153209208125933925788016008830069100162003360020100003000040001100001000040010
60024120040700134001120002100003001020000100003209141125929425786996001030020100002000060250100363011140025100001000040010
60025120073700224001620005100013004320031100033208934125906225782876002230030100042000812019982337126585211841331425359623
60024120042700134001120002100003001320007100043208884125916625784716002430032100042000860020100003000040001100001000040010

Test 3: throughput

Count: 8

Code:

  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020516015916013910180006800321008002680041300240419408019016017520080045800452008000416000818000080000100
16020416005316010510180002800021008000080002300240188407979216010220080004800042008000416000818000080000100
16020416005316010510180002800021008000080041300240361408017516017520080045800452008000416000818000080000100
16020416005316010510180002800021008000080002300240188407979216010220080004800042008000416000818000080000100
16020516008816014310180010800321008003480002300240188407979216010220080004800042008000416000818000080000100
16020416005316010510180002800021008000080002300240188407979216010220080004800042008000416000818000080000100
16020416005316010510180002800021008000080002300240188407979216010220080004800042008000416000818000080000100
16020416005316010510180002800021008000080002300240170407984416010220080004800042008000416000818000080000100
1602041600471601041018000180002100800008004030024040640802421601742008004480044470874869142526204272596712202491
16020516009016014310180010800321008003480002300240102407963616010220080004800042008000416000818000080000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
162103176955161865110380016807461066800608004030240408408019016008420800448004420800441600881800008000010
16002416005316001611800028000310800008000030240188407979216001020800008000020800001600001800008000010
16002416005316001311800028000010800008000030240182407979216001020800008000020800001600001800008000010
16002416005316001311800028000010800008000030240182407979216001020800008000020800451600901800008000010
16002416005316001311800028000010800008000030240182407979216001020800008000020800001600001800008000010
16002416005316001311800028000010800008000030240182407979216001020800008000020800001600001800008000010
16002416005316001311800028000010800008000030240182407979216001020800008000020800001600001800008000010
16002416005316001311800028000010800008004030240358408019016008420800448004420800001600001800008000010
16002416005316001311800028000010800008000030240182407979216001020800008000020800001600001800008000010
16002416005316001311800028000010800008000030240184407979216001020800008000020800001600001800008000010