Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, Q)

Test 1: uops

Code:

  ldr q0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005124720771047103010521000177971704720001000100010011000
1004104620011001100010001000180331690620001000100010011000
1004105520011001100010001000177291693820001000100010011000
1004104720011001100010001000181451760520001000100010011000
1004105120011001100010001000171211742420001000100010011000
1004105120011001100010001000179771695720001000100010011000
1004104020011001100010001000183571694220001000100010011000
1004104120011001100010001000183251695920001000100010011000
1004107120011001100010001000179651681020001000100010011000
1004104320011001100010001000185531695920001000100010011000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, #8]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 8.9003

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020915349571403501071129110005402451004610002317125590931895579660108302081000310003602181000410003500011000040100
5020511921071381501061127410001401351001310003317134590851895500560110302091000410003602181000410003500011000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602181000410003500011000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602181000410003500011000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602181000410003500011000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602781001310013500081000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602181000410003500011000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602181000410003500011000040100
5020411938771373501011127210000401041000310003317318190904695560160110302091000410003602181000410003500011000040100
5020411900371373501011127210000401041000310003317099490842795490160110302091000410003602181000410003500011000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 8.8964

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002918277471310500161128910005401541004310003317097190935695585260020300291000410003600201000010000500011000040010
5002511924771294500181127510001400451001110000317035690918595564860010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000316962790896995540560010300201000010000600201000010000500011000040010
5002411896471280500111126910000400101000010000317000590908195553160010300201000010000600201000010000500011000040010

Test 3: throughput

Count: 8

Code:

  ldr q0, [x6, #8]!
  ldr q0, [x7, #8]!
  ldr q0, [x8, #8]!
  ldr q0, [x9, #8]!
  ldr q0, [x10, #8]!
  ldr q0, [x11, #8]!
  ldr q0, [x12, #8]!
  ldr q0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5514

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209453171605218037180150803748000724032471281016011520080012200800128000580000100
80204441161601058010580000801088000724032471320416011520080012200800128000580000100
80204441101601058010580000801088000724032471316816011520080012200800668005880000100
80204441081601058010580000801088000724032471340216011520080012200800128000580000100
80204441101601058010580000801088000724032471320416011520080012200800128000580000100
80204441101601058010580000801088000724032471320416011520080012200800128000580000100
80204441101601058010580000801088000724032471316816011520080012200800128000580000100
80204441101601058010580000801088000724032471316816011520080012200800128000580000100
80204441221601058010580000801088000724032471320416011520080012200800128000580000100
80204441251601058010580000801088000724032471324016011520080012200800128000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5517

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029454731604318028180150802808000724005471372816002520800122080000800018000010
80024441361600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010
80024441381600118001180000800108000024003071371516001020800002080000800018000010