Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, D)

Test 1: uops

Code:

  ldr d0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005 | 1233 | 2073 | 1043 | 1030 | 1052 | 1000 | 18073 | 17048 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1042 | 2001 | 1001 | 1000 | 1000 | 1000 | 18473 | 16813 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1051 | 2001 | 1001 | 1000 | 1000 | 1000 | 17877 | 17227 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1034 | 2001 | 1001 | 1000 | 1000 | 1000 | 17953 | 17304 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1033 | 2001 | 1001 | 1000 | 1000 | 1000 | 18453 | 16972 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1049 | 2001 | 1001 | 1000 | 1000 | 1000 | 18525 | 16814 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1038 | 2001 | 1001 | 1000 | 1000 | 1000 | 18369 | 17246 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1038 | 2001 | 1001 | 1000 | 1000 | 1000 | 18085 | 17154 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1033 | 2001 | 1001 | 1000 | 1000 | 1000 | 18337 | 16944 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1033 | 2001 | 1001 | 1000 | 1000 | 1000 | 18317 | 16812 | 2000 | 1000 | 1000 | 1001 | 1000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr d0, [x6, #8]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0074

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50209101151701615010710049100054024510045100022660236766940794475601083020810003100036021810004100035000110000040100
50204100086701045010110003100004010410003100032660555767097794638601103020910004100036021810004100035000110000040100
50204100088701045010110003100004010410003100032660285767009794550601103020910004100036021810004100035000110000040100
50204100088701045010110003100004010410003100032660636767121794662601103020910004100036021810004100035000110000040100
50204100094701045010110003100004010410003100112662440767647795233601583023810013100136021810004100035000110000040100
50204100074701045010110003100004010410003100032660231766993794534601103020910004100036021810004100035000110000040100
50204100074701045010110003100004010410003100032660231766993794534601103020910004100036021810004100035000110000040100
50204100074701045010110003100004010410003100032660231766993794534601103020910004100036021810004100035000110000040100
50204100074701045010110003100004010410003100032660231766993794534601103020910004100036021810004100035000110000040100
50204100074701045010110003100004010410003100032660231766993794534601103020910004100036021810004100035000110000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0076

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002910120470067500161004610005401541004410003266215276805479557960020300291000410003600201000010000500011000040010
5002410006670014500111000310000400101000010000266007976739279492560010300201000010000600201000010000500011000040010
5002410006670014500111000310000400101000010000266086276763479516660010300201000010000600201000010000500011000040010
5002410006670014500111000310000400101000010000266007976739279492560010300201000010000600201000010000500011000040010
5002410006670014500111000310000400101000010000266007976739279492560010300201000010000600201000010000500011000040010
5002410006670014500111000310000400101000010000266007976739279492560010300201000010000600201000010000500011000040010
5002410007670015500111000410000400101000010000266034976747279500560010300201000010000600201000010000500011000040010
5002410006670014500111000310000400101000010000266007976739279492560010300201000010000600201000010000500011000040010
5002510014670028500181000910001400451001210000266007976739279492560010300201000010000600201000010000500011000040010
5002410006670014500111000310000400101000010000266007976739279492560010300201000010000600201000010000500011000040010

Test 3: throughput

Count: 8

Code:

  ldr d0, [x6, #8]!
  ldr d0, [x7, #8]!
  ldr d0, [x8, #8]!
  ldr d0, [x9, #8]!
  ldr d0, [x10, #8]!
  ldr d0, [x11, #8]!
  ldr d0, [x12, #8]!
  ldr d0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8020944187160524803748015080376800072403246964740160115200800120200800128000980000100
8020443223160109801098000080108800062403246971500160114200800120200800128000980000100
8020443220160109801098000080108800062403246971500160114200800120200800128000980000100
8020443220160109801098000080108800062403246971500160114200800120200800128000980000100
8020543282160191801618003080160800072403246973840160115200800120200800128000980000100
8020543473160189801598003080161800062403246971860160114200800120200800128000980000100
8020443220160109801098000080108800062403246972940160114200800120200800128000980000100
8020443220160109801098000080108800062403246972400160114200800120202800708006480000100
80204432301601098010980000801128224027813271330231116499539148266323200800128000980000100
8020443230160109801098000080108800072403246972450160115200800120200800128000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029442101604258027580150802788000724005469729416002520800122080000800018000010
80024432241600118001180000800108000024003069726016001020800002080000800018000010
80024432231600118001180000800108000024003069726016001020800002080000800018000010
80024432231600118001180000800108000024003069726016001020800002080000800018000010
80024432231600118001180000800108000024003069726016001020800002080000800018000010
80024432231600118001180000800108000024003069726016001020800002080000800018000010
80024432331600118001180000800108000024003069726016001020800002080000800018000010
80024432311600118001180000800108000024003069732316001020800002080000800018000010
80024432301600118001180000800108000024003069729616001020800002080000800018000010
80024432251600118001180000800108000024003069726016001020800002080000800018000010