Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, Q)

Test 1: uops

Code:

  ldr q0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005 | 1231 | 2076 | 1046 | 1030 | 1052 | 1000 | 17929 | 17154 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1055 | 2001 | 1001 | 1000 | 1000 | 1000 | 17445 | 16921 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1043 | 2001 | 1001 | 1000 | 1000 | 1000 | 17925 | 17658 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1037 | 2001 | 1001 | 1000 | 1000 | 1000 | 18033 | 16921 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1051 | 2001 | 1001 | 1000 | 1000 | 1000 | 17985 | 17315 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1052 | 2001 | 1001 | 1000 | 1000 | 1000 | 18005 | 17138 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1068 | 2001 | 1001 | 1000 | 1000 | 1000 | 17833 | 16938 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1051 | 2001 | 1001 | 1000 | 1000 | 1000 | 17565 | 17119 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1037 | 2001 | 1001 | 1000 | 1000 | 1000 | 18021 | 17316 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1042 | 2001 | 1001 | 1000 | 1000 | 1000 | 18285 | 17533 | 2000 | 1000 | 1000 | 1001 | 1000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6], #8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 8.8977

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50209 | 154410 | 71403 | 50107 | 11291 | 10005 | 40245 | 10045 | 10003 | 3170600 | 918169 | 955635 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 118976 | 71368 | 50101 | 11267 | 10000 | 40104 | 10003 | 10003 | 3169898 | 917961 | 955401 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 118963 | 71367 | 50101 | 11266 | 10000 | 40104 | 10003 | 10003 | 3169898 | 917961 | 955401 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 119053 | 71368 | 50101 | 11267 | 10000 | 40104 | 10003 | 10003 | 3169898 | 917961 | 955401 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 118963 | 71367 | 50101 | 11266 | 10000 | 40104 | 10003 | 10003 | 3169520 | 917849 | 955289 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 118963 | 71367 | 50101 | 11266 | 10000 | 40104 | 10003 | 10003 | 3169898 | 917961 | 955401 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 118963 | 71367 | 50101 | 11266 | 10000 | 40104 | 10003 | 10012 | 3174312 | 919284 | 956797 | 60160 | 30239 | 10014 | 10014 | 60274 | 10013 | 10013 | 50008 | 10000 | 40100
50204 | 118963 | 71367 | 50101 | 11266 | 10000 | 40104 | 10003 | 10003 | 3169898 | 917961 | 955401 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 118963 | 71367 | 50101 | 11266 | 10000 | 40104 | 10003 | 10003 | 3175514 | 919625 | 957130 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100
50204 | 119197 | 71366 | 50101 | 11265 | 10000 | 40104 | 10003 | 10003 | 3190826 | 924162 | 962008 | 60110 | 30209 | 10004 | 10003 | 60218 | 10004 | 10003 | 50001 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 8.9003

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50029 | 163122 | 71312 | 50016 | 11291 | 10005 | 40154 | 10043 | 10002 | 3170973 | 918703 | 953957 | 60018 | 30028 | 10003 | 10003 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 118975 | 71281 | 50011 | 11270 | 10000 | 40010 | 10000 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119003 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3170616 | 918637 | 955581 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119017 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3174261 | 919717 | 956723 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119003 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119003 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119003 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119003 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50024 | 119003 | 71283 | 50011 | 11272 | 10000 | 40010 | 10000 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010
50025 | 119035 | 71293 | 50018 | 11274 | 10001 | 40045 | 10011 | 10000 | 3170994 | 918749 | 955707 | 60010 | 30020 | 10000 | 10000 | 60020 | 10000 | 10000 | 50001 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ldr q0, [x6], #8
  ldr q0, [x7], #8
  ldr q0, [x8], #8
  ldr q0, [x9], #8
  ldr q0, [x10], #8
  ldr q0, [x11], #8
  ldr q0, [x12], #8
  ldr q0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5514

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209 | 45303 | 160527 | 80377 | 80150 | 80377 | 80007 | 240336 | 712091 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44117 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713130 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44109 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713202 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44111 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713202 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44108 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713094 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44108 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713202 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44108 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713130 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44108 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713166 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44111 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713094 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100
80204 | 44111 | 160109 | 80109 | 80000 | 80112 | 80007 | 240336 | 713202 | 0 | 160119 | 200 | 80016 | 0 | 200 | 80016 | 80009 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5516

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80029 | 45244 | 160429 | 80279 | 80150 | 80282 | 80011 | 240066 | 713748 | 160033 | 20 | 80016 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44138 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80072 | 80061 | 80000 | 10
80024 | 44131 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44126 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44129 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713497 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44125 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44130 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44127 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44127 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713515 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 44122 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 713479 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10