Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, S)

Test 1: uops

Code:

  ldr s0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
1005 | 1220 | 2076 | 1046 | 1030 | 1052 | 1000 | 18209 | 16938 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1041 | 2001 | 1001 | 1000 | 1000 | 1000 | 18641 | 16994 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1057 | 2001 | 1001 | 1000 | 1000 | 1000 | 18621 | 16884 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1044 | 2001 | 1001 | 1000 | 1000 | 1000 | 18461 | 16804 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1045 | 2001 | 1001 | 1000 | 1000 | 1000 | 18053 | 16945 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1040 | 2001 | 1001 | 1000 | 1000 | 1000 | 18329 | 17048 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1046 | 2001 | 1001 | 1000 | 1000 | 1000 | 18581 | 16870 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1042 | 2001 | 1001 | 1000 | 1000 | 1000 | 18537 | 17050 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1056 | 2001 | 1001 | 1000 | 1000 | 1000 | 18537 | 16923 | 2000 | 1000 | 1000 | 1001 | 1000
1004 | 1063 | 2001 | 1001 | 1000 | 1000 | 1000 | 18145 | 17336 | 2000 | 1000 | 1000 | 1001 | 1000

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr s0, [x6], #8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0077

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020910138570175501171005110007402801005510002266123576724979477960108302081000310003602181000410003500011000040100
5020410008470103501011000210000401041000310003266058276710879463760110302091000410003602781001210013500111000040100
5020410009070103501011000210000401041000310003266033976703179456160110302091000410003602181000410003500011000040100
5020410008670103501011000210000401041000310003266077176716979470060110302091000410003602181000410003500011000040100
5020410009670103501011000210000401041000310003266084476719779472060110302091000410003602181000410003500011000040100
5020410009070103501011000210000401041000310003266044776706779460060110302091000410003602181000410003500011000040100
5020410007770103501011000210000401041000310003266036676704079457160110302091000410003602181000410003500011000040100
5020410008170103501011000210000401041000310003266055576710379462960110302091000410003602181000410003500011000040100
5020410007670103501011000210000401041000310003266039376704979457660110302091000410003602181000410003500011000040100
5020410007770103501011000210000401041000310003266398476818479568360110302091000410003602161000310003500011000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 7.0106

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50029101386700695001610048100054015410043100032660780767628795163600203002910004100036019810027100305003110000040010
50024100080700145001110003100004001010000100002660160767415794942600103002010000100006002010000100005000110000040010
500241005187004350031100081000440080100211000026608087676287951456001030020100001000052796124307612408599555733285
50024100563700445003110009100044008410024100002659897767307794834600103002010000100006002010000100005000110000040010
50024100418700285002110005100024004510011100002660187767432794956600103002010000100006002010000100005000110000040010
50024100064700135001110002100004001010000100002659917767342794869600103002010000100006009810012100135000810000040010
50024100063700135001110002100004001010000100002659944767351794878600103002010000100006002010000100005000110000040010
50024100066700135001110002100004001010000100092661556767815795366600643004910009100106019610028100315003110000040010
50024100889701045007110021100124022010061100112663012768270795855600683005810013100136002010000100005000110000040010
50024100503700435003110008100044008010021100102660566767532795081600653005010010100106002010000100005000110000040010

Test 3: throughput

Count: 8

Code:

  ldr s0, [x6], #8
  ldr s0, [x7], #8
  ldr s0, [x8], #8
  ldr s0, [x9], #8
  ldr s0, [x10], #8
  ldr s0, [x11], #8
  ldr s0, [x12], #8
  ldr s0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5403

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80209 | 44284 | 160525 | 80375 | 80150 | 80375 | 80006 | 240336 | 697260 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43224 | 160109 | 80109 | 80000 | 80112 | 80010 | 240336 | 696618 | 160122 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697060 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43217 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100
80204 | 43215 | 160109 | 80109 | 80000 | 80112 | 80006 | 240336 | 697096 | 160118 | 200 | 80016 | 200 | 80016 | 80009 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5402

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
80025 | 43608 | 160101 | 80071 | 80030 | 80074 | 80058 | 240225 | 692537 | 160133 | 20 | 80070 | 20 | 80014 | 80010 | 80000 | 10
80024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697163 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43218 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697174 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697037 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43233 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697110 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43217 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697073 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43216 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697073 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43221 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697055 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43218 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697073 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10
80024 | 43218 | 160011 | 80011 | 80000 | 80010 | 80000 | 240030 | 697222 | 160010 | 20 | 80000 | 20 | 80000 | 80001 | 80000 | 10