Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 4H)

Test 1: uops

Code:

  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
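63005 | 29696 | 3007 | 1 | 3006 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29381 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29369 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29369 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29352 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3003 | 1 | 3000
63004 | 29367 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29374 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29358 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29371 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000
63004 | 29382 | 3001 | 1 | 3000 | 3000 | 0 | 9000 | 3000 | 0 | 3000 | 3000 | 1 | 3000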

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
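240205 | 120182 | 240135 | 101 | 240034 | 100 | 240008 | 300 | 744242 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120062 | 240105 | 101 | 240004 | 100 | 240008 | 300 | 1440242 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120056 | 240101 | 101 | 240000 | 100 | 240008 | 300 | 1927160 | 0 | 240108 | 200 | 240012 | 0 | 202 | 240238 | 2 | 240000 | 100
240204 | 120569 | 240227 | 101 | 240126 | 100 | 240059 | 300 | 1063373 | 0 | 240159 | 200 | 240072 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120056 | 240101 | 101 | 240000 | 100 | 240008 | 300 | 1920446 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120070 | 240101 | 101 | 240000 | 100 | 240008 | 300 | 1920248 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120056 | 240101 | 101 | 240000 | 100 | 240008 | 300 | 1920248 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240068 | 1 | 240000 | 100
240204 | 120166 | 240101 | 101 | 240000 | 100 | 240008 | 300 | 1920428 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120056 | 240101 | 101 | 240000 | 100 | 240008 | 300 | 840242 | 0 | 240108 | 200 | 240012 | 0 | 200 | 240012 | 1 | 240000 | 100
240204 | 120298 | 240168 | 101 | 240067 | 100 | 240106 | 300 | 1014930 | 0 | 240206 | 200 | 240124 | 0 | 200 | 240123 | 1 | 240000 | 100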

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
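240025 | 120305 | 240041 | 11 | 240030 | 10 | 240008 | 30 | 1920012 | 240018 | 20 | 240012 | 20 | 240000 | 1 | 240000 | 10
240024 | 120055 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920144 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10
240024 | 120076 | 240011 | 11 | 240000 | 10 | 240008 | 30 | 1415870 | 240018 | 20 | 240012 | 20 | 240000 | 1 | 240000 | 10
240024 | 120060 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920326 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10
240024 | 120054 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920218 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10
240024 | 120054 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920218 | 240010 | 20 | 240000 | 20 | 240068 | 1 | 240000 | 10
240024 | 120045 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920218 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10
240024 | 120054 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920218 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10
240024 | 120054 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920218 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10
240024 | 120054 | 240011 | 11 | 240000 | 10 | 240000 | 30 | 1920218 | 240010 | 20 | 240000 | 20 | 240000 | 1 | 240000 | 10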