Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 2D)

Test 1: uops

Code:

  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
630063033330071300630009000300030000300001300000
630042972930011300030009000300030000300001300000
630042969930011300030009000300030000300001300000
630042969330011300030009000300030000300001300000
630042970130011300030009000300030000300001300000
630042970430011300030009000300030000300001300000
630042969730011300030009000300030000300001300000
630043111530011300030009009300030000300001300000
630042973930011300030009000300030000300001300000
630043001330011300030009000300030000300001300000

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
240205120188240131101024003010002400083007680142401082002400122002400681240000100
2402041200762401011010240000100024000830019200662401082002400122002400121240000100
2402041200452401011010240000100024000830019200122401082002400122002400121240000100
2402041200452401011010240000100024000830019200122401082002400122002400121240000100
2402051203722401311010240030100024000830019201922401082002400122002400121240000100
2402041200452401011010240000100024000830019200122401082002400122002400681240000100
2402041200632401011010240000100024000830019204262401082002400122002400681240000100
17674098673176683185541748241676424010730718510602402092022401252002400121240000100
2402041200752401011010240000100024000830019201022401082002400122002400121240000100
2402041200452401011010240000100024000830019200122401082002400122002400121240000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
24002512017624004511024003410024000830120018824001820240012202400001240000010
24002412005124001111024000010024000030192016424001020240000202400001240000010
24002412005124001111024000010024000030192016424001020240000202400001240000010
24002512010424004511024003410024000030192016424001020240000202400001240000010
24002412005124001111024000010024000030192016424001020240000202400001240000010
240024120463240011110240000100240192301627985240202202402239952408724362403803572
24002412029324007111024006010024010730154298924011720240128202400001240000010
2400241202852400711102400601002400963094934024010620240113202401121240000010
2400241216732404011102403901002400083096018824001820240012202400001240000010
24002412005124001111024000010024000030192016424001020240000202400681240000010