Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 2D)

Test 1: uops

Code:

  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.001

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
6400531150400914008400001200204000040000400014000
6400430228400114000400001200004000040000400014000
6400429994400114000400001200004000040000400014000
6400430278400114000400001200004000040000400014000
6400429660400114000400001200004000040000400014000
6400429642400114000400001200004000040000400014000
6400429731400114000400001203004000040000400214000
6400429759400114000400001200004000040000400014000
6400429606400114000400001200004000040000400014000
6400429609400114000400001200004000040000400014000

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
3202051601633201311013200301003200103009922483201102003200162003200161320000100
32020416011532010110132000010032010630023598323202062003201282003200161320000100
32020416006632010110132000010032001030025603143201102003200162003200161320000100
32020516036232013110132003010032005830020572083201582003200722003200721320000100
32020416007332010110132000010032001030011201943201102003200162003200161320000100
32020416005432010110132000010032001030025602063201102003200162003200161320000100
32020416005432010110132000010032001030025602063201102003200162003200721320000100
32020416005432010110132000010032001030025602063201102003200162003200161320000100
32020416005432010110132000010032001030025602063201102003200162003200161320000100
32020416005432010110132000010032001030025602063201102003200162003200161320000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
32002516020132004711320036103200103017431123200202032001620320000132000010
32002416005132001111320000103200003025599823200102032000020320000132000010
32002416004332001111320000103200583016596083200682032007220320000132000010
32002416004332001111320000103200003025599823200102032000020320000132000010
32002416004332001111320000103200003025599823200102032000020320000132000010
32002416004332001111320000103200003025599823200102032000020320000132000010
32002516010832004711320036103200583021484263200682032007220320000132000010
32002416004332001111320000103200003025599823200102032000020320000132000010
32002416004332001111320000103200003025599823200102032000020320000132000010
32002416008532001111320000103200003025599823200102032000020320072132000010