Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (2S)

Test 1: uops

Code:

  ld1r { v0.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

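retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
62005 | 29829 | 2017 | 1 | 1014 | 1002 | 1002 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29589 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29518 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29448 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7768 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29560 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29685 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29745 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29523 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29622 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29556 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7767 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000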

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.2s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0049

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60205120152701144010120012100013013020031100043208937125753625755976011430212100042000860224100042000840001100001000040100
60204120049701054010120004100003010320007100043209017125770525758976011430212100042000860224100042000840001100001000040100
60204120052701054010120004100003010320007100043209017125770525758976011430212100042000860224100042000840001100001000040100
60204120049701054010120004100003010320007100043209017125770525758976011430212100042000860224100042000840001100001000040100
60204120055701054010120004100003010320007100043209179125777325760326011430212100042000860224100042000840001100001000040100
60204120049701054010120004100003010320007100043209017125770525758976011430212100042000860224100042000840001100001000040100
60204120049701054010120004100003010320007100043209017125770525758976011430212100042000860224100042000840001100001000040100
60204120049701054010120004100003010320007100163209376125782925761516018030251100162003360224100042000840001100001000040100
60204120049701054010120004100003010320007100043209071125772725759436011430212100042000860224100042000840001100001000040100
60204120051701054010120004100003010320007100043209017125770525758976011430212100042000860224100042000840001100001000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0049

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60025120159700244001120012100013004020031100163209420125935925788656009030071100162003360044100042000840001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010
60024120049700154001120004100003001020000100003209087125926425786566001030020100002000060020100002000040001100001000040010

Test 3: throughput

Count: 8

Code:

  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5014

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020540287160267101801288003810080038800123002402686411641601242008001280012200800128001218000080000100
16020440113160153101800448000810080012800123002402686418921601242008001280012200800128001218000080000100
16020440111160153101800448000810080012800123002402686418921601242008001280012200800128001218000080000100
16020440111160153101800448000810080012800123002916017019251601242008001280012200800128001218000080000100
16020440130160153101800448000810080012800543002701996782521602082008005480054200800128001218000080000100
16020440117160153101800448000810080012800123002402686418921601242008001280012200800128001218000080000100
16020440111160153101800448000810080012800123002402686418921601242008001280012200800128001218000080000100
16020440111160153101800448000810080012800123002402686418921601242008001280012200800128001218000080000100
16020440111160153101800448000810080012800123002402686418921601242008001280012200800128001218000080000100
16020440111160153101800448000810080012800123002582206639221601242008001280012200800128001218000080000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002540606160177118012880038108003880012302402046404201600342080012800122080000800001800008000010
16002440063160031118002080000108000080000302401686403401600102080000800002080000800001800008000010
16002440056160031118002080000108000080000302401686403401600102080000800002080000800001800008000010
16002440064160031118002080000108000080000302401686403401600102080000800002080000800001800008000010
16002440056160031118002080000108000080000302401686403401600102080000800002080055800551800008000010
1600244016116003111800208000010800008106765449360269751746163017256681264800652080054800541800008000010
16002440059160033118002280000108000080000302879687041171600102080000800002080000800001800008000010
16002440058160031118002080000108000080000302401686403401600102080000800002080000800001800008000010
16002440060160031118002080000108000080000302401686403401600102080000800002080000800001800008000010
16002440056160031118002080000108000080000302401666403401600102080000800002080000800001800008000010