Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (4H)

Test 1: uops

Code:

  ld1r { v0.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

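retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
62005 | 29953 | 2018 | 1 | 1015 | 1002 | 1002 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29495 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29479 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29463 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62005 | 29477 | 2005 | 1 | 1003 | 1001 | 1001 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29450 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29503 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7770 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29470 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3002 | 7772 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 30262 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3000 | 7793 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000
62004 | 29544 | 2003 | 1 | 1002 | 1000 | 1000 | 1000 | 3001 | 7771 | 2000 | 1000 | 1000 | 1000 | 1000 | 1 | 1000 | 1000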

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.4h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60205 | 120176 | 70114 | 40101 | 20012 | 10001 | 30130 | 20031 | 10004 | 3208956 | 1257613 | 2575730 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60205 | 120080 | 70114 | 40106 | 20007 | 10001 | 30133 | 20031 | 10004 | 3208917 | 1257595 | 2575694 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10016 | 3209290 | 1257729 | 2575969 | 60180 | 30251 | 10016 | 20033 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3208963 | 1257683 | 2575851 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60205 | 120080 | 70114 | 40106 | 20007 | 10001 | 30133 | 20030 | 10004 | 3209422 | 1257873 | 2576228 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60205 | 120080 | 70114 | 40106 | 20007 | 10001 | 30133 | 20030 | 10004 | 3209341 | 1257844 | 2576166 | 60114 | 30212 | 10004 | 20008 | 60294 | 10016 | 20033 | 40006 | 10000 | 10000 | 40100
60204 | 120047 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10004 | 3210529 | 1258350 | 2577156 | 60114 | 30212 | 10004 | 20008 | 60302 | 10016 | 20033 | 40006 | 10000 | 10000 | 40100
60204 | 120064 | 70105 | 40101 | 20004 | 10000 | 30103 | 20007 | 10016 | 3209315 | 1271633 | 2590244 | 60179 | 30250 | 10016 | 20033 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60205 | 120080 | 70114 | 40106 | 20007 | 10001 | 30133 | 20031 | 10004 | 3209341 | 1257840 | 2576162 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100
60205 | 120136 | 70114 | 40106 | 20007 | 10001 | 30133 | 20029 | 10004 | 3209044 | 1257719 | 2575920 | 60114 | 30212 | 10004 | 20008 | 60224 | 10004 | 20008 | 40001 | 10000 | 10000 | 40100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60025 | 120158 | 70024 | 40011 | 20012 | 10001 | 30040 | 20031 | 10004 | 3208911 | 1259177 | 2578494 | 60024 | 30032 | 10004 | 20008 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10011 | 3210745 | 1259929 | 2579972 | 60074 | 30057 | 10012 | 20025 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10016 | 3210390 | 1259854 | 2579804 | 60090 | 30071 | 10016 | 20033 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60025 | 120075 | 70022 | 40016 | 20005 | 10001 | 30043 | 20031 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010
60024 | 120042 | 70013 | 40011 | 20002 | 10000 | 30010 | 20000 | 10000 | 3208898 | 1259187 | 2578495 | 60010 | 30020 | 10000 | 20000 | 60020 | 10000 | 20000 | 40001 | 10000 | 10000 | 40010

Test 3: throughput

Count: 8

Code:

  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5013

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160205 | 40282 | 160269 | 101 | 80129 | 80039 | 100 | 80039 | 80012 | 300 | 240278 | 640660 | 160124 | 200 | 80012 | 80012 | 202 | 80056 | 80056 | 2 | 80000 | 80000 | 100
160204 | 40122 | 160150 | 101 | 80041 | 80008 | 100 | 80012 | 80013 | 300 | 240343 | 640860 | 160126 | 200 | 80013 | 80013 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40116 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100
160204 | 40103 | 160139 | 101 | 80030 | 80008 | 100 | 80012 | 80012 | 300 | 240170 | 640504 | 160124 | 200 | 80012 | 80012 | 200 | 80012 | 80012 | 1 | 80000 | 80000 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160025 | 40673 | 160177 | 11 | 80128 | 80038 | 10 | 80038 | 80000 | 30 | 240232 | 641068 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40081 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 646812 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40062 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 652612 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40062 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 652612 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40062 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 652612 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40062 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 652612 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40062 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 652612 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40062 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 240232 | 652612 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160025 | 40122 | 160125 | 11 | 80075 | 80039 | 10 | 80055 | 80000 | 30 | 284099 | 695959 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10
160024 | 40064 | 160049 | 11 | 80038 | 80000 | 10 | 80000 | 80000 | 30 | 267744 | 696072 | 160010 | 20 | 80000 | 80000 | 20 | 80000 | 80000 | 1 | 80000 | 80000 | 10