Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD2 (multiple, post-index, 4S)

Test 1: uops

Code:

  ld2 { v0.4s, v1.4s }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 5.002

Integer unit issues: 1.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 2.002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
64005296165035100320282004100220042000300060001527450002000200030004000100120002000
64004294235003100120022000100020002000300060001527450002000200030004000100120002000
64004294155003100120022000100020002000300060001527450002000200030004000100120002000
64004294085003100120022000100020002000300060001527450002000200030004000100120002000
64004294095003100120022000100020002000300060001527450002000200030004000100120002000
64004294235003100120022000100020002000300060001527450002000200030004000100120002000
64004294235003100120022000100020002000300060001527450002000200030004000100120002000
64004294065003100120022000100020002000300060001527450002000200030004000100120002000
64004294095003100120022000100020002000300060001527450002000200030004000100120002000
64004294235003100120022000100020002000300060001527450002000200030004000100120002000

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ld2 { v0.4s, v1.4s }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0040

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
802051201531001225010230018200024013230030200063198925189831029030939011830209200063000960218300095001550001200002000040100
802051200731001185010930007200024013730033200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960272300365006050007200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100
802041200401001045010130003200004010430008200063199073189873229036549011830209200063000960218300095001550001200002000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
800251201491000325001230018200024004230030200063199302189924029045049002830029200063000960020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010
800251200801000295001730010200024004530035200003199326189928229045479001030020200003000060020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060092300365006050007200002000040010
800241200621000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010
800241200471000175001130006200004001030000200003199326189928229045479001030020200003000060020300005000050001200002000040010

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ld2 { v0.4s, v1.4s }, [x6], x8
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0075

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
802051201791001225010230018200024013230030200063199895189900029040449011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200811001075010130006200004010430008200063200018189929229044249011830209200063000960274300365005650007200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100
802041200751001075010130006200004010430008200063200018189929229044249011830209200063000960218300095001550001200002000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0053

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
800261201841000445001830022200044007330057200063199409189930429045929002830029200063000960020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800251200841000295001730010200024004530035200003200055189971429051419001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010
800241200531000175001130006200004001030000200003199488189937829046799001030020200003000060020300005000050001200002000040010

Test 4: throughput

Count: 8

Code:

  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  ld2 { v0.4s, v1.4s }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.2512

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3202051002194002878012116012816003880122160038160052240378486780128727640023020016005216005220024001832002480005160000160000100
3202041001004001438010516003016000880106160012160012240318480184128031440013020016001216001220024001832002480005160000160000100
3202041001004001438010516003016000880106160012160012240318480184128031440013020016001216001220024001832002480005160000160000100
3202041001004001438010516003016000880106160012160012240318480184128031440013020016001216001220024001832002480005160000160000100
3202041000994001408010416003016000680105160010160012240318480184128031440013020016001216001220024001532002080004160000160000100
3202041001004001438010516003016000880106160012160012240318480184128031440013020016001216001220024001832002480005160000160000100
3202041001004001438010516003016000880106160012160012240318480184128031440013020016001216001220024001832002480005160000160000100
3202041001004001438010516003016000880106160012160010240315480178128030240012520016001016001020024001832002480005160000160000100
3202041000994001408010416003016000680105160010160050240375480602128091440022520016005016005020024001832002480005160000160000100
3202041000994001408010416003016000680105160010160012240318480184128031440013020016001216001220024001832002480005160000160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.2507

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3200251001694001978003116012816003880032160038160012240048480292128065240004020160012160012202400003200008000116000016000010
3200241000584000478001116003616000080010160000160000240030481596128175040001020160000160000202400003200008000116000016000010
3200241000524000478001116003616000080010160000160000240030480256128057440001020160000160000202400003200008000116000016000010
3200241000514000478001116003616000080010160000160000240030480256128057440001020160000160000202400783201048002516000016000010
3200241000514000478001116003616000080010160000160000240030480256128057440001020160000160000202400003200008000116000016000010
3200241000524000478001116003616000080010160000160000240030480256128057440001020160000160000202400003200008000116000016000010
3200241000514000478001116003616000080010160000160000240030480256128057440001020160000160000202400003200008000116000016000010
3200241000514000478001116003616000080010160000160000240030480256128057440001020160000160000202400003200008000116000016000010
3200241000524000478001116003616000080010160000160000240030480256128057440001020160000160000202400003200008000116000016000010
3200241000534000478001116003616000080010160000160000240030480256128057440001020160000160000202400783201048002516000016000010