Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD2 (multiple, 8H)

Test 1: uops

Code:

  ld2 { v0.8h, v1.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.004

Integer unit issues: 0.001

Load/store unit issues: 2.000

SIMD/FP unit issues: 2.004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
640052969040331202820042004200060001529040002000200020004000120002000
640042950440051200420002000200060001529040002000200020004000120002000
640042948940051200420002000200060001529040002000200020004000120002000
640042947240051200420002000200060001529040002000200020004000120002000
640042947440051200420002000200060001529040002000200020004000120002000
640042947640051200420002000200060001529040002000200020004000120002000
640042953040051200420002000200060001529040002000200020004000120002000
640042947340051200420002000200060001529040002000200020004000120002000
640042947640051200420002000200060001529040002000200020004000120002000
640042957640051200420002000200060001529040002000200020004000120002000

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ld2 { v0.8h, v1.8h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0049

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80205120155901214010130018200023013030024200043208854213908430844428010930209200063000960218200065001540001200002000040100
80204120051901074010130006200003010330002200043208952213919230845448010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80205120080901184010630010200023013330032200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200243210329214009630858388018930242200263004060218200065001540001200002000040100
80204120047901074010130006200003010330002200043209006213922830845968010930209200063000960284200265006640006200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0042

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80025120147900314001130018200023004030024200003209011213909630844968001030020200003000060020200005000040001200002000040010
80024120047900174001130006200003001030000200003208833213903230843618001030020200003000060020200005000040001200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060020200005000040001200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060104200265006640006200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060020200005000040001200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060020200005000040001200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060020200005000040001200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060020200005000040001200002000040010
80024120040900144001130003200003001030000200003208779213899630843098001030020200003000060086200205005140006200002000040010
80024120040900144001130003200003001030000200003208860213905030843878001030020200003000060104200265006640006200002000040010

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ld2 { v0.8h, v1.8h }, [x6]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80205120147901214010130018200023013030024200043208822213902430844018010930209200063000960218200065001540001200002000040100
80204120162901074010130006200003010330002200043209033213924630846228010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960280200265006540007200002000040100
80204120050901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80205120080901184010630010200023013330032200043208908213912030844948010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200243210896214047230863908018930242200263004060218200065001540001200002000040100
80204120050901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120047901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100
80204120127901074010130006200003010330002200043208898213915630844928010930209200063000960218200065001540001200002000040100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80025120151900314001130018200023004030024200243209407213937030848388009930062200263004060038200065001540001200002000040010
80024120047900174001130006200003001030000200003208968213910830844638001030020200003000060020200005000040001200002000040010
80024120047900174001130006200003001030000200003208968213910830844638001030020200003000060020200005000040001200002000040010
80024120047900174001130006200003001030000200003208968213910830844638001030020200003000060020200005000040001200002000040010
80024120047900174001130006200003001030000200243209282213933430847508009930062200263004060020200005000040001200002000040010
80024120047900174001130006200003001030000200243209254213931630847308009930062200263004060020200005000040001200002000040010
80024120047900174001130006200003001030000200003208968213910830844638001030020200003000060020200005000040001200002000040010
80024120047900174001130006200003001030000200003208968213910830844638001030020200003000060020200005000040001200002000040010
80024120050900174001130006200003001030000200003209076213918030845678001030020200003000060020200005000040001200002000040010
80024120047900174001130006200003001030000200003208968213910830844638001030020200003000060020200005000040001200002000040010

Test 4: throughput

Count: 8

Code:

  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  ld2 { v0.8h, v1.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0020

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3202058028032026710116012816003810016003816001230048027212820963201242001600121600122001600123200241160000160000100
3202048016032015310116004416000810016001216001230048027212820923201242001600121600122001600123200241160000160000100
3202048016032015310116004416000810016001216001230048026812820923201242001600121600122001600123200241160000160000100
3202048016032015310116004416000810016001216001230048027212820923201242001600121600122001600123200241160000160000100
3202048016032015310116004416000810016001216001230049837013001943201242001600121600121501612605923292837611212161164103095
3202048017032015310116004416000810016001216009630051398813165363202922001600961600962001600123200241160000160000100
3202048016032015310116004416000810016001216001230048026812820923201242001600121600122001601803203601160000160000100
3202048016032015310116004416000810016001216001230048027212820923201242001600121600122001600123200241160000160000100
3202048016032015310116004416000810016001216001230048026812820923201242001600121600122001600123200241160000160000100
3202048016032015310116004416000810016001216001230048026812820923201242001600121600122001600123200241160000160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3200258019532017711160128160038101600381600003050303613184283200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600543048058012971763201182016005416005420160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023212965963200102016000016000020160000320000116000016000010
3200248006632004911160038160000101600001600003048023612965963200102016000016000020160000320000116000016000010