Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASAL (64-bit)

Test 1: uops

Code:

  casal x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740073515430101300930001503530001000300010006000130001000
740043478330011300030001503930001000300010006000130001000
740043457530011300030001503330001000300010006000130001000
740043476530011300030001503330001000300010006000130001000
740043545530011300030001510430001000300010006000130001000
740043513430011300030001503430001000300010006000130001000
740043462030011300030001503230001000300010006000130001000
740043447730011300030001503330001000300010006000130001000
740043454030011300030001503330001000300010006000130001000
740043445530011300030001503330001000300010006000130001000

Test 2: throughput

Code:

  casal x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50214905504190611807300991180230036365313556794217820223300362020160006141683000020100
50204900584426914268300011426530003428873554844426820201300032020160006141683000020100
50204900584426914268300011426530003428873554844426820201300032020160006141683000020100
50204900584426914268300011426530003428873554844426820201300032020160006141683000020100
50204900584426914268300011426530003428873554864426820201300032020160006141683000020100
50204900584426914268300011426530003428873554654426820201300032020160006141683000020100
50204900604426814267300011426530036387703557444292120223300362020160006141683000020100
50204900584426914268300011426530003428873554844426820201300032020160006141683000020100
50204900584426914268300011426530003428873554844426820201300032020160006141683000020100
50204900584426914268300011426530003428873554844426820201300032020160006141683000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50034905344181411715300991171230003426173557464417820021300032002160006141693000020010
50024900584417814178300001417530000425923556294417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002004360072139123000020010
50024900584417814178300001417530000425873555864417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002002060000141673000020010
50024900514417714177300001417530000425923556294417520020300002002060000141673000020010

Test 3: throughput

Code:

  casal x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
40205240126384498432030017426903000392064629384350342681020130003010201600060831130000010100
40204240046383988397030001426503000392063729383570342681020130003010201600060829730000010100
40205240098355055475030030280003000392063729383570342681020130003010201600060829730000010100
40204240039383988397030001426503000392063729383570342681020130003010201600060829730000010100
40204240039383988397030001426503000392063729383570342681020130003010212600720377830000010100
40204240039383988397030001426503000392063729383570342681020130003010201600060829730000010100
40204240042383988397030001426503000392063729383570342681020130003010214600720770230000010100
40205240072370497019030030357403000392063729383570342681020130003010201600060829730000010100
40204240039383988397030001426503000392063729383570342681020130003010201600060829730000010100
40204240039383988397030001426503000392063729383570342681020130003010201600060833030000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40029240299334183345300731682300365517422939132325431003230036100216000683293000010010
40024240046383408339300014175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100276003083303000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300367500362938787334421003230036100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40025240114352725242300302631300009203882938554341751002030000100326007269083000010010
40024240046383398339300004175300039203802938546341781002130003100206000083303000010010