Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASALB

Test 1: uops

Code:

  casalb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
74007345613010130093000150333000100030001000600001300001000
74004345863001130003000150333000100030001000600001300001000
74004342333001130003000150333000100030001000600001300001000
74004342293001130003000150333000100030001000600001300001000
74004342303001130003000150333000100030001000600001300001000
74004342353001130003000150333000100030001000600001300001000
74004342343001130003000150333000100030001000600001300001000
74004342663001130003000150353000100030001000600001300001000
74004343723001130003000150333000100030001000600001300001000
74004344443001130003000150333000100030001000600001300001000

Test 2: throughput

Code:

  casalb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50208902544183011785300451178530003428983549874426820201300032022360072128703000020100
50204900604426714266300011426530003428933549724426820201300032020160006141663000020100
50204900604426714266300011426530003428933549994426820201300032020160006141663000020100
50204900604426714266300011426530003428933550064426820201300032020160006141663000020100
50204900604426714266300011426530036385953554434286020223300362020160006141663000020100
50204900604426714266300011426530003428843549644426820201300032020160006141683000020100
50204900604426714266300011426530003428933549774426820201300032020160006141663000020100
50204900604426714266300011426530003428933549974426820201300032020160006141663000020100
50204900604426714266300011426530003428933550044426820201300032020160006141663000020100
50204900604426714266300011426530003428933550004426820201300032020160006141663000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50028909244174511697300481169530003426113552434417820021300032002060000141663000020010
50024900584417614176300001417530000426133552234417520020300002002060000141663000020010
50024900514417614176300001417530000426133552184417520020300002002060000141663000020010
50024900514417614176300001417530000426133552134417520020300002004360072118913000020010
50024900514417614176300001417530000426133552234417520020300002002060000141663000020010
50024900514417614176300001417530000426133552124417520020300002002060000141663000020010
50024900514417614176300001417530000426133552184417520020300002002060000141663000020010
50024900514417614176300001417530000426113552214417520020300002002060000141663000020010
50024900514417614176300001417530000426133552184417520020300002002060000141663000020010
50024900514417614176300001417530000426133552224417520020300002002060000141663000020010

Test 3: throughput

Code:

  casalb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207240155334733435300381771300039206132938282342681020130003102016000683283000010100
40204240036384298428300014265300039206252938305342681020130003102016000683283000010100
40205240065375147484300303798300039206252938305342681020130003102016000683283000010100
40204240036384298428300014265300039206252938328342681020130003102016000683283000010100
40204240046384318430300014265300039206372938334342681020130003102016000682973000010100
40204240042383988397300014265300039206372938361342681020130003102126007270903000010100
40204240042383988397300014265300039206412938370342681020130003102016000682973000010100
40204240039383988397300014265300039206582938442342681020130003102016000682973000010100
40204240039383988397300014265300366009952939052328551021230036102016000682973000010100
40204240039383988397300014265300039206372938357342681020130003102126007260563000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027240155333833345300381681300039203682938523341781002130003100326007252813000010010
40024240044383418340300014175300009203802938529341751002030000100326007241353000010010
40024240044383408340300004175300009203802938529341751002030000100206000083303000010010
40025240073382348204300304109300009203802938529341751002030000100206000083303000010010
40024240044383408340300004175300009203802938529341751002030000100206000083303000010010
40024240048383408340300004175300009203802938529341751002030000100326007266073000010010
40024240044383408340300004175300009203802938529341751002030000100206000083303000010010
40024240044383408340300004175300009203802938529341751002030000100206000083303000010010
40024240044383408340300004175300365590762939187325771003230036100206000083303000010010
40024240045383408340300004175300009203802938529341751002030000100206000083303000010010