Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASA (64-bit)

Test 1: uops

Code:

  casa x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740063456230071300630001503130001000300010006000130001000
740043424630011300030001502030001000300010006000130001000
740043422530011300030001502030001000300010006000130001000
740043677630011300030001501230001000300010006000130001000
740043424430011300030001502030001000300010006000130001000
740043422430011300030001502030001000300010006000130001000
740043422530011300030001502030001000300010006000130001000
740043422430011300030001502030001000300010006000130001000
740043424430011300030001502030001000300010006000130001000
740043422530011300030001502030001000300010006000130001000

Test 2: throughput

Code:

  casa x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50204700834427014269300011426530003429072799334426920201300032020160006141683000020100
50204700584426914268300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428702798384426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100
50204700514426814267300011426530003428692798364426820201300032020160006141673000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50034706804183311719301141171630003426252802574417820021300032002060000141683000020010
50025701094370413674300301367130000426182800994417620020300002002060000141673000020010
50024700584417814178300001417530000426232801954417520020300002002060000141673000020010
50024700514417714177300001417530000426002800904417520020300002002060000141673000020010
50024700514417714177300001417530000426002800904417520020300002004360072127113000020010
50024700584417814178300001417530000426522802714417520020300002002060000141673000020010
50024700584417814178300001417530000425982801664417520020300002002060000141673000020010
50024700514417714177300001417530000426002800904417520020300002002060000141673000020010
50024700514417714177300001417530000426002800904417520020300002004360072130483000020010
50024700584417814178300001417530000426002800904417520020300002002060000141673000020010

Test 3: throughput

Code:

  casa x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
402082201793349634313006517723000384569226985600342681020130003010201600060833030000010100
402042200443843184303000142653000384569226986040342681020130003010201600060833030000010100
402042200443843184303000142653000384569226986040342681020130003010212600720742830000010100
402042200443843184303000142653000384569226986040342681020130003010201600060833030000010100
402042200443843184303000142653000384569226986040342681020130003010201600060833030000010100
402042200443843184303000142653000384569226986040342681020130003010212600720401530000010100
402042200443843184303000142653000384569226986040342681020130003010201600060833030000010100
402042200443843184303000142653000384569226986040342681020130003010201600060833030000010100
402042200443843184303000142653003660393026995440331101021230036010201600060829830000010100
402042200443843184303000142653000384569226986040342681020130003010201600060833030000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
400272202773338733463004116813000384541826986973417810021300031002060000832930000010010
400242200393833983393000041753000084543026987023417510020300001002060000832930000010010
400242200393833983393000041753003663488526989223317710032300361002060000832930000010010
400242200393833983393000041753003684563526994743421510032300361003160066832830000010010
400242201243836683373002941803003384554126996623421010031300331010860528831830000010010
400242212963869783203037742143003683373126990243415610032300361002060000833030000010010
400242200363833883383000041753000084542226986773417510020300001002060000833030000010010
400242200363833883383000041753000084542226986773417510020300001002060000833030000010010
400242200363833883383000041753003648638126988973244610032300361002060000833030000010010
400242200363833883383000041753000084542226986773417510020300001002060000833030000010010