Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASAL (32-bit)

Test 1: uops

Code:

  casal w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740063576730071300630001503330001000300010006000130001000
740043441430011300030001503330001000300010006000130001000
740043434530011300030001502030001000300010006000130001000
740043434830011300030001503330001000300010006000130001000
740043505130011300030001502030001000300010006000130001000
740043487730011300030001503930001000300010006000130001000
740043470630011300030001502230001000300010006000130001000
740043429130011300030001502230001000300010006000130001000
740043430230011300030001502230001000300010006000130001000
740043429630011300030001502230001000300010006000130001000

Test 2: throughput

Code:

  casal w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
502109035441853117903006311789300034289335517904426820201300030202016000601416730000020100
502049005844268142673000114265300034286235507904426820201300030202236007201273730000020100
502049005144268142673000114265300034288835521104426820201300030202016000601416730000020100
502049005144268142673000114265300034286235508104426820201300030202016000601416730000020100
502049005144268142673000114265300034286235508204426820201300030202016000601416730000020100
502049005844268142673000114265300034285535506404426820201300030202016000601416730000020100
502049005144268142673000114265300034288535533704426820201300030202016000601416730000020100
502049008444271142703000114265300034288635511304426820201300030202016000601416730000020100
502049005444268142673000114265300034288635514004426820201300030202016000601416730000020100
502049005444268142673000114265300034288635514204426820201300030202016000601416730000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50030903454176011697030063116980300034256335531344178200213000320043600721205730000020010
50024900584417614176030000141750300004256335528944175200203000020020600001416630000020010
50025900954420714177030030141760300004258935540544175200203000020020600001416630000020010
50024900584417614176030000141750300004256335529344175200203000020020600001416630000020010
50024900514417614176030000141750300363581235567741949200433003620020600001416630000020010
50024900514417614176030000141750300004256335529144175200203000020020600001416630000020010
50024900584417614176030000141750300004256335528744175200203000020043600721236830000020010
50024900514417614176030000141750300004256335528744175200203000020020600001416630000020010
50024900514417614176030000141750300004256335529044175200203000020020600001416630000020010
50024900514417614176030000141750300004258935540044175200203000020020600001416630000020010

Test 3: throughput

Code:

  casal w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40208240320335043434300701773300367119872939246333591021230036102016000683303000010100
40204240044384318430300014265300039206502938456342681020130003102016000683303000010100
40204240044384318430300014265300039206502938456342681020130003102126007252923000010100
40204240044384318430300014265300039206502938456342681020130003102016000683303000010100
40204240044384318430300014265300039206502938456342681020130003102016000683303000010100
40204240044384318430300014265300364106692938784319941021230036102016000683303000010100
40204240044384318430300014265300039206502938456342681020130003102016000683303000010100
40204240044384318430300014265300039206502938456342681020130003102126007274053000010100
40204240044384318430300014265300039206502938456342681020130003102016000683303000010100
40204240044384318430300014265300039206502938456342681020130003102016000683303000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0044

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
4002924030233420334503007516820300039203882938548341781002130003100326007244113000010010
4002424004638339833903000041750300009203702938443341751002030000100326007253163000010010
4002424004638339833903000041750300009203672938430341751002030000100206000083293000010010
4002424003938339833903000041750300009203672938430341751002030000100326007276393000010010
4002424004438339833903000041750300039203802938546341781002130003100206000083303000010010
4002424004438340834003000041750300368073042938886337031003230036100436013863833000010010
4002424004938340834003000041750300009203802938529341751002030000100326007250113000010010
15038560480714471276795135066567769051320300364388002938970320351003230036100206000083303000010010
4002424004438340834003000041750300009203802938529341751002030000100206000083303000010010
4002424004438340834003000041750300366425232939131329561003230036100206000083303000010010