Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASPAL (64-bit)

Test 1: uops

Code:

  caspal x0, x1, x2, x3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
760073518230071030060030001100030002000400020007000130003000
760043486730011030000030001100030002000400020007000130003000
760043447130011030000030001100030002000400020007000130003000
760043437930011030000030001100030002000400020007000130003000
760043449430011030000030001100030002000400020007000130003000
760043449730011030000030001100030002000400020007000130003000
760043441430011030000030001100130002000400020007000130003000
760043479530011030000030001100030002000400020007000130003000
760043449330011030000030001100030002000400020007000130003000
760043464330011030000030001100030002000400020007000130003000

Test 2: throughput

Code:

  caspal x0, x1, x2, x3, [x6]
  add x6, x6, 16

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 19.0056

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
702341910905128021171301092103830003687857347195283030202400043020270007227363000040100
702041900615283722836300012282730003687857346965283030202400043020270007227363000040100
702051901015129321263300302125430003687857346965283030202400043020270007227363000040100
702041900565283722836300012282730003687857346965283030202400043020270007227363000040100
702041900565283722836300012282730003687857346965283030202400043020270007227363000040100
702051901005228822258300302224930003687857346995283030202400043020270007227363000040100
702041900565283722836300012282730036686407350925275930235400483020270007227363000040100
702041900565283722836300012282730003687857347015283030202400043020270007227363000040100
702051901005219322163300302215430003687857346975283030202400043020270007227363000040100
702041900595283722836300012282730003687857346965283030202400043020270007227363000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 19.0053

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
70054191073511892108303010620948030003688187350085274030022400043002270007227343000040010
70024190122527462274603000022737030000684637346285273730020400003018770392224303000040010
70024190117527492274803000122737030003685047348615274030022400043002270007227343000040010
70024190091527452274403000122737030003684857346365274030022400043002070000227343000040010
70025190094526712264103003022632030000685157346795273730020400003015270308228113000040010
70024190053527442274403000022737030000684777346055273730020400003015270308228203000040010
70024190053527442274403000022737030000684777346065273730020400003002070000227343000040010
70024190463529472283103011622824030036650327347775160130055400483002070000227363000040010
70024190053527442274403000022737030132690227365575295330152401763002070000227343000040010
70025190099513842135403003021347030132693887369635295630152401763002070000227343000040010

Test 3: throughput

Code:

  caspal x0, x1, x2, x3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 30.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
6020930014251996219590300371103203002434985923340416042862202164003202021670056254603000030102
6020630004355577255600300171283803005730636303341828041323202384007602021670056254603000030102
6020630004355577255600300171283803002434986163340460042862202164003202021670056254603000030102
6020630004355577255600300171283803005731657563341159041695202384007602021670056254603000030102
6020630004355577255600300171283803002434986163340460042862202164003202021670056254603000030102
6020630004355577255600300171283803005732839473341379042127202404007602021670056254603000030102
6020630004355577255600300171283803002434986163340460042862202164003202021670056254603000030102
6020630004355577255600300171283803005733986463340784042541202384007602021670056254563000030102
6020630004155573255560300171283703002434991033343963042861202164003202021670056254563000030102
6020630004155573255560300171283703009030842803342499041440202604012002021670056254563000030102

1000 unrolls and 10 iterations

Result (median cycles for code): 30.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
60029300151519132187330040109423002134983223340512427692003440028200347004902546030000030012
60026300043554862547030016127483002134983463340534427692003440028200587013302488030000030012
60026300043554862547030016127483007534014203341259424892007040100200347004902546030000030012
60026300043554862547030016127483002134983463340534427692003440028200347004902546030000030012
60027300073519292188430045109703002134983463340534427692003440028200347004902546030000030012
60026300043554862547030016127483002134983463340534427692003440028200347004902546030000030012
60028300113528062273130075114083002134983463340534427692003440028200347004902546030000030012
60026300043554862547030016127483002134983463340534427692003440028200347004902546030000030012
60029300125535752351230063117903002134983463340534427692003440028200347004902546030000030012
60026300043554862547030016127483002134983463340534427692003440028200347004902546030000030012