Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASPA (64-bit)

Test 1: uops

Code:

  caspa x0, x1, x2, x3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
760073469130071030060030001100130002000400020007000130003000
760043443830011030000030001100130002000400020007000130003000
760043447230011030000030031101130032002400420007000130003000
760043448530011030000030001100130002000400020007000130003000
760043445630011030000030001100130002000400020007000130003000
760043443630011030000030001100130002000400020007000130003000
760043446330011030000030001100130002000400020007000130003000
760043444330011030000030001100130002000400020007000130003000
760043443030011030000030001100130002000400020007000130003000
760043445230011030000030001100130002000400020007000130003000

Test 2: throughput

Code:

  caspa x0, x1, x2, x3, [x6]
  add x6, x6, 16

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 17.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
70234171057512752116930106210383003663616645052511133023540048302027000702273430000040100
70204170058528352283430001228273000368755644734528303020240004302357008402146030000040100
70204170530530442292730117229203000369147645491528303020240004302027000702273430000040100
70204170055528352283430001228273000368755644734528303020240004302357008402198630000040100
70204170423529902290230088228953010269196646310530013030340136304687062302291430000040100
70205170344522812219330088221863000368755644734528303020240004303017023802280330000040100
70204170058528352283430001228273006969077645762529413026840092303017023802256630000040100
70204170797530942294830146229413099373416663963545003119841324302027000702274030000040100
702041700585283522834300012282730003687416448185283030202400043801957394161208112520411734351
70204170061528372283630001228273000368755644734528303020240004302027000702273430000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 17.0075

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
7005517121351232210960301362096903000369088645404527403002240004300207000002274030000040010
7002417007052750227500300002273703000068779644968527373002040000300207000002274030000040010
7002417007052750227500300002273703003666068645262518143005540048300207000002274030000040010
7002417007052750227500300002273703003664578645190513533005540048300207000002274030000040010
7002417008352746227460300002273703000068779644990527373002040000300207000002274030000040010
7002417007052750227500300002273703000068779644978527373002040000300207000002274030000040010
7002417007652750227500300002273703000068779644979527373002040000300557008402238730000040010
7002417006852750227500300002273703000068779644968527373002040000300207000002274030000040010
7002417007052750227500300002273703000068779644977527373002040000300207000002274030000040010
7002417007552750227500300002273703000068779645089527373002040000300207000002274030000040010

Test 3: throughput

Code:

  caspa x0, x1, x2, x3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 15.0048

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
602091501535199221955300371103230003178157016943914283220202400042020270007254533000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100
602071500895406124030300311207930030178114316942584286720220400402020270007254553000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100
602071500895387723845300321198530030178114316942584286720220400402020270007254553000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100
602041500465555625555300011282730003178159016944334283020202400042020270007254553000030100

1000 unrolls and 10 iterations

Result (median cycles for code): 15.0081

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
600291501595190121863300381094230069160319616939014155520068400922004070070254563000030014
600281500525548425466300181274630027178086516944244277320038400362003870063254563000030014
600281500525548425466300181274630027178086516944244277320038400362003870063254563000030014
600281500765548025462300181274630021178127516949634276720034400282013870413254663000030020
600261500865547825462300161274630069155575016933804121820066400922003470049254523000030012
600261500785547825462300161274630021178119516948884276720034400282008670231254623000030016
600261500825547825462300161274630021178119516948884276720034400282003470049254523000030012
600261500835547825462300161274630177178626216976394296920138402362003870063254523000030014
600291501225541025362300481271230057172862716948854243620058400762003470049254523000030012
600261500765547825462300161274630021178120716949074276720034400282013270392254613000030024