Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASPAL (32-bit)

Test 1: uops

Code:

  caspal w0, w1, w2, w3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
760123476030221302130001100030002000400020007000130003000
760043442730011300030001100030002000400020007000130003000
760043438630011300030001100030002000400020007000130003000
760043439330011300030001100030002000400020007000130003000
760043438930011300030001100030002000400020007000130003000
760043438830011300030001100030002000400020007000130003000
760043438830011300030001100030002000400020027007130003000
760043465230011300030001100030002000400020007000130003000
760043438730011300030001100030002000400020007000130003000
760043438330011300030001100030002000400020007000130003000

Test 2: throughput

Code:

  caspal w0, w1, w2, w3, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 19.0050

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
7022219102151194210990300952105003003668030734716526233023540048302027000702273230000040100
7020419005052833228320300012282703000368679734570528303020240004302027000702273230000040100
7020419005052833228320300012282703000368679734570528303020240004302027000702273230000040100
7020419005052833228320300012282703000368679734570528303020240004302357008402139430000040100
7020419005052833228320300012282703000368679734570528303020240004302027000702273230000040100
7020419005052833228320300012282703000368679734570528303020240004302027000702273230000040100
7020419005052833228320300012282703000368679734570528303020240004302027000702273230000040100
7020419004652833228320300012282703000368665734554528303020240004302357008402236630000040100
6558217454048838216650271732138003000369048735009528303020240004302027000702273230000040100
7020419010752833228320300012282703010269082736030529913030140136302357008402275730000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 19.0050

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
700391905785105320991030062209380300036848273467505274030022400040300557008402242530000040010
700241900565274722746030001227370300036841673457505274030022400040300207000002273230000040010
700241900505274222742030000227370300006841873455505273730020400000300207000002273230000040010
70024190050527422274203000022737030000684167345810527373002040000053951888563830695389798851566
700241900555274522744030001227370300006840073453505273730020400000300207000002273230000040010
700241900495274222742030000227370300006840073455205273730020400000300207000002273230000040010
700251901235213222102030030220970300006844473462305273730020400000300207000002273230000040010
700241900495274222742030000227370300006840073455205273730020400000300207000002273230000040010
700241900495274222742030000227370300006840073455205273730020400000300207000002273230000040010
700241900495274222742030000227370300366366773481105117030055400480300207000002273230000040010

Test 3: throughput

Code:

  caspal w0, w1, w2, w3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 30.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
6020930014651997219593003811032300243498592334043804286220216400320202167005602546030000030102
6020730014454510244643004612302300243498616334046004286220216400320202647021702384130000030104
6020630004355577255603001712838300243498616334046004286220216400320202387013302320830000030102
6020630004555577255603001712838300243498616334046004286220216400320202387013302453230000030102
6020630004355577255603001712838300243498616334046004286220216400320202167005602546030000030102
6020630004355577255603001712838300243498616334046004286220216400320202387013302535430000030102
6020630004355577255603001712838300243498616334046004286220216400320202167005602546030000030102
6020630004355577255603001712838300243498616334046004286220216400320202167005602546030000030102
6020730012152540224943004611320300243498616334046004286220216400320202167005602546030000030102
6020630004355577255603001712838300573251854334158704200720238400760202167005602546030000030102

1000 unrolls and 10 iterations

Result (median cycles for code): 30.0043

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
60029300146519052186703003810942030024349807133432444277120036400322003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282005870133218713000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60026300041554822546603001612747030021349809533432504276820034400282003470049254563000030012
60027300071526902264403004611348030021349809533432504276820034400282005870133236003000030012