Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASALH

Test 1: uops

Code:

  casalh w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

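retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
74009 | 35217 | 3025 | 1 | 3024 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34560 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34398 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34447 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34397 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34432 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34400 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34396 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34396 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34383 | 3001 | 1 | 3000 | 3000 | 15035 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000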

Test 2: throughput

Code:

  casalh w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
502089026341834117863004811785300034288335497604426820201300030202016000601416630000020100
502049005844267142663000114265300034286935499804426820201300030202016000601416630000020100
502059010243740137103003013708300034288335496904426820201300030202016000601416630000020100
502049005844267142663000114265300034288335496604426820201300030202016000601416630000020100
502049005844267142663000114265300034288335497104426820201300030202016000601416630000020100
502049005844267142663000114265300034288335496904426820201300030202016000601416630000020100
502049005844267142663000114265300034288335496504426820201300030202016000601416630000020100
502049005844267142663000114265300034288135496504426820201300030202016000601416630000020100
502049005844267142663000114265300363886435524104295420223300360202016000601416630000020100
502049005844267142663000114265300034288135496504426820201300030202016000601416630000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5002890455417451169730048116953000342611355239044178200213000302002060000141663000020010
5002490058441761417630000141753000042613355217044175200203000002002060000141663000020010
5002490051441761417630000141753000042614355216044175200203000002002060000141663000020010
5002490051441761417630000141753000042609355186044175200203000002002060000141663000020010
5002490052441761417630000141753000042610355235044175200203000002002060000141663000020010
5002490051441761417630000141753000042613355209044175200203000002002060000141663000020010
5002490051441761417630000141753000042611355211044175200203000002002060000141663000020010
5002490051441761417630000141753000042611355204044175200203000002002060000141663000020010
5002490051441761417630000141753000042611355213044175200203000002002060000141663000020010
5002490051441761417630000141753000042610355228044175200203000002002060000141663000020010

Test 3: throughput

Code:

  casalh w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
402072401493346934343003517713000392064629384350342681020130003010201600060832330000010100
402052400753662865983003033643000392065829384810342681020130003010201600060829730000010100
402042400463839883973000142653000392065829384810342681020130003010201600060829730000010100
402062400743531252693004326943000392065829384810342681020130003010214600720476930000010100
402042400463839883973000142653000392065829384810342681020130003010201600060829730000010100
402042400463839883973000142653000392065829384810342681020130003010201600060829730000010100
402042400463839883973000142653000392065829384810342681020130003010201600060829730000010100
402042400463839883973000142653000392065829384810342681020130003010201600060829730000010100
402042400463839883973000142653000392065829384810342681020130003010212600720622130000010100
402052400973389138613003019853000392065829384810342681020130003010201600060829730000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 24.0046

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027240157333853345300401681300039203882938571341781002130003100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009204012938594341751002030000100206000083293000010010
40024240046383398339300004175300364998002938829323111003230036100206000083293000010010
40024240046383398339300004175300009203882938556341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100206000083293000010010
40024240046383398339300004175300009203882938554341751002030000100326007259593000010010