Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASA (32-bit)

Test 1: uops

Code:

  casa w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740063455530071300630001503330001000300010006000130001000
740043423230011300030001503330001000300010006000130001000
740043422830011300030001503330001000300010006000130001000
740043528930011300030001503130001000300010006000130001000
740043430130011300030001503130001000300010006000130001000
740043424430011300030001503130001000300010006000130001000
740043424930011300030001503130001000300010006000130001000
740043425330011300030001503130001000300010006000130001000
740043428830011300030001503130001000300010006000130001000
740043425130011300030001503130001000300010006000130001000

Test 2: throughput

Code:

  casa w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50210703604186111792300691179330036358942804914196120223300362020160006141663000020100
50237703574201811955300631195330003428582798454426820201300032020160006141663000020100
50204700784426914268300011426530003429212799164426920201300032020160006141663000020100
50204700514426714266300011426530003429032799274426820201300032020160006141663000020100
50204700514426714266300011426530003429022799254426820201300032020160006141663000020100
50204700514426714266300011426530003429032799274426820201300032022360072123853000020100
50204700514426714266300011426530003429012799374426820201300032020160006141663000020100
50204700514426714266300011426530036416502803514386620223300362020160006141663000020100
50204700514426714266300011426530003429022799234426820201300032020160006141663000020100
50204700514426714266300011426530003429032799274426820201300032020160006141663000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50030704894177111702300691170330003426252802634417920021300032002060000141673000020010
50024700604417614176300001417530003426832803374417820021300032002060000141663000020010
50024700644417814178300001417530000426552802784417520020300002002060000141663000020010
50024700584417614176300001417530000427422804874417520020300002002060000141663000020010
50024700624417714177300001417530000426612803364417520020300002002060000141663000020010
50024700584417614176300001417530000426592803024417520020300002002060000141663000020010
50024700584417614176300001417530036395142807334314620043300362002060000141663000020010
50024700584417614176300001417530000425922801924417520020300002002060000141663000020010
50024700584417614176300001417530000425992802204417520020300002002060000141673000020010
50024700584417614176300001417530000425962801874417520020300002002060000141663000020010

Test 3: throughput

Code:

  casa w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 22.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
40207220298334773436030041177103000384570026985630342681020130003010201600060829730000010100
40204220046383988397030001426503000384572526986980342681020130003010201600060829730000010100
40204220039383988397030001426503003656677326988470329301021430036010201600060829730000010100
40204220039383988397030001426503000384570326986420342681020130003010201600060829730000010100
40204220039383988397030001426503000384570726986610342681020130003010201600060829730000010100
40205220067376037573030030385603000384572226987100342681020130003010201600060829730000010100
40204220045384008399030001426503000384570026986290342681020130003010201600060829730000010100
40204220039383988397030001426503000384570026986290342681020130003010201600060829730000010100
40204220039383988397030001426503000384570026986290342681020130003010201600060829730000010100
40205220078336393609030030185703000384573526987270342681020130003010201600060829730000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 22.0039

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
400282201823341133440300671683030003845409269859534178100213000310020600000832930000010010
400242200393833983390300004175030000845409269857834175100203000010031600660801630000010010
400242200563833983390300004175030000845409269857834175100203000010020600000832930000010010
124143523808110640573831044522135338598630003845449269890334178100213000310020600000832930000010010
400242200513833983390300004175030000845449269891034175100203000010032600720735030000010010
400242200503833983390300004175030000845449269890834175100203000010020600000832930000010010
400242200393833983390300004175030000845409269857834175100203000010020600000832930000010010
400242200403833983390300004175030003845397269856634178100213000310021600060832930000010010
400242200373833883380300004175030000845397269854934175100203000010020600000832830000010010
400252200953573857080300302866030000845397269854934175100203000010020600000832830000010010