Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASB

Test 1: uops

Code:

  casb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

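retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
74004 | 34658 | 3001 | 1 | 3000 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34860 | 3004 | 1 | 3003 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34491 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34835 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34460 | 3001 | 1 | 3000 | 3000 | 15018 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34297 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34300 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34286 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34289 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74005 | 34256 | 3004 | 1 | 3003 | 3000 | 15026 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000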

Test 2: throughput

Code:

  casb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5020870248439241387630048138763000347289790932045728202013000302022560078146513000020100
5020470060457271572630001157253000347252790933045728202013000302020160006156263000020100
5020470060457271572630001157253000347243790897045728202013000302020160006156263000020100
502047006045727157263000115725316111216638095414439493182348131956352020160006156263000020100
5020470060457271572630001157253000347243790893045728202013000302020160006156263000020100
5020470060457271572630001157253000347243790899045728202013000302020160006156263000020100
5020470060457271572630001157253000347244790900045728202013000302020160006156263000020100
5020470060457271572630001157253000347244790900045728202013000302020160006156263000020100
5020470060457271572630001157253000347243790899045728202013000302020160006156263000020100
5020470060457271572630001157253000347243790899045728202013000302020160006156263000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50024700884564115640300011563530003470037912574563820021300032002160006156263000020010
50024700584563615636300001563530000469997912234563520020300002002060000156263000020010
50024700514563615636300001563530000469677911024563520020300002002060000156263000020010
50024700514563615636300001563530000469977912184563520020300002002060000156263000020010
50024700514563615636300001563530000469677911024563520020300002002060000156263000020010
50024700514563615636300001563530000469677911024563520020300002002060000156263000020010
50024700514563615636300001563530000469677911024563520020300002002060000156263000020010
50025701024398513955300301395430000469987912294563520020300002002060000156263000020010
50024700514563615636300001563530000469967912154563520020300002002060000156263000020010
50024700514563615636300001563530000469677911024563520020300002002060000156263000020010

Test 3: throughput

Code:

  casb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.6080

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
431631070226305813780492781295876757295387117350093527262357734726730157682191993000013032
430861055476722118997482241710476346268655116368392965260687681426700157313189583000013013
431041064286776818988487801719077781275668118937594975265987839026007153330131343000012897
429421048926624618768474781659879393289074121263097294272058014927206160270194743000013097
432011072466895419718492361790279426288635121180197328272068015226537156673193163000012979
429681041606411915569485501405876516276938115498793289261747710626288154735187803000012945
430371054046672118746479751679077799296356121469195029267117840326614156559192273000012971
430861054026812219221489011705277285341003123587694697265337811326655157134190023000012989
430891061226765719145485121712173518294763112762088596250317370326074153400185273000012872
430491055706793619328486081699477590297575119165594409264857815327102159779194923000013069

1000 unrolls and 10 iterations

Result (median cycles for code): 10.6133

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
43000107796634391371949720130797939129505212119980969732671880020026473158550195663000012918
42932106148689401959049350173287934428070912106670970022670479980026529158890196413000012928
42805104288681191914248977164107698827775811592970933982584377383026417158211195313000012909
42922106080678261851249314165677451127504811062810896462492674652025843154762191193000012791
42805104288681191914248977164107795427960711817860948642619878456026949161454199853000013018
43032107684695872000549582180017993428709912267990979352694980727026632159535196613000012951
42805104288681151914248973164107698727779011593240933972584377383025843154762191193000012791
42930106133689391959449345172727856827962611929330958052642479126026373157952195033000012899
42950106445689551963149324174487889228015112037670963402655179511026783160407154723000012976
42878105371685291938849141169277837527906811907130955522635078918026167156716193303000012858