Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASL (32-bit)

Test 1: uops

Code:

  casl w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.001

Issues: 3.003

Integer unit issues: 0.001

Load/store unit issues: 3.003

SIMD/FP unit issues: 0.000

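retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
74007 | 34998 | 3013 | 1 | 3012 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34679 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34813 | 3001 | 1 | 3000 | 3000 | 15012 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34248 | 3001 | 1 | 3000 | 3006 | 15088 | 3006 | 1002 | 3006 | 1000 | 6000 | 1 | 3000 | 1000
74005 | 34550 | 3004 | 1 | 3003 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34521 | 3001 | 1 | 3000 | 3000 | 15022 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34723 | 3001 | 1 | 3000 | 3000 | 15027 | 3000 | 1000 | 3000 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34528 | 3004 | 1 | 3003 | 3003 | 15032 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34517 | 3004 | 1 | 3003 | 3003 | 15032 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34354 | 3004 | 1 | 3003 | 3003 | 15032 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001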

Test 2: throughput

Code:

  casl w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
50212 | 90499 | 44006 | 13907 | 30099 | 13903 | 30003 | 47274 | 1039158 | 45728 | 20201 | 30003 | 20201 | 60006 | 15626 | 30000 | 0 | 20100
50204 | 90051 | 45727 | 15726 | 30001 | 15725 | 30003 | 47234 | 1039103 | 45728 | 20201 | 30003 | 20201 | 60006 | 15626 | 30000 | 0 | 20100
50204 | 90051 | 45727 | 15726 | 30001 | 15725 | 30003 | 47234 | 1039102 | 45728 | 20201 | 30003 | 20201 | 60006 | 15626 | 30000 | 0 | 20100
50204 | 90051 | 45727 | 15726 | 30001 | 15725 | 30003 | 47234 | 1039104 | 45728 | 20201 | 30003 | 20225 | 60078 | 15296 | 30000 | 0 | 20100
50204 | 90051 | 45727 | 15726 | 30001 | 15725 | 30003 | 47234 | 1039099 | 45728 | 20201 | 30003 | 20201 | 60006 | 15626 | 30000 | 0 | 20100
50204 | 90051 | 45727 | 15726 | 30001 | 15725 | 30003 | 47234 | 1039105 | 45728 | 20201 | 30003 | 20225 | 60078 | 14070 | 30000 | 0 | 20100
50205 | 90095 | 45291 | 15260 | 30031 | 15259 | 30003 | 47236 | 1039107 | 45728 | 20201 | 30003 | 20201 | 60006 | 15626 | 30000 | 0 | 20100
50204 | 90051 | 45727 | 15726 | 30001 | 15725 | 30003 | 47240 | 1039121 | 45728 | 20201 | 30003 | 20297 | 60294 | 15690 | 30000 | 0 | 20100
50204 | 90217 | 45783 | 15752 | 30031 | 15743 | 30003 | 47234 | 1039078 | 45728 | 20201 | 30003 | 20225 | 60078 | 15644 | 30000 | 0 | 20100
50205 | 90179 | 44707 | 14646 | 30061 | 14644 | 30003 | 47239 | 1039012 | 45728 | 20201 | 30003 | 20249 | 60150 | 15656 | 30000 | 0 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50030 | 90347 | 43855 | 13792 | 30063 | 13790 | 30003 | 47001 | 1039531 | 45638 | 20021 | 30003 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039508 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039509 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039509 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50025 | 90104 | 44495 | 14464 | 30031 | 14463 | 30000 | 47001 | 1039511 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039510 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039510 | 45635 | 20020 | 30000 | 20045 | 60078 | 15127 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47016 | 1039538 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039509 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010
50024 | 90060 | 45636 | 15636 | 30000 | 15635 | 30000 | 47001 | 1039509 | 45635 | 20020 | 30000 | 20020 | 60000 | 15626 | 30000 | 20010

Test 3: throughput

Code:

  casl w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 11.5594

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
42953 | 115990 | 66153 | 14939 | 0 | 51214 | 12898 | 0 | 79116 | 281258 | 1263510 | 0 | 96199 | 27033 | 79121 | 0 | 26909 | 157752 | 19002 | 30000 | 12894
43001 | 115630 | 70318 | 19689 | 0 | 50629 | 16902 | 0 | 79130 | 281959 | 1313913 | 0 | 96190 | 26885 | 79135 | 0 | 26885 | 158230 | 19948 | 30000 | 12839
42998 | 115752 | 71170 | 19776 | 0 | 51394 | 17007 | 0 | 79610 | 287602 | 1292522 | 0 | 96810 | 27205 | 79615 | 0 | 26912 | 157423 | 18863 | 30000 | 12896
43002 | 115536 | 69812 | 19036 | 0 | 50776 | 17056 | 0 | 79693 | 281709 | 1263821 | 0 | 96740 | 26970 | 79698 | 0 | 26937 | 159205 | 19105 | 30000 | 12900
43002 | 115597 | 70688 | 19684 | 0 | 51004 | 17189 | 0 | 79250 | 280315 | 1281552 | 0 | 96242 | 26851 | 79353 | 0 | 27003 | 159001 | 19292 | 30000 | 12898
43003 | 115660 | 70275 | 19320 | 0 | 50955 | 16535 | 0 | 78709 | 263699 | 1320095 | 0 | 95704 | 26902 | 78716 | 0 | 26751 | 157465 | 19317 | 30000 | 12836
42961 | 116259 | 70597 | 19602 | 0 | 50995 | 16983 | 0 | 79744 | 270776 | 1295458 | 0 | 96842 | 27019 | 79843 | 0 | 27890 | 164904 | 19781 | 30000 | 13072
42980 | 115461 | 70359 | 19530 | 0 | 50829 | 17052 | 0 | 80335 | 188106 | 1205862 | 0 | 93799 | 27417 | 80341 | 0 | 26978 | 158907 | 19093 | 30000 | 12900
43004 | 115582 | 69204 | 19104 | 0 | 50100 | 16981 | 0 | 78679 | 256344 | 1306476 | 0 | 95777 | 26924 | 78780 | 0 | 26914 | 158171 | 18878 | 30000 | 12907
42943 | 117111 | 71978 | 20309 | 0 | 51669 | 17078 | 0 | 79551 | 283595 | 1294458 | 0 | 96912 | 27166 | 79554 | 0 | 26889 | 158871 | 19427 | 30000 | 12891

1000 unrolls and 10 iterations

Result (median cycles for code): 11.7124

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
42815 | 117110 | 67536 | 15544 | 51992 | 12983 | 79097 | 280757 | 1325828 | 95930 | 26407 | 79101 | 26400 | 158160 | 20162 | 30000 | 12737
42751 | 117171 | 71909 | 20164 | 51745 | 16824 | 79061 | 280812 | 1329095 | 95888 | 26399 | 79061 | 26763 | 160258 | 19978 | 30000 | 12797
42812 | 116940 | 71928 | 20011 | 51917 | 16813 | 80125 | 286019 | 1316117 | 96936 | 26761 | 80125 | 26406 | 158145 | 20147 | 30000 | 12738
42752 | 117255 | 71904 | 20142 | 51762 | 16828 | 79049 | 280681 | 1329791 | 95871 | 26395 | 79049 | 26689 | 159780 | 20034 | 30000 | 12785
42750 | 117314 | 71996 | 20205 | 51791 | 16827 | 79163 | 271133 | 1330559 | 95698 | 26442 | 79176 | 26715 | 159999 | 16088 | 30000 | 12773
42749 | 117375 | 71563 | 19728 | 51835 | 16414 | 80138 | 282442 | 1317976 | 97204 | 26768 | 80146 | 26395 | 158101 | 20165 | 30000 | 12736
42810 | 117074 | 72281 | 20334 | 51947 | 17058 | 80148 | 282617 | 1316728 | 97212 | 26766 | 80148 | 26420 | 158200 | 20178 | 30000 | 12739
42753 | 117149 | 71966 | 20192 | 51774 | 16842 | 79152 | 280335 | 1324255 | 96011 | 26443 | 79153 | 26397 | 158079 | 20152 | 30000 | 12736
42810 | 117059 | 72113 | 20160 | 51953 | 16915 | 80139 | 285674 | 1317751 | 97054 | 26763 | 80139 | 26400 | 158097 | 20171 | 30000 | 12736
42810 | 117105 | 69640 | 17656 | 51984 | 14776 | 79482 | 283082 | 1322534 | 96301 | 26544 | 79482 | 26754 | 160221 | 19975 | 30000 | 12795