Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASLB

Test 1: uops

Code:

  caslb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.001

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740083459030131030120030031503430031001300310016006130001001
740053433230041030030030031503430031001300310016006130001001
740053430730041030030030031503430031001300310016006130001001
740053431330041030030030031503430031001300310016006130001001
740053431130041030030030031503430031001300310016006130001001
740053431930041030030030031503430031001300310016006130001001
740053429730041030030030031503430031001300310016006130001001
740053433230041030030030031503430031001300310016006130001001
740053431230041030030030031503430031001300310016006130001001
740053433030041030030030031503430031001300310016006130001001

Test 2: throughput

Code:

  caslb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
502089024843919138753004413875300034728110391074572820201300032020160006156263000020100
502049006045727157263000115725300034724110390334572820201300032020160006156263000020100
502049005445727157263000115725300034724110390334572820201300032020160006156263000020100
502049005445727157263000115725300034724110390394572820201300032020160006156263000020100
502049005445727157263000115725300034724110390294572820201300032020160006156263000020100
502049005445727157263000115725300034724110390354572820201300032022560078149753000020100
502049005445727157263000115725300034724110390334572820201300032020160006156263000020100
502049005445727157263000115725300034724110390314572820201300032020160006156263000020100
502049005445727157263000115725300034724110390314572820201300032020160006156263000020100
502049005445727157263000115725300034724110390304572820201300032020160006156263000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
500289024943830137853004513785300034697110393144563820021300032002060000156263000020010
500249005445636156363000015635300004697110392764563520020300002004560078152273000020010
500249005445636156363000015635300004697110392674563520020300002002060000156263000020010
500249005445636156363000015635300004697110392744563520020300002002060000156263000020010
500249005445636156363000015635300004697110392734563520020300002002060000156263000020010
500249005445636156363000015635300004697110392824563520020300002002060000156263000020010
500249005445636156363000015635300004697110392784563520020300002002060000156263000020010
500249005445636156363000015635300004697110392754563520020300002002060000156263000020010
500259009844854148233003114821300004697110392874563520020300002002160006156263000020010
500249005145636156363000015635300004696410392384563520020300002002060000156263000020010

Test 3: throughput

Code:

  caslb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 11.5584

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
42944117520667861516705161912923078722271634133584395677267307872628414167164194453000013128
43007115595702511959605065516911080111280885126020597390272558012527241159431195673000012905
42973114731704331952905090417021079196275767127431596194269657923326965158829195713000012865
43001115579704631954705091617145079502296246130450196601269047950627105158530192963000012899
43000114289692221896305025916730079004269835130293795906269987901027099158494190893000012898
42998113615691121893805017416697078994265405126588695892270007900026898158954192243000012895
42991114458707171977905093817239079410259105126237296510268787941827010159631194953000012899
43001116869693621878905057316470080514271926125479397987272728061627287161322194373000012996
43005115625703091968405062517198079070276510130285296115268587911026828158522190113000012901
43007115584654191476305065613483080567275275121494998167275068062027300159700189943000012898

1000 unrolls and 10 iterations

Result (median cycles for code): 11.7276

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
42812117190676041557152033129737907928094313288620959162641379087026399158103201993000012735
42749117306720332022251811168277915521466213303690939572643779161026399158108201873000012736
42760117288715371966951868163987981428261113222250968062665479814026403158132202113000012736
42750117366720512023451817168337906728109513295910958962640179067026759160251200093000012796
42810117082719632001851945168188012128650113181280969372675780121026757160239199873000012796
42812117097679591596751992133778082127115713409381167968362928481070826402158138201593000012736
42773117172719232010051823168248034728802213135370971852685180353026850160690199353000012808
42822116962719401996151979168308120729334013066540980132711981207026449158391201283000012745
42810117097719632002051943168178012728700013177930969472675980127026755160237199753000012796
42750117321719812019451787168247905728058513284470958772639979057026400158102201583000012736