Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASL (64-bit)

Test 1: uops

Code:

  casl x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

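retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
74007 | 34537 | 3013 | 1 | 3012 | 3000 | 15027 | 3000 | 1000 | 3000 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34342 | 3004 | 1 | 3003 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34319 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34351 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34289 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74005 | 34333 | 3004 | 1 | 3003 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34352 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34384 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34350 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000
74004 | 34297 | 3001 | 1 | 3000 | 3000 | 15020 | 3000 | 1000 | 3000 | 1000 | 6000 | 1 | 3000 | 1000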

Test 2: throughput

Code:

  casl x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50214 | 90558 | 44016 | 13913 | 30103 | 13893 | 30039 | 43479 | 1039316 | 44455 | 20225 | 30039 | 20201 | 60006 | 15631 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47381 | 1039602 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47380 | 1039592 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47381 | 1039596 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47382 | 1039593 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47381 | 1039596 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50205 | 90102 | 44699 | 14668 | 30031 | 14664 | 30003 | 47381 | 1039602 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47380 | 1039590 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47381 | 1039594 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100
50204 | 90058 | 45730 | 15729 | 30001 | 15725 | 30003 | 47381 | 1039596 | 45728 | 20201 | 30003 | 20201 | 60006 | 15629 | 30000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 9.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50034 | 90555 | 43918 | 13817 | 30101 | 13800 | 30003 | 47094 | 1039868 | 45638 | 20021 | 30003 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30039 | 43365 | 1039600 | 44414 | 20045 | 30039 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010
50024 | 90060 | 45640 | 15640 | 30000 | 15635 | 30000 | 47093 | 1039766 | 45635 | 20020 | 30000 | 20020 | 60000 | 15630 | 30000 | 20010

Test 3: throughput

Code:

  casl x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 11.5132

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
42961 | 115919 | 71065 | 19847 | 51218 | 17094 | 79711 | 273994 | 1259512 | 96978 | 27160 | 79712 | 27202 | 159631 | 19380 | 30000 | 12899
42954 | 114862 | 71436 | 20077 | 51359 | 16945 | 78922 | 265443 | 1302440 | 95977 | 27006 | 79026 | 27214 | 159235 | 19695 | 30000 | 12901
43006 | 115596 | 71005 | 19891 | 51114 | 17111 | 79260 | 278949 | 1279616 | 96408 | 27047 | 79269 | 26596 | 157176 | 19573 | 30000 | 12798
43000 | 114187 | 69634 | 19293 | 50341 | 16993 | 78900 | 272655 | 1293955 | 95836 | 26829 | 78903 | 27195 | 159075 | 19437 | 30000 | 12897
43003 | 114156 | 70307 | 19694 | 50613 | 17295 | 79743 | 287510 | 1240965 | 96920 | 26986 | 79750 | 26735 | 156843 | 19367 | 30000 | 12806
42911 | 110077 | 68899 | 18813 | 50086 | 16626 | 78945 | 280939 | 1309946 | 95938 | 26922 | 79022 | 26608 | 156687 | 18857 | 30000 | 12833
43002 | 115611 | 71120 | 20187 | 50933 | 17296 | 80405 | 259980 | 1182847 | 97860 | 27458 | 80410 | 27113 | 158578 | 19409 | 30000 | 12899
43096 | 114231 | 71210 | 19517 | 51693 | 17385 | 79804 | 277948 | 1257155 | 97102 | 27205 | 79813 | 26723 | 157206 | 17984 | 30000 | 12844
42997 | 115598 | 70295 | 19642 | 50653 | 17181 | 79069 | 281224 | 1307044 | 96111 | 26905 | 79075 | 27149 | 159167 | 19385 | 30000 | 12900
43101 | 115596 | 70375 | 19581 | 50794 | 17493 | 79465 | 282921 | 1287617 | 96673 | 26985 | 79467 | 27103 | 159215 | 19590 | 30000 | 12902

1000 unrolls and 10 iterations

Result (median cycles for code): 11.7314

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
42812 | 117244 | 67610 | 15581 | 52029 | 12977 | 79089 | 281028 | 1329592 | 95926 | 26412 | 79092 | 26431 | 158295 | 16914 | 30000 | 12735
42810 | 117089 | 72306 | 20348 | 51958 | 17062 | 79361 | 281963 | 1326461 | 96255 | 26499 | 79361 | 26393 | 158084 | 20186 | 30000 | 12735
42810 | 117090 | 72306 | 20347 | 51959 | 17062 | 80282 | 269160 | 1318762 | 96923 | 26811 | 80285 | 26417 | 158228 | 20216 | 30000 | 12739
42750 | 117356 | 72050 | 20233 | 51817 | 16833 | 79067 | 281143 | 1329489 | 95896 | 26401 | 79067 | 26780 | 160394 | 19790 | 30000 | 12794
42810 | 117089 | 72307 | 20349 | 51958 | 17062 | 80135 | 282622 | 1318302 | 97197 | 26759 | 80135 | 26759 | 160268 | 20326 | 30000 | 12796
42749 | 117327 | 72028 | 20216 | 51812 | 16827 | 79067 | 281103 | 1329580 | 95900 | 26403 | 79067 | 26401 | 158132 | 20195 | 30000 | 12736
42750 | 117356 | 72050 | 20233 | 51817 | 16833 | 79067 | 281143 | 1329489 | 95896 | 26401 | 79067 | 26759 | 160268 | 20326 | 30000 | 12796
42810 | 117090 | 72307 | 20348 | 51959 | 17062 | 80134 | 282675 | 1318360 | 97196 | 26759 | 80135 | 26762 | 160286 | 20339 | 30000 | 12796
42810 | 117086 | 72301 | 20347 | 51954 | 17062 | 80134 | 282675 | 1318359 | 97196 | 26759 | 80135 | 26772 | 160334 | 19723 | 30000 | 12794
42832 | 117001 | 72395 | 20385 | 52010 | 17145 | 80129 | 282610 | 1318152 | 97188 | 26757 | 80129 | 26757 | 160256 | 20311 | 30000 | 12796