Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CAS (64-bit)

Test 1: uops

Code:

  cas x0, x1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.001

Issues: 3.003

Integer unit issues: 0.001

Load/store unit issues: 3.003

SIMD/FP unit issues: 0.000

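retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
74007 | 34849 | 3013 | 1 | 3012 | 3003 | 15032 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74006 | 65579 | 3007 | 1 | 3006 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34366 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34451 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34330 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34365 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34358 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34386 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34385 | 3004 | 1 | 3003 | 3003 | 15034 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001
74005 | 34272 | 3004 | 1 | 3003 | 3003 | 15036 | 3003 | 1001 | 3003 | 1001 | 6006 | 1 | 3000 | 1001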

Test 2: throughput

Code:

  cas x0, x1, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0051

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50214 | 70548 | 44034 | 13919 | 30115 | 13897 | 30003 | 47363 | 790986 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47337 | 790915 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47336 | 790927 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47334 | 790930 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47337 | 790919 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47349 | 790918 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30039 | 47570 | 791404 | 0 | 45775 | 20225 | 30039 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47337 | 790920 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70051 | 45728 | 15727 | 30001 | 15725 | 30003 | 47337 | 790916 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100
50204 | 70132 | 45776 | 15745 | 30031 | 15743 | 30003 | 47335 | 790965 | 0 | 45728 | 20201 | 30003 | 0 | 20201 | 60006 | 15627 | 30000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0060

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50034 | 70533 | 43932 | 13821 | 30111 | 13804 | 30003 | 47085 | 791291 | 45638 | 20021 | 30003 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47079 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010
50024 | 70058 | 45639 | 15639 | 30000 | 15635 | 30000 | 47084 | 791273 | 45635 | 20020 | 30000 | 20020 | 60000 | 15629 | 30000 | 20010

Test 3: throughput

Code:

  cas x0, x1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.5944

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
43095 | 106300 | 62425 | 14008 | 48417 | 13064 | 77473 | 307060 | 1216873 | 0 | 94734 | 26605 | 78311 | 0 | 27014 | 159016 | 19295 | 30000 | 13065
43035 | 105272 | 68040 | 19332 | 48708 | 17071 | 78006 | 293283 | 1192797 | 0 | 95221 | 26664 | 78588 | 0 | 26963 | 158618 | 19341 | 30000 | 13041
42909 | 104129 | 67307 | 19062 | 48245 | 16227 | 79556 | 309157 | 1214793 | 0 | 97213 | 27195 | 80197 | 0 | 26730 | 157443 | 19003 | 30000 | 13018
43063 | 106181 | 67541 | 19265 | 48276 | 17292 | 74682 | 267404 | 1101969 | 0 | 90690 | 25441 | 75087 | 0 | 26064 | 153659 | 18344 | 30000 | 12885
43138 | 106520 | 62455 | 13873 | 48582 | 13168 | 77401 | 286033 | 1182130 | 0 | 94594 | 26603 | 78181 | 0 | 26129 | 153745 | 18249 | 30000 | 12874
43067 | 104855 | 66633 | 18910 | 47723 | 16997 | 76807 | 299721 | 1210890 | 0 | 93751 | 26229 | 77597 | 0 | 26362 | 155664 | 18843 | 30000 | 12970
43074 | 105863 | 67106 | 19042 | 48064 | 17032 | 77721 | 284114 | 1194767 | 0 | 94925 | 26591 | 78271 | 0 | 26465 | 156109 | 19019 | 30000 | 12964
43184 | 107267 | 67344 | 18892 | 48452 | 17680 | 77128 | 300538 | 1221774 | 0 | 94237 | 26455 | 77921 | 0 | 25866 | 151684 | 18217 | 30000 | 12806
43003 | 104307 | 66434 | 18609 | 47825 | 16212 | 79406 | 294852 | 1159572 | 0 | 97133 | 27243 | 80131 | 0 | 26586 | 155165 | 18364 | 30000 | 12993
43100 | 106491 | 66426 | 18564 | 47862 | 16602 | 78266 | 258345 | 1216675 | 0 | 93732 | 26849 | 79083 | 0 | 26095 | 153687 | 19019 | 30000 | 12893

1000 unrolls and 10 iterations

Result (median cycles for code): 10.6102

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
43027 | 107639 | 63459 | 13774 | 49685 | 13167 | 78053 | 278434 | 1182163 | 95028 | 26230 | 78558 | 26196 | 156859 | 19386 | 30000 | 12860
42874 | 105424 | 68580 | 19409 | 49171 | 16972 | 74519 | 275253 | 1107415 | 89689 | 24939 | 74665 | 25780 | 154401 | 19108 | 30000 | 12783
43024 | 107527 | 69544 | 19961 | 49583 | 17971 | 78845 | 279009 | 1199825 | 96253 | 26519 | 79419 | 26711 | 160002 | 19769 | 30000 | 12966
42984 | 106947 | 67004 | 17461 | 49543 | 15973 | 78652 | 280333 | 1197038 | 95970 | 26459 | 79221 | 26200 | 156902 | 19379 | 30000 | 12863
42874 | 105220 | 68566 | 19388 | 49178 | 16926 | 78653 | 280452 | 1195010 | 95910 | 26447 | 79213 | 26177 | 156798 | 19340 | 30000 | 12861
42993 | 106978 | 69318 | 19818 | 49500 | 17839 | 78321 | 276929 | 1186481 | 95438 | 26333 | 78859 | 26509 | 158751 | 19594 | 30000 | 12925
42967 | 106594 | 69064 | 19650 | 49414 | 17517 | 79938 | 280395 | 1223849 | 97900 | 26929 | 80651 | 26941 | 161386 | 19879 | 30000 | 13012
42963 | 106666 | 69081 | 19699 | 49382 | 17566 | 79577 | 283381 | 1217313 | 97371 | 26801 | 80243 | 26925 | 161259 | 19873 | 30000 | 13010
43024 | 107530 | 69482 | 19891 | 49591 | 17929 | 78708 | 278911 | 1196347 | 96036 | 26473 | 79275 | 26973 | 161545 | 19923 | 30000 | 13018
42816 | 104467 | 68146 | 19184 | 48962 | 16530 | 80036 | 285472 | 1224070 | 97991 | 26956 | 80732 | 26162 | 156691 | 19354 | 30000 | 12857