Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CAS (32-bit)

Test 1: uops

Code:

  cas w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.000

Issues: 3.000

Integer unit issues: 0.001

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
740073463330131301230031503430031001300310016006130001001
740053429530041300330001502230001000300010006000130001000
740043426630011300030001502230001000300010006000130001000
740043425630011300030001502230001000300010006000130001000
740043427630011300030001502230001000300010006000130001000
740043425830011300030001502230001000300010006000130001000
740043426930011300030001502230001000300010006000130001000
740043425630011300030001502230001000300010006000130001000
740043427130011300030001502230001000300010006000130001000
740043427130011300030001502230001000300010006000130001000

Test 2: throughput

Code:

  cas w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0054

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
5021070342439531388430069138833000347284790906045728202013000302020160006156263000020100
5020470058457271572630001157253000347237790863045728202013000302020160006156263000020100
5020570171445731451230061145113000347238790869045728202013000302020160006156263000020100
5020470051457271572630001157253000347238790894045728202013000302020160006156263000020100
5020470051457271572630001157253003947433791293045777202253003902020160006156263000020100
5020470051457271572630001157253000347237790863045728202013000302020160006156263000020100
5020470051457271572630001157253000347243790894045728202013000302022560078156383000020100
5020470058457271572630001157253000347236790863045728202013000302032360363157043000020100
5020470051457271572630001157253003946152790859045390202253003902020160006156263000020100
5020470508460041582330181158223000347237790844045728202013000302020160006156263000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
50034705834396113828301331383030003469967912454563820021300032002060000156263000020010
50024700584563615636300001563530000469967912274563520020300002002060000156263000020010
50024700584563615636300001563530000469967912294563520020300002002060000156263000020010
50024700584563615636300001563530000469967912414563520020300002004560078139533000020010
50024700584563615636300001563530000469967912294563520020300002002060000156263000020010
50024700584563615636300001563530000469967912254563520020300002002060000156263000020010
50024700584563615636300001563530000469967912294563520020300002002060000156263000020010
50024700584563615636300001563530000469977912334563520020300002002060000156263000020010
50024700584563615636300001563530000469967912364563520020300002004560078153683000020010
50024700584563615636300001563530000469967912384563520020300002002060000156263000020010

Test 3: throughput

Code:

  cas w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.5320

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
430291050376200113588484131274675766272950115435592299258847627426148154003153443000012931
431791076656919519765494301758677942290984118529695344266247859627150159475194233000013074
430071051786706418867481971648777972299480119886395106266267860426215153229186963000012902
431021057836722419167480571739277931308139121080495322267577871126148154488183383000012946
429911038886674018691480491676376231279464117498792840262377693926292154453182163000012960
430301043506618018375478051666876337301080119794193304261547725826940157966158213000013062
431881073986764619440482061766875835282555122447492541262077663126395155946188693000012996
430671063486635218546478061664877280247944113266193713262797760726332155599186503000012970
430911061986731118871484401723579030302883118710996441269897974326775157702185073000013015
428641040366542118246471751610178105313053122416295620266637859527013158952191523000013048

1000 unrolls and 10 iterations

Result (median cycles for code): 10.6100

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
42892105701629081360349305127577994128242312221839786426934806405252717264114227678372724726511
42682102469673211871548606155438018128094812256269821027015809132620015691901938030000012856
42815104358681341916848966164777769828102311768249449826098781702664515959601968830000012954
43030107610695841994949635180157886527980611997359625626534794562629715746501943330000012882
42861105111684021931949083168587949627983112163769726826775801732650615870901642630000012917
42799104108680181908248936164127687427547311566809328625795772652684116078201986030000012993
42895105630686941947949215171137826827906411886439536626311788012590015512101913330000012804
42909105790687551949849257171807882627803111993689624626509794032604815602401923030000012835
42829104661682411921149030166017814027791411842659518226270786622472514804201833130000012569
42945106291689621962449338174187786028312111775769472226165783492622815711101942430000012871