Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STCLRLH

Test 1: uops

Code:

  stclrlh w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
7300534545301810142004100220007767105183000100020002000400001002200001000
7300434213300210022000100020007760105113000100020002000400001002200001000
7300434169300210022000100020007764105153000100020002000400001002200001000
7300434165300210022000100020007760105113000100020002000400001002200001000
7300434126300210022000100020007760105113000100020002000400001002200001000
7300434162300210022000100020007760105113000100020002000400001002200001000
7300434162300210022000100020007760105113000100020002000400001002200001000
7300434203300310032000100020007760105113000100020002000400001002200001000
7300434165300210022000100020007760105113000100020002000400001002200001000
7300434168300210022000100020007810105613000100020002000400001006200001000

Test 2: throughput

Code:

  stclrlh w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206604204025820210200482015020005115569958384011020205200053020340004200062000020100
40204600624011220109200032010520005115592958584011020205200053020340004200072000020100
40204600554010720107200002010220002115674955484010420202200023020840009200092000020100
40204600554010720107200002010220002115680955654010420202200023020340004200072000020100
40204600554010520105200002010220002115682955674010420202200023020340004200072000020100
40204600554010520105200002010220002115670955484010420202200023025440070200442000020100
40204600554010420104200002010220002115688955784010420202200023020340004200052000020100
40204600554010520105200002010220005115567958334011020205200053020340004200072000020100
40204600554010420104200002010220002115649955264010420202200023020340004200052000020100
40204600554010520105200002010220002115690955724010420202200023020340004200072000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
400276042440205201382006720088200021154209543040014200222000230020400002000520000020010
400246006240015200152000020010200001154539547540010200202000030020400002000520000020010
400246007240015200152000020010200001154169542540010200202000030020400002000620000020010
400246006240015200152000020010200001154199543040010200202000030020400002000520000020010
400246006240015200152000020010200001154169542640010200202000030020400002000520000020010
400246006240015200152000020010200001154489546640010200202000030020400002000620000020010
400246006240015200152000020010200001154089541240010200202000030020400002000520000020010
400246006240015200152000020010200001154149542240010200202000030020400002000520000020010
400246006240017200172000020010200369582510429140082200562003630020400002000520000020010
400246006540016200162000020010200001154219543540010200202000030020400002000520000020010

Test 3: throughput

Code:

  stclrlh w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7550

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30205113798463342205324281143222413320547821985304379001393526668239884694002059520000010100
30204107500402961977320523105362101219815681927369320221111121762206164078001926720000010100
30204105849391221889220230102032142919560501924828327151138722292218124291901981420000010100
30204106374398201953520285103412011519126321875194302971028220169205964071001923720000010100
30204110445434332110622327123192142819698461918240326801135422102224334372102005020000010100
30204106960395411939520146102022124419789281927883324091126722026214864216701970920000010100
30204107550407141988420830108322237320093181949461344541221623734229464441902025320285110502
30204106612399371947820459103422074819310911912244315231087621306212554187601936020000010100
30204106978401161964720469105432057519541561918591311651069220953220074331502024420000010100
30204109213415932039021203110832020319363981908237304411033920272203824034701928220000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4179

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
57396170997730543873810834208304618925148209143220144693974314606286932895055674225482000010010
30024114632468472262002422714020024642206935719946063881814188278132911655956223372000010010
30024114390464882247402401413710024107208825720121053777613682268392828454730224782000010010
30024114863472962249402480214638024473209129220140113838913928274532839654720225252000010010
30024114261463382268102365713507024758208483820089233896014213278502829254916225042000010010
30025114427464052245002395513736024638208893820119833880914183277282783054137224392000010010
30024114610470472252302452414192025166207776220027873984814695288202849655243224562000010010
30024114353468492235702449214377024702208732420107823877714088278422679052403225092000010010
30024114359466322254802408414149024641207753020027663866014030278392765653864223192000010010
30024113712461152207502404014306023984207601920010443745713485269092786053578223572000010010