Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STCLRLB

Test 1: uops

Code:

  stclrlb w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7300634795304010260201410070200278231060430031001200220004000100420001000
7300434850300310030200010000200077891056230001000200020004000100320001000
7300435847300310030200010000200081501090130001000200020004000100220001000
7300435351300510030200210010200077621051330001000200020004000100220001000
7300435113300510030200210010200077621051330001000200020004000100220001000
316113358498545313374504495532200077621051330001000200020004000100220001000
7300434371300510030200210010200077621051330001000200020004000100220001000
7300434382300510030200210010200077621051330001000200020004000100220001000
7300434270300210020200010000200077621051330001000200020004000100220001000
7300434258300210020200010000200077621051330001000200020064012100620001000

Test 2: throughput

Code:

  stclrlb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206603124025020206200442015220005115576958454011020205200053020840009200072000020100
40204600664010720107200002010220005115610958784011020205200053020340004200082000020100
40205601104017520143200322013420002115670955554010420202200023020340004200082000020100
402046006240108201082000020102200341022311015424016820234200343020340004200062000020100
40204600624010820108200002010220002115687955644010420202200023020340004200082000020100
40204600624010820108200002010220002115689955674010420202200023020340004200082000020100
40204600624010820108200002010220002115689955694010420202200023020340004200082000020100
402046006240108201082000020102200331022361014264016820235200333020340004200062000020100
40205601204017920145200342013620002115692955914010420202200023025440070200452000020100
40204600624011020107200032010520005115436957134011020205200053020340004200062000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40026602844014420106200382006020002115368953774001420022200023002040000200052000020010
40024600584001520015200002001020000115356953584001020020200003002040000200052000020010
40024600584001520015200002001020000115381953814001020020200003002040000200072000020010
40024600584001720017200002001020000115354953554001020020200003002040000200072000020010
40024600584001520015200002001020000115375953714001020020200003002040000200072000020010
40024600584001720017200002001020000115372953664001020020200003007440070200452000020010
40024600584001520015200002001020000115375953724001020020200003002040000200072000020010
40024600584001820018200002001020000115374953684001020020200003002040000200072000020010
40024600584001520015200002001020000115379953784001020020200003002040000200072000020010
40024600584001720017200002001020000115377953734001020020200003002040000200072000020010

Test 3: throughput

Code:

  stclrlb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7508

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
302051142704642322386240371408022446204210219733473471912377241592448847773213612000010100
302041092554210420442216621175521537197843219218963300711572226332142442298197542000010100
302041065033995819499204591041321105191387918900543199810996215102083641152190732000010100
302041048713896118816201451019920207189570118786093047410367203332028240163191602000010100
302041079954064519881207641056620471193943118942523096110590208132132841971194722000010100
302041079614147720086213911148922253198291719317043429012166235592088641323195422000010100
302041064433945019298201521018620148193178119060153038710339202742035640296190032000010100
302041060933920019174200261010620405197886519429443076510461205452241143980200592000010100
302041068654051419722207921077820678194466318977633144710869213202055040670191992000010100
302041076063927619086201901021920723196155719085633143910839211842199243260197082000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4199

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30025114199465512224502430614683025070208067720064803947114415283422759953764225022000010010
30024114408463852246202392314208024125207841420029133774013629270042793653662221262000010010
30024114375466272253502409213873024895207932320046573910514223281662800654603223422000010010
30025114241465252236502416014353024115208173820056073765513555266702839054976221642000010010
30024113979463712222002415114122024709208005820048273874514051278172931856676222452000010010
30024114116464692223502423414144024416208184220072573808713682274092933256652220672000010010
30024113580459232212402379913825024444206798819946433852114089276752787253913219682000010010
30024114375458952233402356113453024677207648520013983890214239279072791454075223762000010010
30024113684458232205002377313921024298208383720073313799213708271782828854349221512000010010
30024114335461952246402373113735024296207595320004213810613820270752750853418220782000010010