Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEORLB

Test 1: uops

Code:

  steorlb w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7300534382303310230201010050200077701052130001000200020004000100320001000
7300434122300310030200010000200077701052130001000200020004000100320001000
7300434093300310030200010000200077701052130001000200020004000100320001000
7300434096300310030200010000200077701052130001000200020004000100320001000
7300434101300310030200010000200077701052130001000200020004000100320001000
7300434455300310030200010000200077701052130001000200020004000100320001000
7300434108300310030200010000200077701052130001000200020004000100320001000
7300434112300310030200010000200077701052130001000200020004000100320001000
7300434305300310030200010000200077691052030001000200020004000100320001000
7300434290300310030200010000200077701052130001000200020004000100320001000

Test 2: throughput

Code:

  steorlb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207604204029920229200702018420002115719955874010420202200023020340004200062000020100
40204600654011120108200032010520002115717955814010420202200023020340004200062000020100
40204600654010620106200002010220036999011027134017220236200363020340004200062000020100
40205601214017220144200282013320002115719955884010420202200023020340004200062000020100
40204600654010620106200002010220002115725955964010420202200023020340004200062000020100
40204600654010620106200002010220002115721955894010420202200023020340004200062000020100
40204600654010620106200002010220002115719955874010420202200023020340004200062000020100
40204600654010620106200002010220002115717955844010420202200023020340004200062000020100
40204600674010620106200002010220002115727955994010420202200023020340004200062000020100
40204600654010620106200002010220002115713955754010420202200023020340004200062000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027603244020320138200652008820002115422954324001420022200023002040000200052000020010
40024600624001520015200002001020000115281952914001020020200003002040000200042000020010
40024600554001520015200002001020000115275952874001020020200003002040000200042000020010
40024600554001620016200002001020000115299953184001020020200003002040000200052000020010
40024600554001420014200002001020000115275952884001020020200003002040000200042000020010
40024600554001520015200002001020000115279952964001020020200003002040000200042000020010
40024600554001520015200002001020000115274952874001020020200003002040000200072000020010
40024600554001520015200002001020000115285952994001020020200003002040000200042000020010
40024600554001420014200002001020000115276952904001020020200003006840060200392000020010
40024600554001420014200002001020000115282952904001020020200003002040000200052000020010

Test 3: throughput

Code:

  steorlb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7485

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
302051111494345021038224121257320759199268619313083150210843212742071640967196592000010100
302041107654319021170220201209020337198451019227333071510478205072078841120193622000010100
302041079804139720281211161111320930196177419108943180010978214772126442076196982000010100
302041066283970619406203001031620320193962919189423066210443204602020440010192752000010100
302041059893968319322203611038220327188559318737153071610512205362051440603192872000010100
302041081244098019934210461101120598190040318947883116010662209102196443156198142000010100
302041075954084719991208561082820396189493918815803084810555207042028840166195132000010100
302041095264038619798205881045320758190651918962733148510827212092217043668199292000010100
302051089344206720504215631162820582196621219145623117910701209752100241570194162000010100
302041076074012019681204391035822064199846119410823392511969233032372946455205872000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4098

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300251146364705822544245141517424632208941420118583888614271280312819054831228942000010010
300241153854566523417222481191924020209203820153593756013551267702812454551225392000010010
300241143914653322689238441390324527207458920002473845313939276062729453239224482000010010
300241141194656722403241641407824475208614520095753851414050275522826054817225512000010010
300241143344640122254241471425724911208205920066083941914521283912827854554223412000010010
300241142184663122228244031428923987208983120128943751013534266402822754779223572000010010
300241136284584021937239031390025174208352020080863981314654285412942956349224282000010010
300251146034617722413237641402824598208538720098993854813961277562960856542224862000010010
300241139654717424365228091334425073208100620056793960314541283062815455024223342000010010
300241143754682622642241841422224637207631420010683871314090277952871055674224892000010010