Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEORH

Test 1: uops

Code:

  steorh w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73006 | 34944 | 3055 | 1033 | 2022 | 1011 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 35013 | 3003 | 1003 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34194 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34184 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34169 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34147 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34147 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73005 | 34184 | 3004 | 1002 | 2002 | 1001 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34178 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000
73004 | 34182 | 3002 | 1002 | 2000 | 1000 | 2000 | 7760 | 10511 | 3000 | 1000 | 2000 | 2000 | 4000 | 1002 | 2000 | 1000

Test 2: throughput

Code:

  steorh w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0063

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206 | 30377 | 40258 | 20215 | 20043 | 20168 | 20007 | 115817 | 106038 | 40114 | 20207 | 20007 | 30211 | 40013 | 20010 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115854 | 105855 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40115 | 20110 | 20005 | 20107 | 20004 | 115912 | 105930 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115848 | 105843 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115837 | 105821 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115850 | 105847 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115898 | 105898 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115820 | 105791 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115842 | 105831 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100
40204 | 30063 | 40110 | 20108 | 20002 | 20104 | 20004 | 115850 | 105847 | 40108 | 20204 | 20004 | 30206 | 40008 | 20008 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0059

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40026 | 30403 | 40168 | 20124 | 20044 | 20075 | 20004 | 115718 | 105896 | 40018 | 20024 | 20004 | 30031 | 40013 | 20010 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115560 | 105738 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30060 | 40018 | 20018 | 20000 | 20010 | 20000 | 115567 | 105748 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115552 | 105721 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115576 | 105765 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115573 | 105760 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115559 | 105736 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115561 | 105733 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115589 | 105766 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010
40024 | 30059 | 40017 | 20017 | 20000 | 20010 | 20000 | 115566 | 105746 | 40010 | 20020 | 20000 | 30020 | 40000 | 20007 | 20000 | 20010

Test 3: throughput

Code:

  steorh w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.9754

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30205 | 130154 | 41363 | 21327 | 20036 | 10130 | 20000 | 2450042 | 2311933 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21310 | 20000 | 0 | 10100
30204 | 129761 | 41410 | 21410 | 20000 | 10100 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21309 | 20000 | 0 | 10100
30205 | 125608 | 40993 | 20948 | 20045 | 10129 | 20000 | 2449943 | 2311807 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21309 | 20000 | 0 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20050 | 2389491 | 2258531 | 30180 | 10230 | 20056 | 20200 | 40000 | 0 | 21257 | 20000 | 0 | 10100
30204 | 129272 | 41357 | 21357 | 20000 | 10100 | 20000 | 2440080 | 2302974 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21257 | 20000 | 0 | 10100
30204 | 129279 | 41381 | 21381 | 20000 | 10100 | 20000 | 2440080 | 2302974 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21257 | 20000 | 0 | 10100
30204 | 128793 | 41323 | 21323 | 20000 | 10100 | 20049 | 2423110 | 2288770 | 30178 | 10229 | 20055 | 20200 | 40000 | 0 | 21223 | 20000 | 0 | 10100
30204 | 128784 | 41313 | 21313 | 20000 | 10100 | 20000 | 2382064 | 2251984 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21309 | 20000 | 0 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449891 | 2311789 | 30100 | 10200 | 20000 | 20200 | 40000 | 0 | 21309 | 20000 | 0 | 10100
30204 | 129754 | 41409 | 21409 | 20000 | 10100 | 20000 | 2449942 | 2311807 | 30100 | 10200 | 20000 | 20256 | 40110 | 0 | 20781 | 20000 | 0 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 12.7197

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30025 | 134903 | 41813 | 21761 | 20052 | 10045 | 20000 | 2467601 | 2323517 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21361 | 20000 | 0 | 10010
30024 | 130350 | 41372 | 21371 | 20001 | 10010 | 20000 | 2467605 | 2323517 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21273 | 20000 | 0 | 10010
30025 | 130623 | 41362 | 21316 | 20046 | 10038 | 20000 | 2499138 | 2351681 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21453 | 20000 | 0 | 10010
30024 | 131899 | 41463 | 21462 | 20001 | 10010 | 20000 | 2499138 | 2351681 | 30010 | 10020 | 20000 | 20076 | 40109 | 0 | 21113 | 20000 | 0 | 10010
30024 | 129759 | 41275 | 21275 | 20000 | 10010 | 20000 | 2456248 | 2313083 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21273 | 20000 | 0 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456248 | 2313083 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21273 | 20000 | 0 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20049 | 2418537 | 2280214 | 30087 | 10050 | 20058 | 20020 | 40000 | 0 | 21273 | 20000 | 0 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20040 | 2456996 | 2313858 | 30071 | 10041 | 20042 | 20020 | 40000 | 0 | 21273 | 20000 | 0 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456248 | 2313083 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21273 | 20000 | 0 | 10010
30024 | 129754 | 41282 | 21282 | 20000 | 10010 | 20000 | 2456248 | 2313083 | 30010 | 10020 | 20000 | 20020 | 40000 | 0 | 21253 | 20000 | 0 | 10010