Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEORL (64-bit)

Test 1: uops

Code:

  steorl x0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7300534496301810142004100220007767105180300010002000020004000100320001000
7300434217300310032000100020007760105110300010002000020024004100420001000
7300434226300210022000100020007760105110300010002000020004000100220001000
7300434348300210022000100020007767105180300010002000020004000100220001000
730053436130051003200210012000776510521030001000200008621724435862431
7300434948300210022000100020007760105110300010002000020004000100220001000
7300434353300210022000100020007760105110300010002000020004000100220001000
7300434481300210022000100020007760105110300010002000020004000100220001000
7300434534300210022000100020007760105110300010002000020004000100220001000
7300434404300210022000100020007760105110300010002000020004000100220001000

Test 2: throughput

Code:

  steorl x0, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40209605994049820411200872020120005116038965544011020205200053020840009200082000020100
40204600654010620106200002010220002116058962454010420202200023025440070200432000020100
40204600874010720107200002010220005116029965344011020205200053025440070200632000020100
40204600584012220119200032010520002116127962714010420202200023020340004200182000020100
40204600584011720117200002010220002116114962454010420202200023020340004200162000020100
40204600584011720117200002010220002116107962234010420202200023020340004200172000020100
40204600584011720117200002010220002116121962604010420202200023020340004200172000020100
40204600584011720117200002010220002116139962904010420202200023020340004200172000020100
40204600584011720117200002010220002116137962894010420202200023020340004200172000020100
40204600584011720117200002010220002116139962934010420202200023025240065200652000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40029605824035420271200832010620002115926961974001420022200023002840009200082000020010
40024600654001620016200002001020000115926962034001020020200003002040000200062000020010
40024600654001620016200002001020000115932962134001020020200003002040000200062000020010
40024600654001620016200002001020000115930962104001020020200003002040000200062000020010
400246006540016200162000020010200341061771003134007820054200343002040000200062000020010
40024600654001620016200002001020000115942962294001020020200003002040000200062000020010
40024600654001620016200002001020000115946962424001020020200003002040000200062000020010
40024600654001620016200002001020000115916961834001020020200003002040000200062000020010
40024600654001620016200002001020000115938962234001020020200003002040000200062000020010
40024600654001620016200002001020030904811048544007120051200303002040000200072000020010

Test 3: throughput

Code:

  steorl x0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7598

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30205110685431512100402214712250021821199860619376213355411838231162169442847196682000010100
30204107492406651982302084210824020285194758019020993063410449204842027640135191832000010100
30204107124399181959602032210142020424193595318996933089410573207372085641244193542000010100
30204106794400971951102058610485020905193805919145153180711002215672241244061202282000010100
30204107895404251983802058710537020136195246719206253036110326202452082241245194942000010100
30204106527401341945802067610631020802189417218786023153210830212462082841212190982000010100
30204106463396721933602033610348020226188085718661793051410389203612094841285192942000010100
30204107598396251948002014510110020343195874519112773073310491205842135242222195832000010100
30204106453401121957802053410440020669191049718875973135010785211612152642567197122000010100
30204107514400671961502045210491020076190434818815113024510269201312165042581194112000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4102

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300261148334745122526249251461524940208337820080923928914364282522884455493224452000010010
300241144954711022429246811458624624208123420061783868814077278862761453505224332000010010
300241140274594322258236851375724043208191220058083762913597269232866855356221032000010010
300241137284593122033238981403724440208957620128853833313905274422810054023223922000010010
300241144644668622565241211403724583208205120057363881814247278362838054869222002000010010
300241141654638822500238881397825007208329920081963946514471283912732853241223502000010010
300241141614630022488238121381924161207711120014863786313717272232777053997223062000010010
300241143904656922461241081397725046208110820043093955414523286192797854519222592000010010
300241141024622622426238001389524572209273720158723856214000275092866755643221362000010010
300241139094628022221240591421424614207193919976253870014100278452837654924223262000010010