Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEORLH

Test 1: uops

Code:

  steorlh w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005344583018101420041002200077701052130001000200020004000100320001000
73004344363004100420001000200077701052130001000200020004000100320001000
73004343433003100320001000200077731052730001000200020004000100320001000
73004344633003100320001000200077701052130001000200020004000100320001000
73004341033003100320001000200077701052130001000200020004000100320001000
73004340993003100320001000200077701052130001000200020004000100320001000
73004341003003100320001000200077701052130001000200020004000100320001000
73004341013003100320001000200077701052130001000200020004000100320001000
73004340873003100320001000200077751053130001000200020004000100320001000
73004341023003100320001000200077701052130001000200020004000100320001000

Test 2: throughput

Code:

  steorlh w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206606354028220234200482015020005115437957134011020205200053020840009200062000020100
40204600554010420104200002010220002115506953824010420202200023020340004200042000020100
40204600554010420104200002010220002115510953914010420202200023020340004200042000020100
40204600554010420104200002010220002115521954104010420202200023020340004200042000020100
40204600554010620106200002010220002115508953874010420202200023020340004200042000020100
40204600554010420104200002010220002115514953964010420202200023020340004200062000020100
40204600554010420104200002010220002115514953954010420202200023020340004200042000020100
40204600554010420104200002010220036111453978464017220236200363020340004200042000020100
40204600554010420104200002010220002115512953924010420202200023020340004200042000020100
40204600554010420104200002010220033108024990254016820235200333020340004200042000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40026602824014520107200382006020002115496954994001420022200023002040000200062000020010
40024600654001620016200002001020000115370953844001020020200003002040000200052000020010
40024600584001520015200002001020000115362953724001020020200003002040000200072000020010
40024600584001520015200002001020000115370953854001020020200003002040000200052000020010
40024600584001520015200002001020000115360953664001020020200003002040000200072000020010
40024600584001520015200002001020000115364953724001020020200003002340004200062000020010
40024600584001520015200002001020000115368953834001020020200003002040000200052000020010
40024600584001520015200002001020000115357953604001020020200003002040000200052000020010
40024600584001520015200002001020000115354953554001020020200003002040000200052000020010
40024600584001520015200002001020000115356953604001020020200003002040000200052000020010

Test 3: throughput

Code:

  steorlh w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7472

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
302051089334115720282208751085220861195844919113733169310932214502052640612189782000010100
302041092094082019712211081106320524191910018985133103610613208292030440192194762000010100
302041097634281120825219861214421704200086419379693330311726228942504148720211502000010100
302041076004031619739205771057820556195323519011773109710641208182025440101189962000010100
302041067103976019382203781038520183195262319083033041710334202492252444063200912000010100
302041076504131420139211751117020359192756118886173077610517206222026240107191042000010100
302041079934075020032207181077820767196978419136623151210847212822652451398216362000010100
302041103504284120951218901204022463203349219678853471712383240772144742386201112000010100
302041085664099520151208441089721128195193519160553216211176217602111641750197832000010100
302041063184005119571204801041620748195174819040593144610798211212443447632206082000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4086

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3002511504547375224862488914903250212094822201798739496144872815726820521852281120000010010
3002411447846183222132397014730247872075998200112739143143712812427622532672247520000010010
3002411370746446222132423314535252532069260199468939933146952878427910542852247620000010010
3002411439646575223102426514265242682098542202074737963137062687328067543882243920000010010
3002411369246019224712354813560245322079624200399538655141362783327705538812234720000010010
3002411367346147220912405614439236672087218201015536886132322614628422549222215320000010010
3002411407146508222852422314138243362084957200894138160138362720828094544772228720000010010
3002411408647057223752468214563247932070500199703839003142232799627019528762210920000010010
3002411357345789221962359313898246682077940200250538731140762792127972541232226720000010010
30024113658468122233024482144372443320711331997232383401391827524590381027754385538682423078