Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STEORL (32-bit)

Test 1: uops

Code:

  steorl w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
7300634581304010262014100720007762105133000100020002000400001002200001000
7300434174300210022000100020007762105133000100020002000400001002200001000
7300434178300210022000100020007762105133000100020002000400001002200001000
7300434153300210022000100020007762105133000100020002000400001002200001000
7300434149300210022000100020007762105133000100020002000400001002200001000
7300434167300210022000100020007762105133000100020002002400401003200001000
7300434371300210022000100020007762105133000100020002000400001002200001000
7300434116300210022000100020007762105133000100020002000400001002200001000
7300434155300210022000100020007762105133000100020002000400001002200001000
7300434346300210022000100020007762105133000100020002000400001002200001000

Test 2: throughput

Code:

  steorl w0, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207603994033220277200552016820002115780956914010420202200023020840009200072000020100
40204600584010520105200002010220002115649955634010420202200023020340004200052000020100
40204600584010520105200002010220002115643955524010420202200023020340004200052000020100
40204600584010520105200002010220002115653955684010420202200023020340004200052000020100
40204600584010520105200002010220002115659955824010420202200023020340004200052000020100
40205601064017220140200322013420002115645955564010420202200023020340004200052000020100
40204600584010520105200002010220002115665955954010420202200023020340004200052000020100
40204600584010520105200002010220002115659955814010420202200023020340004200052000020100
40204600584010520105200002010220002115649955644010420202200023020340004200052000020100
40204600584010520105200002010220002115657955784010420202200023020340004200052000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027604004021020158200522007420002115418954604001420022200023002840009200072000020010
40024600584001520015200002001220002115420954674001420022200023002340004200052000020010
40024600584001520015200002001020000115410954534001020020200003002040000200052000020010
40024600584001520015200002001020000115410954554001020020200003002040000200052000020010
40024600584001520015200002001020000115414954624001020020200003002040000200052000020010
40024600584001520015200002001020000115416954654001020020200003002040000200052000020010
40024600584001520015200002001020000115406954434001020020200003002040000200052000020010
40024600584001520015200002001020000115406954454001020020200003002040000200052000020010
40024600584001520015200002001020034105682996354007820054200343002040000200072000020010
40024600584001520015200002001020000115400954354001020020200003002040000200052000020010

Test 3: throughput

Code:

  steorl w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7438

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
30207112774444102170602270412752021274197189119259693236611205218982157442398200062000010100
30204107509409021998002092210950020209189992818825253049110401203702332545624200872000010100
30204108134410182003602098211069021260196784719127433246611307221632434647566207572000010100
30204107821407551992002083510921020426192355518868313088410559206982183643043196912000010100
30204107176401931958502060810585021541196923219190763290411498224442194943275200312000010100
30204106194396831942202026110177020324194913419050183070510496205322047440497195082000010100
30204106460397461928502046110285020682192121018874363138310801211752163342608197132000010100
30204108042400701954002053010437020195192629118989223046210367203252273244341199902000010100
30204108144410442007802096611022020610197764819181263123310724210462124042007198962000010100
30204107649406151995602065910720020144195387919264103037710333202542198943361199522000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4198

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3002511487346622223562426614736246592072347199803238615139692778927381533182241020000010010
3002411434746479224632401614081248732081691200646939167143062801027707541762239520000010010
3002411394646497222382425914296250602071495199825039483144402834328149547992251620000010010
3002411436446667225522411514095244242079403200423338330139162755427874539822246120000010010
3002411394646561223132424814491247882085492200968138976142042814628474552992197520000010010
3002411386546128222932383513745252332087803201210139925147082852629210563252198820000010010
3002511441046908228542405413939256212087949201214740611150032921127936541122211120000010010
3002411373345796220172377914006247182075643200042839009143012803927682537542206620000010010
3002411427046538224592407913994249742075091200010739437144762839628656555582172420000010010
3002411398146419223162410314077244062075912200181938292139002724928044544882226120000010010