Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADDLH

Test 1: uops

Code:

  staddlh w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005344873019101520041002200077701052130001000200020004000100320001000
73004342113003100320001000200077621051330001000200020004000100220001000
73004342043002100220001000200077621051330001000200020004000100220001000
73004342243002100220001000200077621051330001000200020024004100320001000
73004343513002100220001000200077621051330001000200020004000100220001000
73004342063002100220001000200077621051330001000200020004000100220001000
73004342063002100220001000200077621051330001000200020004000100220001000
73004342223002100220001000200077621051330001000200020004000100220001000
73004342033002100220001000200077621051330001000200020004000100220001000
73004344683002100220001000200077621051330001000200020004000100220001000

Test 2: throughput

Code:

  staddlh w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0058

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206604354025920210200492015220005115564958244011020205200053020840009200082000020100
40204600624010520105200002010220005115560958174011020205200053025140065200422000020100
40204600554010420104200002010220002115646955194010420202200023020340004200052000020100
40204600554010420104200002010220002115639955074010420202200023020340004200052000020100
40204600554010420104200002010220002115645955154010420202200023020340004200052000020100
40204600554010420104200002010220002115638955054010420202200023020340004200052000020100
40204600554010420104200002010220002115647955194010420202200023020340004200052000020100
40204600554010420104200002010220002115649955244010420202200023020340004200052000020100
40204600554010420104200002010220002115645955164010420202200023020340004200052000020100
40204600554010420104200002010220002115649955274010420202200023020340004200052000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40026603044014720111200362005820002115504955004001420022200023002040000200062000020010
40024600654001620016200002001020000115381953824001020020200003007640070200452000020010
40024600584001520015200002001020002115535955314001420022200023002340004200142000020010
40024600554002520025200002001020000115396954054001020020200003002040000200142000020010
40024600554002420024200002001020000115386953894001020020200003002040000200142000020010
40024600554002420024200002001020000115342953464001020020200003002040000200142000020010
40024600554002420024200002001020000115367953734001020020200003002040000200142000020010
40024600554002420024200002001020000115375953874001020020200003002040000200142000020010
40024600554002420024200002001020000115373953844001020020200003002040000200142000020010
40024600554002020020200002001020000115396954064001020020200003002040000200142000020010

Test 3: throughput

Code:

  staddlh w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7434

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
306841203364640722144242631452621586199806019368463308911607226972247444218202542000010100
302041075784049719924205731057220189194822319220773044110353203002051440540194212000010100
302041087644062919922207071051420876196978419324013174410970215072247844247200882000010100
302051109304361221141224711258121024198026319226923200711086217152189843136199712000010100
302041090884209420515215791168220542195219319021263107710636208242095041457197052000010100
302041093964193920478214611143120665193058519069233123410673209312198943347198312000010100
302041081374035719716206411057120103192288019041083028110283201552076241013197642000010100
302041070573953219287202451013020252195250819157883054110390203772130042017196082000010100
302041084903999719483205141052720311187450918713213060610397203792138442268196632000010100
302051075794075120007207441071621089195413319105433213811151218632162042714197542000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4140

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3002511527746669224122425714129249422094155201655739308143792821728326548462255720000010010
3002411477347124223972472714515247482080656200577538975142422815927910539132242620000010010
3002511430546338223702396814056247272081168200574138917142062807028016542882227220000010010
3002611410946188221902399814049244102080086200555638281138852742028072545002195620000010010
3002411441746358221722418614407243122084800200848638075137772718127284529202231320000010010
3002411408946249224802376913610245632079659200438338599140502760629026555952226220000010010
3002411426546424225712385313777242472074322199941237945137102703827888540802240520000010010
3002411462546401226172378413654249562081647200578939305143612820427906540792248620000010010
3002411429446811223422446914462248822084460200897039196143282790828588553182239920000010010
3002411429246359223742398514061241882086687201119837872136972703828534551742228320000010010