Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADDL (32-bit)

Test 1: uops

Code:

  staddl w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.001

Integer unit issues: 1.002

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005354603018101420041002200078451059730001000200020004000100320001000
73004345393003100320001000200077601051130001000200020004000100220001000
73004344853002100220001000200077601051130001000200020004000100320001000
73004342083002100220001000200077601051130001000200020004000100220001000
73004342533002100220001000200077621051530001000200020004000100220001000
73004342303002100220001000200077601051130001000200020004000100220001000
73004342233002100220001000200077601051130001000200020004000100220001000
73004342033002100220001000200077611051230001000200020004000100220001000
73004342223002100220001000200077601051130001000200020004000100220001000
73004342363002100220001000200077601051130001000200020004000100220001000

Test 2: throughput

Code:

  staddl w0, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40207603914033520280200552016920005115678959604011020205200053020840009200082000020100
40204600654011120108200032010520002115765956654010420202200023020340004200062000020100
40204600654010620106200002010220002115767956674010420202200023020340004200062000020100
40204600654010620106200002010220002115765956644010420202200023020340004200062000020100
40204600654010620106200002010220002115769956754010420202200023020340004200062000020100
40204600654010620106200002010220002115765956634010420202200023020340004200062000020100
40204600654010620106200002010220002115773956774010420202200023020340004200062000020100
40204600654010620106200002010220002115775956844010420202200023020340004200062000020100
40204600654010620106200002010220002115763956604010420202200023020340004200062000020100
40204600654010620106200002010220002115769956714010420202200023020340004200062000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40027603964021120157200542007420002115476955284001420022200023007340069200402000020010
40024600674001720017200002001020000115466955174001020020200003002040000200052000020010
40024600624001520015200002001020000115479955444001020020200003002040000200052000020010
40024600624001520015200002001020000115497955734001020020200003002040000200052000020010
40024600624001520015200002001020000115476955334001020020200003002040000200052000020010
40024600624001520015200002001020000115483955474001020020200003002040000200052000020010
40024600624001520015200002001020000115493955694001020020200003007440070200442000020010
40024600624001520015200002001020000115493955664001020020200003002040000200052000020010
40024600624001520015200002001020000115495955704001020020200003002040000200052000020010
40024600624001520015200002001020000115487955574001020020200003002040000200052000020010

Test 3: throughput

Code:

  staddl w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7769

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
302051142364642022368240521409523057203871219721343578812870250552411847085208432000010100
302041097644272620813219131209620393196099119106963082910542206652080441108193162000010100
302041070853983819490203481023720725197174519140323142310800211812204443387202212000010100
302041074844036419668206961073621563194674619082713304311580226422089641259196292000010100
302041080594004119710203311033221190197328219225223230911221220292207843531201332000010100
302041081864138320235211481121320941199396419319323187011035216212042440435194302000010100
302041094664250420749217551170421965199725119365133375211889231922213443607200542000010100
302041072604031719716206011056120758196044919055803151510857213162151842518198672000010100
302041069444010619508205981056620988194127619029573189111004215992219243700201172000010100
302041062933989819522203761045520519194757319081543102310605208052189243161200102000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4091

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300251149614748322590248931498024813208361720071523916814367281692909856520224172000010010
300241147774714122669244721433224377210746820282023817113810271232883655253224322000010010
300241140684638722236241511393824595208778620115463868114102277082800454572226362000010010
300241135874598922376236131362224754208446120083713891714176280052724453507223652000010010
300271146314649322242242511421424632206976919960633876714150278672730053394224612000010010
300241138154625422218240361396824770207114319977653907714320279962797654395223712000010010
300241137054597822260237181400125035207072819975193971414694286782731252867223242000010010
300241142984680722467243401414624566208173320069873868514133277852732053464223482000010010
300241137454631822354239641409324697207032319955483880814124280232769753629225322000010010
300241136274603422173238611428624863206373819900113922514379282752711252890222532000010010