Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST3 (multiple, post-index, 4S)

Test 1: uops

Code:

  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 7.000

Integer unit issues: 1.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 3.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | c3 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
660082901723120101810133517700004644292963317263700710003000300010003015301850253366724224200229002895729180842037077302430337051909029486294376161001100010000303306230251003020000001324788916735307305220010317538075254502854210061577812377139203000300010002929929049292132912829430
660042902223200010201313210000455328996031662470001002302130391008302130155050335982404860229022840429117721070353027303770009000296802921211161001100010000302206030160277603000060001292391296913310905119703310238154555532845410081571512386139763000300010002882228959295202963828829
660042936423600000819106510000466529046331700870981009302330091004302430505050330952400040230972856928813131070963009300071059144293142956718161001100010000302926030000250733000060001322189146808302805119710306938135449522920210111571812642139833000300010002883429365294662936828780
66004293862360010015182772184900004707297761316591700010063043302110083033303950053304624096100229992908729644572817070300330007000900029075289071161001100010000300000030000003000000001310695717030319726119330318838041853532815410011539912192132473000300010002863428538285932862028570
66004286172230502000091000047392857601148777000100030033000100030003000500033021240008124119283322852731070003000300370009000287192865211610011000100003000000300000030002902868401324793886941310445219543313538092157552830210001557711992136673000300010002859228797287272883628795
66004287172230606000030000466228636001665170001000300330001000300030005000330432400050227512856228844310700030003000700090002862528585116100110001000030000003000000300009000130519437689631411591953831593810956522826510001551712290138963000300010002883528809286842879628835
660042887222205040000300004606286811016654700010003000300010003000300050003301624000502263528607287643107000300030007000900028813287661161001100010000300009030020023000200001318192796973313225319563309638101654472830210001537812254137473000300010002872028814287442873328668
660042843822309050000200004701287181116709700010003000300010003000300050003304624000612273628521286033107000300030007000900028691286511161001100010000300000030020003002070001323393996982319534819597321438081856492826510001531412436138153000300010002889328723286672871228643
660042880322205060000100004724287473016612700010003000300010003000300050003304724000302275628281287833107000300030007000900028488285081161001100010000300009030000003000000001354495377013322635719478329238131851492813110001564412084137383000300010002848728458286162856828553
66004284822201505000010000467728449311653570001000300030001000300030005000330322400000227422852328987310700030003000700090002875328831116100110001000199300000030000033000260001328091246916315215419627324738201756582816710001505412035134223000300010002864928595288562916128575

Test 2: throughput

Count: 8

Code:

  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  st3 { v0.4s, v1.4s, v2.4s }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
480208120042930000021000363701200331616025564136801002428882400008010024000024000048049955195671944967012002112004312004239987340043560100200240000240000200560000720000120198120042118020110099100100800008000010024018203410202400021052400020340005110117111200398000024000024000080100120053120043120048120205120050
480204120042931000003005255012002716002556616580100247831240000801002400002400004804975519567194223701200211200431200484007434002456010020024000024000020056028072000012004212004211802011009910010080000800001002400000340024000200224000223400051105341111200398003924000024000080100120044120049120048120044120043
48020412004293100000600971101200341616902556748880100246426240000801002400002400004804995523052194850601200241200491200433998434002456010020024000024000020056000072000012020412004211802011009910010080000800001002400000345202400020022400020340005110126111200458000024000024000080100120043120043120058120044120043
48020412004893101000000533001200340168255633278010024505124000080100240000240000480498551956719626580120021120197120043399843400255601002002400002400002005600007200001200431200421180201100991001008000080000100240064034002400020022400022340005110117111200408000024000024000080100120044120049120043120043120044
48020412004293000000000629401200281616125564160801002447572400008010024000024010848049855195671940725012016212004312004239986340030560100200240000240000200560000720000120042120043118020110099100100800008000010024000003600240002108522400022340005110117111200468000024000024000080100120051120043120060120043120044
480204120198931000003005023012002716160255714748010024577924000080100240000240000480498551956719408030120021120043120049399863401305601002002400002400002005600007200001200421200491180201100991001008000080000100240240034002400621052400022340005110117111200398000024000024000080100120051120050120044120043120043
48020412004393100110185006779012018216094485636988013924711724006080178240234240216480962553120519431830120164120350120356401737402325603642002402402401202005602807210801203481203453180201100991001008000080000100240064234100024168000225222400622342005147143111204708007824000024000080100120348120358120350120503120198
480204120196930000003005704012002816160255643678010024190924000080100240000240000480499551971119386620120024120042120042399843400255601002002400002400002005600007200001200491200431180201100991001008000080000100240000034002400020022400022340005110116111200458000024000024000080100120044120044120050120043120044
480204120048930000003006460012002716160255681938010024642524000080100240000240000480499551956719406130120023120048120042399843400245601002002400002400002005600007200001200431200421180201100991001008000080000100240000034002400021022400022340005110117111200398000024000024000080100120044120049120044120044120043
4802041200429310000120001454012003716160255703658010024824024000080100240000240000480499551968719430610120021120042120042399843400255601002002400002400002005600007200001200421200481180201100991001008000080000100240000034002400020032400022340005110117111200408000024000024000080100120050120136120043120043120044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 37 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d0 | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4800281200499310000010019060010120027161602556605580052244080240000800102400002400004800495519567196059301200210120195120042399863400495600102024012024012020560000720000120042120043118002110910108000080000102400151436002400161018240002143614305033003926373712034580078024000024000080010120043120044120196120199120044
48002412249495501100225289704715112035316032556625180010248660240060800492401172401084802745525847193535601201740120052120353401831140032560010202400002402402056056072000012035912005811800211091010800008000010240000205002407221002400022340005020004016393912003980000024000024000080010120051120061120052120052120052
48002412005293110101000210866201200281616025568473800102485422400008001024000024000048004955195671942062012002101200491200433998434002456001020240000240000205600007200001200431200421180021109101080000800001024001414001240016132024000016014005019003916414012004780000024000024000080010120043120043120043120043120043
48002412004296500000000303671112004316160255654978001024898924000080010240000240000480045551976419373600120033012005912006139988340035560010202400002400002056000072000012005012006011800211091010800008000010240000034002400020082400022340005020003817393812003980000024000024000080010120051120048120051120053120051
48002412005896410001000180113260120028161602556454480010247499240000800102400002400004800495519567193516701200230120049120043399843400245600102024000024000020560000720000120043120042118002110910108000080000102400141536102400160121240000163614005020004117412112004680000024000024000080010120051120061120060120051120060
4800241200509641010100021071951120037161662556733980010248992240000800102400002400004800455519860194733601200340120058120049399883400335600102024000024000020560000720000120058120049118002110910108000080000102400141536002400160017240002163614005019003917403712003980000024000024000080010120061120059120053120050120049
480024120051965100000001905461012002816162255650158001024383424000080010240000240000480049551956719355150120021012004312004939984340024560010202400002400002056000072000012004812004211800211091010800008000010240014140002400160117241144143614005020002016383612004780000024000024000080010120044120043120043120043120044
4800241200429640000000030453811200431616025561344800102467932400008001024000024000048004955198841960669012003301200581200573998634003356001020240000240000205600007200001200581200601180021109101080000800001024000003402240002008240002200005020003517391812004080000024000024000080010120052120051120058120051120060
4800241200589641000100121904794012002716160255664338001024713324000080010240000240000480049551966319201760120021012004312004239987340024560010202400002400002056000072000012004212004311800211091010800008000010240015140002400160116240002163614005019003516183512005580000024000024000080010120044120043120043120043120044
480024120042964000000006010667112004316160255657518001024679424000080010240000240000480047551969219306770120034012005212005839996340034560010202400002400002056000072000012005012005811800211091010800008000010240000034002400000022400022340005019001717203812003980000024000024000080010120051120051120204120051120053