Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST4 (multiple, post-index, 16B)

Test 1: uops

Code:

  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 13.000

Integer unit issues: 1.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.000

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f373a3f464951schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)5e5f60696d6emap rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map simd uop (7e)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd store (99)inst ldst (9b)l1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafl1d cache miss st nonspec (c0)c2cfd0d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
7200728915223100000000048672857540144381300010008000400010008000400050005188871620015024692284542858731013000400080009000200002863028740116100110001000400008040000004000801308496736919323814418659316338041138392819510001540211965122734000800010002857828632285792875928604
72004287202221001000120046952853830145261300010008001400010008000400050005189871596019024644284992852231013000400080009000200002853128574116100110001000400000040000004000001335694166970315004418579320538111140452825210001546712009123784000800010002854328578286532862528560
72004285132211001000120148532866804144861300010008002400010008000400050005190271605019024631285882866331013000400080009000200002860228799116100110001000400008040000404000801332895156936318804118600325038121034362807510001514611964123694000800010002861728656286002860728665
720042861522211100000004716285413414334130011000800040001000800040005000519057162901402467828568287103101300040008000900020000286282855711610011000100040000004000000400000131819594703232260391851932553802834352808610001552512000120144000800010002859328586286152865628585
72004287472222001000000466328583001429413000100080004000100080004000500051896716120120246162846428609101013000400080009000200002848328571116100110001000400008040000004000001338794556980323513218460315538051335372825810001523012144122214000800010002869628680286242869128829
7200428606222000100000048142857440143361300010008000400010008000400050005190871616014024633286672869331013000400080009000200002866128692116100110001000400008040000004000001329495566982317914018634319238081341352813610001525712328123254000800010002863628561286312829628655
7200428513222200100000048032863544143481300010008000400010008000400050005190671634012024638284542863891013000400080009000200002844728510116100110001000400008040000004000801315696447031318413918755318838061042422820610001553912173121494000800010002852128599286642866628580
7200428438221000000000048372852544142801300010008001400010008000400050005191271632013024582284722864931013000400080009000200002868728693116100110001000400008040001064000801306695587041320303918820319438161130302814110001533911909120284000800010002867028518285422867028665
7200428607223100200000048262851744142391300110008000400010008000400050005190171642011024592284552875131013000400080009000200002850728705116100110001000400000040000004000801352698847026321403318599325738111637322817710001483312247124144000800010002872828738285222867028603
720042867622310010000004892285610414384130001000800040001000800040005000519067166001402462628499286293101300040008000900020000284042851311610011000100040000004000000400080133039714706231810381849232333817836292817510001553612166120214000800010002850328620287252855728664

Test 2: throughput

Count: 8

Code:

  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.5033

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f2223373a3f46494e4f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)6067696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafbcl1d cache miss st nonspec (c0)c2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)ea? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
9602072003881547000001502008504612006810161386165922511272168010072405332000080100640000320000400500938508310807496002005300198313198535431133382511040100200320000640000200720000160000019875020104821802011009910010080000800001003200600342589303200021033217420340051112162219957580000032000064000080100200444201531200063200755198059
96020420033615461000100200875761198743161614381636125112569780100723252320000801006402383200004005009352736106962430020144902000941991813762014397461040100200320000640000200720270160000020002820010611802011009910010080000800001003200000342504303200022223200022340051112161219944080000032000064000080100200440198940201881200072197460
9602042010721545000000020088724020149816161648168692511255958010071988232000080100640000320000400500924656510817056001990010200015198927405493421701040100200320000640000200720000160000020108920166011802011009910010080000800001003200000342581803200000003200022340051111171120248780000032000064000080100198858201565202049200104200881
96020420034415630100001502008925102019161616138316332251129825801007287993201208010064000032000040050093140401087109400201672019830720025440758340825104010020032000064000020072000016000001996632016971180201100991001008000080000100320000002531503200021053200022340051102172519871980031132000064000080100200117199800198739203134198222
9602042007701563000000020093957019926416161807165582511285558010072646032006080100640000320000400500924716310727989001990930199815198202377163411341040100200320120641200200720000160000020231220079811802011009910010080000800001003200000342520803200620083200022340051091161219861680000032000064000080100202089197419200491200438200092
96020421566615500010001220091627119965516161576173202511281468010072527032000080100640000320000400500930255710895378001982000199854214185419463387091040100200320000640000200720000160000019945219757411802011009910010080000800001003200000342575803200021003200042340051112171220160380000032000064000080100200647199654201096199646200785
9602042022021556100000120008614812037901601251168322511239658010172591032000080100640000320000400500927703610782610002005710198937199493402573396171040100200320000640000200720000160000019787720136511802011009910010080000800001003200000342613803200020023200022342052591533319977580116032000064000080100197361198947199972201425201033
96020420129315711110162528442008505601989121601476176013611255788010072510732000080100640000320000400500934037210857612002006863201956200825408643367611040100200320000640000200720000160000020061319906611802011009910010080000800001003200000342479903200020023200022340051091162119919780000032000064000080100198173201005199211198696199578
96020520194715530000001220083071020143916161272170412511268398010172533832000080100640000320000400509936167610731079012021430202104200325375403406111040100200320000640000200720000160000020014419896911802011009910010080000800001003200000342479103200001023200002340051091172119924680000032000064000080100201066200761199457201440200752
96020420218215591010003210910610198726160137216672251127283801007244593200008010064000032000040050094014051081159402199641020114119906637188340714104010020032012064000020072000016000002003202015451180201100991001008000080000100320000034245670320002005320002200051111171220053380001032000064000080100200024200734200440201931200458

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.4970

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)18191e1f2223373a3f46494e4f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)5f6067696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd store (99)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafbcl1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)d9ddfetch restart (de)e0? int output thing (e9)? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
9600272010721571000000105631081696119674616161435163072511237648001072412432000080010640000320108400050928891210780233000198943020184220051740211340128104001020320000640000207200001600600199531200837118002110910108000080000103200000422670703200022165320002164314005039101601271971988000032000064000080010202728198907199253198678199481
960024197706154400000160930081511020152601612631638025112590980010727175320060800106400003200004000509332020108218950001981490199158200410381993390591040010203200006400002072000016000001999201998871180022109101080000800001032000004224965032000000532006224200052271217012101987398000032000064000080010199947199787200116196316199707
96002419953915611011004290008906702026111616156416761251119815800397277023200008001064000032000040005591634391087841300020285802005421998445686933866110400102032000064000020720000160065520002820032521800221091010800008000010320000042257190320002102320000242000501961701071992828000032000064000080010196802202234201626200379199636
960024198603153500000054630084610019997816011871671525112106780010725913320000800106400003200004000509322926106731530002001220200531200452390413408581040010203200006400002072000016000002016611997221180021109101080000800001032000004226239032000200232000204200050191216012122155678000032000064000080010203456200816198198197350201484
96002420173315540000004443008983001996581601509170032511299208001072307332000080010640000320000400050924115310929331001198600020091819979138066344058110415102032048064072020720540160180019856119935241800211091010800008000010320240002680823202420046123202422420005066105307142003088005832000064000080010199418207336198138199956199918
96002419856815530000006930090719020009116161376171002511316658001173021532000080010640000320000400050925947110650718001200663020113020004939691341093104001020320000640000207200001600000199031199490118002110910108000080000103200000422649703200020053200022460005019617012121997478000032000064000080010197689199361201798200617200103
9600241985781536000000030084814020116916161453174062511251678001073056232000080010640000320116400050935629110718695410199701320087120020141193340938104001020320000640000207200001600000200993202626118002110910108000080000103200000422602903200020053200022420005019516012122004958000032000064000080010201152202732197717196649198610
960024201621155400000003008343502003950161463160712511302088001072492432000080010640000320000400050929631810907037200200911019826919810739464339579104001020320000640000207200001600000200537200149118002110910108000080000103200000422602503200020023200002420005019101609121998638000032000064000080010198939201736201016200398199200
96002419976815310000002430080657019815016161443163832511260278001073005832000080010640000320000400050921561110884093200199154020053419883738695340604104001020320000640000207200001600000200622202086118002110910108000080000103200000422586903200020053200022420005020752010121992758000032000064000080010201486201344199434200183199734
9600242002151556000000183008235002004060161363165542511288848001072411032000080010640000320000400050921301010710579200200911020086819974542123338725104001020320000640000207200001600000201124199909118002110910108000080000103200000422628203200020003200022420005019131641271998938000032000064000080010200578198676201003198651199828