Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST4 (multiple, post-index, 8H)

Test 1: uops

Code:

  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 13.000

Integer unit issues: 1.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
72007295832371012009000000000472129276011519913000100080084000100080004000500051904716205246442922629553310130004000800090002000029359294461161001100010004000012040000004000180001307194627003312825319372333138321654542885510001634612846129604000800010002951429387294882935529350
72004294252360080110010125000046892922411152451300010008000400010008000400050005191071589424738294172952031013000400480009000200002945529398116100110001000400000040000004000412412931324492846979320565319336336038272044492878610001622312763129714000800010002938329264293612935029516
720042948923701800500013210000469729380401520613000100180004000100080004004500051911716016247352922929509111013000400080009000200002938629468116100110001000400450340080344000112028691310992466879305834419740333938171753482903410011629212853131494000800010002956329916298452951729455
7200429790238006016010141510004674294881015113130001001800040001001800040085000519017161462468029256294777101301340008000900020000294542944311610011000100040000034000004754000012006571298091586900307224620133336538252951472945110011629912880130484000800010002954829484294752936329445
720042937723710602700026410000467529447441525113000100080084004100080064000500551889716022246742927529378310130004000800090042000029416293442161001100010004002012040060045040000120001287491936942316135119340327838281548472870310011605312905131484000800010002945129432293972943329476
72004294902370040190000000104710293730015097130011000800040001000800040005000518987161292468229225293793101300040008000900020000293532945611610011000100040000120400001140000120001319692926937312334819343323738301947502869410001611812828130084000800010002961429661296912953329553
72004294532370050030000000004756291940015156130001000800040001000800040005000519017161322466129185293523101300040008000900020000293492931911610011000100040000120400000040000120001327095066953311524619404327138221953472872210001602812935130194000800010002933129395294592944929385
720042955923610500400012000014760293230415102130001000800040001000800040005000518967163942475129275295383101300040008000900020000293722939011610011000100040000120400000340000120001311894466913316425419320323238221744472872210001614312830130584000800010002940629448295062945129344
7200429366237008007000010000475629288041518213000100080004000100080004000500051890716228247112921829497310130004000800090002000029254293921161001100010004000012040000004000012000131739340699431653491940233403829947492877810001620612764129234000800010002937329433294812936229390
7200429446236005004000010000470229346341524413000100080004000100080004000500051896716114246532954529744221013000400080009000200002934129331216100110001000400008040010034000404101307993707010316235318804322438332346482834410001554011996123984000800010002903029053288212882629063

Test 2: throughput

Count: 8

Code:

  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.5010

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
96020720154913640100000200082564020024016161552167782511227518010272515132000080104640030320007401093923039210756719002018692004442009293802464036810401432003200166402622007200311600080199549199034218020110099100100800008000010032000003626805032000200232000223400511000226121981158000032000064000080100200238199762200326200033199976
96020420133016130000003200082342020008116161366166882511252698010072227432000080100640000320000400500928531810739191001996582013111998823966834145810401002003200006400002007200001600000198127199739118020110099100100800008000010032000003426163032000210832000223400510900116212015558000032000064000080100201070200277201837200232197738
96020419777316080000101220008424001993521616141615500251131163801007228723200008010064000032010840050092259471075243700200335197350201936380133405781040100200320000640000200720000160000020093119850011802011009910010080000800001003200000090320008001232000223400510900116222013738000232000064000080100199579198576199801199893198706
96020420065516280000016080018694302008791616153816575251127927801007269223200008010064000032000040050092907071074871810199965199764200554389323409841040100200320000640000200720270160000019942419799111802011009910010080000800001003200000342442103200020024032000223400511000117112023548000132000064000080100198782200187200954200933198807
9602042010651612000000320108243901980041616131315818251125190801007275843200008010064000032000040050992930031070298500198673200039198941391823393611040100200320000640000200720000160000020125419999311802011009910010080000800001003200000342522703200020012832000223400510900117222029798000032000064000080100199154201035201264201085202545
96020420311716210000002720009078402012790161388168922511312658010172739332000080216640000320000400504911056110773371001994142001212008213940734015410401002003200006400002007200001600000201078196368118020110099100100800008000010032000000261810320002001132000223400510900116112039288000032000064000080100202482198337197871198017198599
960204199483161200000035000907530201454161616591600925113431180100724958320000801296400003200004005009335664107708820020025620140120037341716341989104010020032000064000020072000016000001998312015091180201100991001008000080000100320000034250980320002003232006223400510900117112012648002932000064000080100200182199719202367197128203269
9602042000821611001024396178010865460202067161616652041811391140087810887258223202408018764095232032440109093347081080780601201370198547197479395912240302104160020032036064096020272081016018001997372020384180201100991001008000080000100320000034263190320002101732000223400510900117111979528000032000064000080100200622197599198617199818199392
960204201244162700000030000905590201304161614081676125112514880100726005320000801006400003201084005009225726108342410020042519791519912540221340899104010020032000064000020072000016000002016692004161180201100991001008000080000100320000034248900320002004432000203400511000117111984108000032000064000080100200272201374200724200648199852
960204200070160600000032000904550200491161616991519825113032380100723481320000801006400003200004005009249220106692200019916420078720047639862339113104010020032000064000020072000016000002000731980521180201100991001008000080000100320000002512803200021019732000223400510900117211999598000032000064000080100200649200534200744201483197874

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.5038

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 66 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
960027197329155600000001530007924811999421616148515752251132678800107253923200008001064000032000040005093306321073810600020063102012321982254060203411411040010203200006400002072000016000002002621991081180021109101080000800001032001415442572203200160101832000216441400050192162220159480000232000064000080010200477199969200893199389199490
9600242002321546100000113210300084567120074216161715173641651126983801557280013202408009764095232032440048592958261087690400119916302018081999604355204238588104113520320480640960207210801602400201554199933518002110910108000080000103201971744240133320016120163200021601410050192162220089880001032000064000080010199252200004198452196563199695
9600242019541616101000024140008337012002041616173117220251123262800117274523200008001064000032000040005093476341077159800020027502012032026364082703412531040010203200006400002072000016000002003262011381180021109101080000800001032001414442606513200160102632000216461400050192172219776380000032000064000080010199785199888199201198915199684
9600242008571606100000042170008790512004471615189816550251124545800107235293200008001064000032000040005092748851088836000120159802005392020774136603380831040010203200006400002072000016000002032882006081180021109101080000800001032001514442605703200160102832000216421410050202162219884880000032000064000080010202125201522198547198529200817
9600241993881593101000012170008666311993371516132016627251122961800117283503200008001064000032000040005092084551069008100019914601987492005394207203389541040010203200006400002072000016000002008861998371180021109101080000800001032001515432596313200161202032000216441400050192162219855580000032000064000080010200667199935199220200453200729
960024200485162210100000190009617312013551616141016301251126880800107271263200008001064000032000040005493246731076090100020159902009602019984042603411981040010203200006400002072081016036002010482000842180021109101080000800001032001514442591703200161101632000216431400050192172219665980000032000064000080010199106200945199006200394200625
96002419753616021000000132170008560311990971616145815773251128994800117295803200008003964000032000040005093150541069464200020017601990961985994175803410381040010203200006400002072000016000002017972006061180021109101080000800001032001415442600903200160001732000216441430050555443420064381218032000064000080010200459198679200147199331199444
960024202048162511000435282810008366522004301616146915896251124093800117227273200008001064000032000040005093237161075631200020116602002612005073997903423581040010203200006400002072000016000001992121997331180021109101080000800001032014217442531603200161101732000216441401050192162221645880000032000064000080010200515198372200969198517202149
960024200650159910101000190009477411990611616163817628251122972800117216373200608001064000032000040005092485771080403400019680602019251999364106103396901040010203200006400002072000016000001999842003741180021109101080000800001032001417442584003200161001632000216441430050192162220084480000032000064000080010200190219060199134199846197415
96002420090715971000100017000792151199216161615421718425112907980010715272320000800106400003200004000509276871108628740002018720199585199492388850339448104001020320000640000207200001600000196042201248118002110910108000080000103200141544249140320016010183200021644141005019217212720117180002032000064000080010200879201225202564203238198917