Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST4 (multiple, post-index, 2S)

Test 1: uops

Code:

  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 7.000

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 91 | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
660072881822402600210000000483528441221524870001000400020001000400020005000256013580311237620285892867331070002000400050001000028461286881161001010001000200005020000012000200001316194757036322645919481319338101252512813410001530912414128182000400010002848428539285182876128647
66004286972220180021000243000490228610121528270001000400020001000400020005000256213581010238020284872857331070002000400050001000028588285861161001010001000200246220020022000202101335098996988316065119390322938041651512814610001521412515126912000400010002861228527286192850628639
660042859022211800170001200048762857222151997000100040002000100040002000500025652358091323833028630286293107000200040005000100002857728602116100101000100020000502001001200010221491340495417010330076019410318638071455522815910001541412358124482000400010002854828488284802856328617
660042863822211511181000300498428517001539570001000400020001000400020005000256213580413237470284672869031070002000400050001000028550286881161001010001000200000020010002000150001334096696985321365619482318138061349482815110001498512432126242000400010002868228611285362857128720
6600428622222022002100012300472128641201547470001000400020001000400020005000256083580213238170285312872531070002000400050001000028586286411161001010001000200436120020122001240001344095937016327275719512319238081554542823310001530112665127722000400010002862828631285462863628642
6600428625222013002400012000477228649001523770001000400020001000400020005000256193580113238420285362859031070002000400050001000028693286111161001010001000200334220030052000262101340395116990326055119483314938041948512817210001533612648127432000400010002860728621286002863228597
660042871722312301141006000469628675011536570001000400020001000400020005000256193581010237940285332873531070002000400050001000028672286941161001010001000200005020000012001042201316596936925321935019372325338021448532804810001532412503128012000400010002872128677288302874428685
660042868522211500101003930049002866022154497000100040002000100040002000500025614358215237760284892865531070002000400050001000028683286081161001010001000200005020002002000050001318695476945320074319362326038101054532815610001542812418127912000400010002883328608286192854428657
660042855522201700190006100488629570101523770001000400020001000400020005000256063581112237620285832862431070002000400050001000028688283731161001010001000200336120020222000202301323196117015322445219457319338081645492814010001519112687127252000400010002866228643287182874728595
660042850722111711230000200489828604001549170001000400020001000400020005000256203581613237490283862860731070002000400050001000028535286521161001010001000200000020001002000040001338697566963321395819430317538051653512816210001545512610127662000400010002873928617286682865328699

Test 2: throughput

Count: 8

Code:

  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.2650

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 37 | 3f | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
480207100286775000012510374201007611616213865282560308380100364342160000801003200001600004752314679702556894400101552010083899994210253208785601002001600003200002004000008000001023961020221180201100991001000800008000010016000023211169016000200701600022320005110117111004598000016000032000080100101839100909101103101240101032
480204101942790000002004107199967161622226528256024898010036001916000080100320000160108476007467276157808150010094701009221008161981732059956010020016000032000020040000080000010322999784118020110099100100080000800001001600000010694016006200021600020320005122117111021188000016000032000080100102045100429101087100637102376
4802041018257880100020038819102185161622126445255997268010036033216000080100320000160000475697466146356945750010151101004311014402069032192956010020016000032000020040000080000099922101944118020110099100100080000800001001600000321012301600020205160002232000510911711101392800001600003200008010010188199053100917101867100295
480204100131783000009010422761011210162081705425601659801003620271600008010032000016000047585846850625648432011002990102551100853216453206855601002001600003200002004000008000001016681015061180201100991001000800008000010016000003210276016000000021600022320005110117111012658000016000032000080100101878996069975610189299918
48020410196378100000200401599997516161755637825601749801003613001600008010032000016000047547447141115619408001008480100137101884207513226665601002001600003200002004000008000001016251011501180201100991001000800008000010016000003210309016000210021600022320005109117111013408000016000032000080100102277100808102036100565101755
480204100362778000000004183099688160204656012560247880100363331160000801003200001600004760734650627593237600100653010115010232622525321796560100200160000320000200400000800000102216100594118020110099100100080000800001001600000321144201600022003160002232000510911611100812800001600003200008010010105310211799842100570100946
48020410097778300001250037498999451616203062482560005580100356571160000801003202361600004759344695502553999800100560010009410094021256320832560100200160000320000200400000800600102266100291118020110099100100080000800001001600000010338016000210021600020380005109117111012948000016000032000080100102220102982101267101446100868
4802041025177810000030041237102023160179967662559944380100361443160000801003200001600004758624720611578255000101417010105310165022720322192560100200160000320000200400000800000101914100874118020110099100100080000800001001600000401143701600020002160002240000510911711100872800601600003200008010010188710104099898101561102950
4802041012967890000000043477102169161622547044256002138010035931416000080100320000160000475794469455556029370010176209986610082921724320484560100200160000320000200400000800000995891022551180201100991001000800008000010016000004010152016000000021600022400005109116111009348000016000032000080100100981101410101099101987100102
480204101637786000012300421701002991601916691225601476801003596751600008010032000016000047564646728425725934001011870101897100897217103204905601002001600003200002004000008000001016871012781180201100991001000800008000010016000004210165016000200051600022400005110116111007488000016000032000080100101069100478100719100870101337

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.2644

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
48002710115078111000012100430772100806161620277372556038858001036222616000080010320000160000475127467710056679260010160801014761007112179632128056001020160000320000204000008000001017931017121180021109101080000800001016001212461115701600140014160002144612150190917108100641800001600003200008001086822100575101996100930102374
48002410149876911100015000400882101023161619206120256015138001036331316000080010320000160000475140473406557139310010184201005791008362182432083556001020160000320000204000008000001014001009591180021109101080000800001016001212371016901600140017160002143812150190517959993280000160000320000800101011241023599872210067199765
480024990117661010001800040856299913016198757212560116780010360436160000800103200001600004747514708597555115300101517010237510189221765319955560010201600003200002040000080000010197610235411800211091010800008000010160012120100640160012111216000214012150491541744996398000016000032000080010101329100738100928100655100937
480024101402759100000180004308021015501602059619025599540800103646541600008001032000016000047479947007135716253001017570102333101773206853209855600102016000032000020400000800000100498100361118002110910108000080000101600121238110150160014001416000212381205019041755102022800001600003200008001010059810199410093299865102172
480024100766791001006150004278301009961616211763872560069080010360508160000800103200001600004748134678195556230200101802010173510074621172320774560010201600003200002040000080000010005010233011800211091021338000080000101600001238110140160002008160002232005019041744101792800001600003200008001010145610103210146310060799471
48002410011477600000320004133201006981616204967192560051180010364177160000800103200001600004749524694627565060900100637010032310067820436321120560010201600003200002040000080000010048810025411800211091010800008000010160013133210735016000210121600001238120502005164410168680000160000320000800101025879998510240210273899889
480024100877786100006180003941821010801602170583025604819800103596961600008001032000016000047630846857095729845001013870101753101103225843213125600102016000032000020400300800000102131991651180021109101080000800001016001313381005401600141015160000143812150190317441012058000016000032000080010100826100467101449101107100250
48002410052478210000012000392150100698161620386208255999888001036382516000080069320000160000474694474480455953490010104801010769951821164322102560010201600003200002040000080000099855101550118002110910108000080000101600000321048901600000021600020320050200416641012618000016000032000080010991779967810086799319102293
48002499853783110101216000372032101498161621546826256015778001036199816000080010320000160000474927467702256369430010076401018601002522249632091456001020160000320000204000008000001012311006391180021109101080000800001016000000996001600020021600022320050190417441000338000016000032000080010101848102738100241101928101180
48002410041977510000015001389692102024161521855350256022568001036009116000080010320000160000475611465492857326480010070501029471024631999332130656001020160000320000204000008000009999710140211800211091010800008000010160013123810110016001400201600021438120501905174410165380000160000320000800101017169897910117110252999556