Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST4 (multiple, post-index, 2D)

Test 1: uops

Code:

  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 13.000

Integer unit issues: 1.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 19 | 1e | 1f | 23 | 37 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
7200729239237030002200000048562851440144301300010008000400010008000400050005192671619062466128574286553101300040048000900020000286452878611610011000100040004804000002140004840013127931369653176135318745324438091649482824410001557512179126234000800010002895928801287182875328706
720042882122301100151000004622287400014510130001000800040001000800040005005519097159901024662286682890131013000400080009000200002882428764116100110001000400008040000004000080001302393726938312995218840314638162050502826910001538512494125284000800010002882028854288082876328888
72004287582240170020000000474828690041481313001100080004000100080004000500051938716180724610288712894531013000400080009000200002914529029116100110001000400008340204434000080001316194536728301465319247325838081549542864510011586912273123284000800010002981829755297992895028821
720042882723901410170000004513293684014434130001000800040001000800840005000518987161404246352852728699101013000400080089000200002879728728116100110001000400004040000004000040001310994186901315885118863310238071445542832910001548712258123134000800010002882828807288612871328737
720042876222201700210000004807286534014465130011000800040001000800040045000519087161401024593287752871631013000400480009000200002873828771116100110001000400008040000004000000001313294096998317094818490318338051247432823110001502011910118284000800010002871528761286892842528657
72004285512210220025000000487828553031420913000100080004000100080004000500051908715971524639286992876031013000400080009000200002878728710116100110001000400000040005201240000000013704936969513129134718624323638161653562817110001499611873119814000800010002858028473287592869428691
7200428794223017001600010147222868140142721300010008001400010008000400050005190971618017246642842828707310130004000800090002000028546286931161001100010004000080400020040000400013270958270423117104818654327138171355522827310001529712036120554000800010002882728635286932854528383
720042863022201900210000014745286614414381130001000800040001000800040005000518977163801524632284562865431013000400080009000200002854328392116100110001000400008040000004000000001304593516850307875519218310038141463652839310001622712723127654000800010002943129191292682933929179
7200429285228016001800000046872929000150031300010008000400010008000400050005188671610012246532905329219310130004000800090002000029282292851161001100010004000040400100040010120001298292336967310175019456320738081457602854810001614312874128044000800010002935129505294712940929349
7200429346228019001800010046872929404151241300010008000400410008000400050005189971614011247232947329357101013000400080009000200002934929374116100110001000400020040000017854000000016513172948969383087115319591336338081654592914110001440412026113744000800010002810028356282322847128308

Test 2: throughput

Count: 8

Code:

  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.4980

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 37 | 3a | 3f | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
9602072017871548101000151700840670199620160138115981251130374801007259853200008010064000032000040050892675541082234602013651993692001384039534062110401000200320000640000200720000160000020060619993111802011009901001008000080000100320014154425212032001601183200021644140005109116112018488000032000064000080100198465201895196894200310200358
96020419816415601121000190081426119913116161473159282511264818010172550632000080100640000320000400500939567010776795019990420051320175039449339900104047502003200006400002007200001600000200671198727118020110099010010080000800001003200141444263161320002009993200021644141005109116111980268002932000064000080100199570200424197608199253199741
96020420061915501200001218008766712003571616142917602251129495801007293673200008010064000032000040064592839791081940402008302006392017264114434111710404750200320000640000200720000160000020078319913411802011009901001008000080000100320075144425622132000200213200021644142005109116111993228000132000064000080100200263200415201611199062201486
9602041986781613100000917008149402002611601433167012511272578010072684432000080100640238320000400500924053910884959019892919950519939341080338545104010002003202406400002007200001600000197131196904218020110099010010080000800001003200150024674132007611163200021644142005122116111995318000032000064000080100199964214943200059197735200065
9602041994221588110053101900903491200089151612171749725112834280102720689320060801006400003200004005009168734107777330200326197899196679403973389621040100020032012064000020072000016000001975911983041180201100990100100800008000010032001515442598403200162023200621444141005110117111993588000032000064000080100198838199998197907201626201445
960204199493162010100014418008637012012561616135817094251129068801017253043200008010064000032000040050093003371064162911991692001112009364334433891310404750200320000640000200720000160000019943019930111802011009901001008000080000100320080144425386132001600163200021644140105109116111996818000032000064000080100197741201650201437199462199741
960204201734161710100101700906681200605161615591706025112724180100726200320060801336400003200004005009307383107360250200251200569198629396123402431040100020032012064000020072000016000001993411997521180201100990100100800008000010032001514422543203201365011763216221643149005143143112016578011632000064000080100199626200370197728198193201487
960204203931162010224452836900817270199216161613261680425112214080100720701320000801006400003200004005009271978107703231200101213503200363383443405501040100020032000064000020072000016000001992511989451180201100990100100800008000010032001415442419603200160153200021646140005109117112022368000032000064000080100199733201697200300199134201382
96020420039916031101000170184418119787000159915832251119469801007257973200008010064000032000040050092108271072876201997971976081989613968733739010401000200320000640000200720000160000020189019915711802011009901001008000080000100320014154324971132000200163200021644141005109116112002748000032000064000080100199962200530201787200053201133
9602041995361608100100018008741011987501616132816317251133706801007270053200008010064000032000040050093050891074871102001312002972014593806334043510401000200320000640000200720000160000020051619976711802011009901001008000080000100320000144423752032001601183200021644140005109117112011048000132000064000080100200939199832199087198110199961

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.4994

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 37 | 3a | 3f | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
9600271995631499000000020008547501993701616154816300251128309800117237673200008001064000032000040005091644651073557602037340199378198674379542244270610400102032012064000020720000160000020051020086311800211091010800008000010320000002687403200020023200022420005019617641966348000132000064000080010201098198845199358198090198248
96002419934114970000101083800181422020138201612971572725112897580011724344320000800106400003200004000509391411106506700198391020178120030841579339660104001020320240640000207202701600000199240197798118002110910108000080000103200600342519903200042036143200022340005020516661990348000032000064000080010200906200000198430198757200764
960024200875157801100021190018758912003881616149316579251130905800107300863200008001064000032010840005092396431082441002012810201199201104405193401551040010203200006400002072000016000002005252000541180021109101080000800001032001415442550713200760032320002164414005019516762005358000032000064000080010199560198009200403202665200055
96002420168415941001001321401082208119930716161533171615211235118001072824832000080010640000320000400050942393610763686120194302000402002714074234079310400102032000064000020720000160000020216419883621800211091010800008000010320014160256781320016001058320002164214005019416441995658000132000064000080010201417199651199690201349201078
9600241993001562101000014000805721199355161612851706725112392380010722618320000800106400003201134000509275029106954131199140020134520025039665338766104038520320000640000207200001600000199996202675218002110910108000080000103200151444253820320016001432000014014105019517662016878000032000064000080010200250201430199801199871200310
96002419859215301011000170008461812015451516137916003251119170800107309403200008001064000032000040019593615881078407401998190201811203504407853410081040010203200006400002072000016006002029431973241180021109101080000800001032007414432508303200161020320002164614005019617651996228000032000064000080010201082202724199603199192202113
960024196977154310100001400086113120074701615141617025112890680010725953320000800106400003201084000509355370107175740199788020102420083938896338077104001020320000640000207202701600000200817200588118002110910108000080000103200151502504903200160117320002204414105019417442007508000032000064000080010200904200963199814200596199161
96002419962915561001011218000870571200186161614771543325112634080010728599320060800106400003200004000509270704108041290200394019857520019438724341115104001020320120640000207200001600000199105201847118002110910108000080000103200151502645123200160216320002164414005019416552002198000132000064000080010200538199337200622199241202036
96002419951515551002000107000951461201263161612861669725112660280011726705320000800106400003200004000509265545107970771201532019938719868139749341975104001020320000640000207200001600000200641200543218002110910108000080000103201941444278820320196002944320180144414805055535451986988124732000064000080010199778199329199599201569200154
96002419935715601000333963690008894912006191616152517887251121519800107254853200008001064000032000040005093003731072130502010490200652201559382013410731040010203200006400002072000016000001994231981321180021109101080000800001032001416442578113200160116320002204614005019617562009388000032000064000080010198007202233200265201637200248