Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PSTL3STRM)

Test 1: uops

Code:

  prfm pstl3strm, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10041583122915302376160087425100010001000706891594160713023148010001000100015951592111001248222122153224023972198100073116111494100016081620158115941616
10041623122815302363160689225100010001000697131607161913023148710001000100015811567111001251222722303226023832224100073116111505100015811572162116031618
10041571122916302390155588425100010001000694921594156213013147210001000100015841587111001251221922183214023852211100073116111502100015891592161816181594
10041627123015302421155488325100010001000705121585162713153148010001000100015831589111001232224922263216023712237100073116111502100015621604158816081628
10041621123015302370160887225100010001000705571589160513133147310001000100015841578111001234222122163196023942220100073116111519100016201637159815621613
10041612123015302360160892725100010001000697511597161213183147610001000100015851588111001248222022093219024012212100073116111514100016321578161516371637
10041605122915302390159288925100010001000701681571159013053148910001000100015811587111001243220722423225024512237100073116111500100016181605159216211614
10041654123015302371162092925100010001000695371600158813233147210001000100016101596111001224223922343225023922232100073116111510100016091580163316281627
10041606112916302381157586625100010001000690151580163013323144810001000100016171581111001235221822293208023742219100073116111488100015881615161716251624
10041619123015302378159188625100010001000693901561159813273147710001000100015881594111001274224922063226024042201100073116111507100016051618161816291596

Test 2: throughput

Code:

  prfm pstl3strm, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.5776

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202041581911833518133824213157629817252022010244100001010010000131217737059040491257715786157691304131311220100102001000010200100001566915511202011009925471001010010022662226963276502244432261910000131021611156201011710000101001585015791157701562315719
2020415839118335184336243541583098292520217102171000010100100001322667371790424912857156951567813050313238201001020010000102001000015759156112020110099243010010100100225402267632775024245372257810000131011711156031009310000101001577515781157371587415913
202041584611833718033424497158589793252018710205100001010010000133966738815039491272815738158371306531317020100102001000010200100001573016311202011009925341001010010022583226823256900244592270010000131011611156731013810000101001570315735158381580415704
202041564611833418333424486156869828252021710223100001010010000131671736427041491269415830156891300631318320100102001000010332100001570815511202011009925211001010010022748227543256501243642267910000131011611156211011110000101001574915711158701581815845
202041580011734218633824316158069773252018710196100001010010000131950736586030491277815784156721295831305920100102001000010200100001568115511202011009924151001010010022821225233270400242382259610000131011611156381011710000101001588615627158041593815807
202041580311834418133524443157519842252018110208100001010010000131072739793043491271215724159061303331327520100102001000010200100001575715411202011009923591001010010022753226613268100243782261810000131011611155221011110000101001576115896157821579915673
202041592211833718333724248156839806252019610238100001010010000132859738360030491272415868157391299731325720100102001000010200100001575415511202011009925231001010010022667225223268806243032270010000131011611155581010210000101001564315848157651566515729
202041568711834018034624279158979784252017810223100001010010000133578736865038491280115765157591300431315020100102001000010200100001582115511202011009924961001010010022803228083270400243862264210000131011611156411011710000101001580415778158351581315763
202041575411833518033624233157429771252021510196100001010010000132166736594043491288215787157391301331312820100102001000010200100001583615511202011009925481001010010022776228673270900243532261010000131011711156951008110000101001579715793157401573515763
202041579611933318633924386157029830252020810214100001010010000132427736818037491268615762156781308731311820100102001000010200100001563915511202011009925641001010010022701227183269700241412259810000131011611156881010210000101001588215775157181573715610

1000 unrolls and 10 iterations

Result (median cycles for code): 1.5577

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 3a | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | bb | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002415479117137619413742487511558796582520142101691000010010100001287267286340148491254815605157381299131310720010100201000010020100001553115411200211092253101001010231162327333115002485523188100000127200216231545710111010000100101571315709156141558215646
2002415579117137819413732494011554695652520154101271000010010100001298947302700145491253415545156081295631310520010100201000010020100001547316311200211092197101001010230312325533168002485423101100000127200316331542410105010000100101548015641155431555915613
2002415600117136719413782509111552595612520181101811000010010100001300987278530147491253815700155671298431301120010100201000010020100001558816311200211092211101001010231592303433230002462423051100000127200316231554010126010000100101562115571155011548715516
2002415445118137419413762481711547297442520154101871000010010100001313617334230148491249815530154601306931301020010100201000010020100001557016411200211092085101001010230362326433197002493823035100000127200316321544210147010000100101558415477155821560415604
2002415630118137919913692486611556496412520151101271000010010100001301447368880139491256415575154971277231305320010100201000010020100001556515411200211092161101001010229332330933122042480423062100000127200316321537810120010000100101563415576155321567215582
2002415537117137619513782474011558995822520154101151000010010100001358557265750146491254715559155351295931320520010100201000010020100001546815511200211092130101001010232002301433146602482223034100000127200315231536910093010000100101553115494156211566915638
20024155181161375202136724920115530960625201571015710000100101000013026072900301374912468156901557612945313034200101002010000100201000015593154112002110921811010010102297923211333200024873229811000001272003162315577101381610000100101557515659155911543415451
2002415501116137819813752500511562096422520103101331000010010100001296687296380148491247715577156681298131293720010100201000010020100001554015411200211092224101001010231622311133126002493123054100000127200216431542110126010000100101567615751154921549115611
2002415601117137120113772483611565396392520133101721000010010100001312747297020140491257615635154951299031304920010100201000010020100001563015711200211092059101001010230842305733142002474123213100000127200316431551010123010000100101560815557154961550115529
2002415587117137319013722483111553896992520151101811000010010100001306357287850145491243715694156021283131309620010100201000010020100001566215711200211092136101001010231432311533084002484023156100000127300316331539210132010000100101559715570155541556515601

Test 3: throughput

Code:

  prfm pstl3strm, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.5393

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041538011534919235000246411538694122510100100100001001000050072308914912379153941548613836714127101002001000820010024153731213111102011009925381001001002295522922329161246792288010000111718160153110100001001536515309154001543015372
102041527711934819435143247261543793962510100100100001001000350071748114912278153651548413961614074101002001001620010016153851213211102011009925031001001002293522838328681246832295810000111719160153300100001001549115534154181541115394
1020415386115344180356002477515277949525101001001000010010001500718650049122571541315330139156140991010120010016200100161543912213111020110099251810010010022909228983287502472122824100001117171601528720100001001545115359154211542215430
102041547411534619035000247931538093952510100100100001001000150072377004912266154211545713977614050101002001001620010024154491221111102011009924251001001002280822979329321247312287610000111718160152590100001001535515479153751537415382
102041553411534418635500246461542694722510100100100001001000050072076104912407153521536414017614134101002001001620010016153371224011102011009924801001001002293622840329711246012285610000111718160153600100001001540215307153161547715380
102041541311634418135700246371540794532510100100100001001000050072053804912323153961534513883614097101002001001620010016153381210411102011009924671001001002289122997331821247592286210000111719160152680100001001544015446153971540815406
102041540111734718234600247101539293822510100100100001001000850071785904912355154631534513985614071101082001000820010016153601219211102011009924981001001002285922955328900247012292510000111719160153330100001001544815374153941532115374
102041535911634318235600248841545294792510100100100001001000050072057904912218154121543313994614154101002001001620010008153711216211102011009924611001001002295722961328400246452280010000111717160153300100001001525415395154871535815360
1020415396115352179352002471715311943025101001001000010010000500721122049122821539415487139537140151010020010016200100081526012216111020110099238610010010022977228503292028247162290110000111717160152910100001001538215411153471541315455
102041535011534618334900247121531194132510100100100001001000050072153804912316153531545113992614130101002001000820010008153961212511102011009924451001001002290922867329411247132292710000111718160153190100001001539315411153531543015356

1000 unrolls and 10 iterations

Result (median cycles for code): 1.5584

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | bb | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024155871172931472902408615533965825100101010000101000050728963149124651555615646141443143261001020100002010000154941542611100211092700101010222162222632232000239872233710000640216221541710000101559615572156611557815505
10024155871162911462942400815568958125100101010000101000050730758149125421557515579141053142801001020100002010000155691551711100211092678101010222202235332228000240202235410000640216221541010000101559215525155821555015596
10024155931162971452952396615600967925100101010000101000050728389149124571556615611141733142441001020100002010000156001549411100211092730101010222512229832339000239292230810000640216221547210000101557915659155701555615552
10024154951162961492962401115622961125100101010000101000050727354049124731558915587141683143361001020100002010000155351547611100211092696101010223152220932275000239662227010000640216221537410000101547715577156271561115457
10024156051162971472972406415628961125100101010000101000050728382149123941562015629141433143071001020100002010000155071553611100211092710101010223462226832282010240132231310000640216221537710000101557615576154951568815617
100241562211729614829424033155899659251001010100001010000507292160491246915551155211418731422610010201000020100001555215550111002110927071010102225422216323360330239312226710000640216221558010000101560715760155511559315549
100241558411629414629824027155159565251001010100001010000507307450491253515578155491417931432910010201000020100001545115563111002110926511010102234122222322880320240142227310000640216221545810000101560315552154971560415642
100241559911729114829623898156159506251001010100001010000507298820491246615593156441409931429310010201000020100001550815511111002110926371010102226322366322920400240472229010000640216221540210000101552315579156381548615611
10024155711172991482952401015562963025100101010000101000050723144049124381559315586140943143291001020100002010000155641557911100211092626101010222562230132293000240622224610000640216221537410000101559215557156131564315643
10024155921172941472992388015553956325100101010000101000050731233049123941558115564141783141921001020100002010000154021543011100211092729101010223642226632302000240082232410000640216221547110000101553115533156141553415570