Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PRFM (register, PSTL2STRM)

Test 1: uops

Code:

  prfm pstl2strm, [x6]
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | bb | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100415771232153124041581872251000100010006890411594157412723143310001000100015881569111001242223522243232000241222501000073116111515100016161574162216111597
100416141231163124091575871251000100010006891411595161113073150610001000100015741600111001266222922613246000239122531000073116111510100016081611159716191607
100416141230163024081605858251000100010006862511598162313153148710001000100015881570111001212223922333247200243322511000073116111507100015991600162315861628
100416241130153224231613865251000100010006785911596157313153144310001000100015821576111001215222922613300000245022591000073116111543100016181602160415851619
100415641230163224131600884251000100010006866211605159113123145610001000100015931592111001260223522483229000240022411000073116111561100016251615161116131626
100416011231173024421559885251000100010007032411604158113093146310001000100015821601111001237223322503241000242022811000073116111504100016121628158915831623
100415971231153224101594869251000100010006953411570162812943145110001000100015901597111001217225922533260000244122411000073116111513100016251611161216041622
100415741231163123991574888251000100010006858611571163213013144910001000100015781594111001260223522433253002239922601000073124111531100016181629160516061610
100415921232163224111566859251000100010006909511591157712903144510001000100015861607111001250224622463246000240622611000073116111490100015811612159716211615
100416221232153324361600882251000100010007018611605157912913148510001000100015961580111001223222922533225000241322561000073116111504100015691576159016151598

Test 2: throughput

Code:

  prfm pstl2strm, [x6]
  add x6, x6, 64

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.5708

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020415748117362185358246421565897402520220101931000010100100001316387280053549126961555815602129833131372010010200100001020010000155711481120201100992167100101001002310022815329640245882297910000131011711155211009310000101001574515634157841563515745
2020415669117357177362248381561597412520190102171000010100100001327607358163649126541570915657129973130512010010200100001020010000156111481120201100992331100101001002289522858330270247092298210000131011611154851012910000101001583715652157391571715690
2020415677117365180363247561561897082520166102291000010100100001323737306823549126031568715707130023131922010010200100001020010000156471551120201100992320100101001002294122881329440249252282610000131011711155691011410000101001562815735157411566715671
2020415733117366187365247121584396602520200102141000010100100001319667346874249125761571015727129683131412010010200100001020010000156461461120201100992313100101001002308622849330890248162290810000131011611157521010510000101001573615667156841559815715
2020415663117356184358247111560397772520214102111000010100100001321357342194249125681559315625132343131582010010321100001020010000156571521120201100992288100101001002280822888329070247062293410000131011611155721008710000101001573215668157351565815783
2020415720117365180361249051572997472520205102321000010100100001327997363363449125751560715625129733132302010010200100001020010000156461541120201100992270100101001002294623025329500247012295010000131011611156551010510000101001576115745156641576815675
2020415612118362184358249351574196512520232102111000010100100001326487394114149125321575015630129703131952010010200100001020010000156101481120201100992281100101001002309323101328920248792308210000131011611155221012310000101001558315694156131573715603
2020415724117361187364245831576597242520226102111000010100100001320327308003149124861576215797130133132552010010200100001020010000157601411120201100992326100101001002274223131329940246982288810000131011611154561009910000101001560715699157351569715859
2020415713117359188359247281557798002520190102321000010100100001323737342604449125821559115706129083131672010010200100001020010000156771561120201100992405100101001002274222948330250246982283510000131011711155441015010000101001567115616158391567815745
2020415708116358178356244271555797762520244102051000010100100001319187327723849125921567215612129203131512010010200100001020010000155741481120201100992283100101001002285122978331040248922285110000131011711154861010810000101001573515690157251563615645

1000 unrolls and 10 iterations

Result (median cycles for code): 1.5570

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002415493116362198370248111558696742520127101571000010010100001306647336511384912265155551565712892313137200101002010000100201000015766155112002110921991010010102313723340331890247572324810000127021611153931014710000100101550515577156761546415628
2002415631117370200367249821556795552520175101631000010010100001296647274941464912415156431543412904313101200101002010000100201000015647158112002110921481010010102301623253330040247672328010000127011611154831013510000100101560715560154501557715540
2002415437117369194364249081551897192520127101241000010010100001316097314051464912382155821558412898313038200101002010000100201000015576156112002110921841010010102307923009333090247092318310000127111611153011014110000100101560715632154191554615527
2002415513116365199358248571564295652520181101241000010010100001298837348271484912502154671547113051313145200101002010000100201000015502154112002110922271010010102304623307331130249592291710000127011611155061014110000100101564915595155131568015569
2002415625117362194364248991546897712520169101601000010010100001315907317641424912345155071544512885313064202451002010000100201000015523154112002110922141010010102303122968333160250592317810000127111621154281014410000100101558815553155491555515467
2002415400117372202368250041550197462520145101511000010010100001299247310881454912492155401549512835313046200101002010000100201000015623163112002110922821010010102340323137331481249482300610000127011511153551015310000100101568215553155631551115581
2002415488119365201366249301559495672520145101421000010010100001316867250221514912463156261559713063312985200101002010000100201000015546157112002110921731010010102332223112332990248862314710000127011611154611014410000100101559115448156701560315581
2002415693117366195366247391550795122520154101451000010010100001311887364121374912352156001568212908313045200101002010000100201000015716155112002110922041010010102305023130330060247202305010000127011611154411015610000100101565615468155491568615714
2002415536117369197370247261560495432520160101751000010010100001315667254431434912520155271562212972312975200101002010000100201000015560163112002110922241010010102310523216331860247842311910000127011511153951010210000100101551415523156851555515556
2002415622116366197370247881560495432520163101421000010010100001315557317411324912526154991556713072313185200101002010000100201000015530156112002110921611010010102334523051329250248312329710000127011611155001012610000100101568315486154291566915517

Test 3: throughput

Code:

  prfm pstl2strm, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.5437

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041548911633517333624563154779466251010010010000100100005007195431491243515488154541404171412810100200100162001000815449122541110201100992516100100100227282276032726002456422876100000111719160154170100001001545215453154701536715419
102041542311533417433924566154699504541010010210059100100005007259721491247615401155091401171413010100200100242001002415472122611110201100992593100100100226752272032745002444422739100000111719160153470100001001551115510154331541315376
102041542711633317033024464154119467251010010010000100100005007225771491235615462154511400571417410100200100162001001615437121511110201100992577100100100226822283832799002456622876100000111719160153730100001001546415444154801540215474
102041539811533417334124612153739502251010010010000100100005007248731491232415476154501397871415710103200100082001000815436122431110201100992613100100100227752275432700002457522739100000111719160152670100001001544615457155501544915353
102041541511532817333524524153839443251010010010000100100065007252901491243915463154091402361409210103200100082001000815463121841110201100992597100100100226502281932760002458822825100000111717160153890100001001542415493154881538915505
102041548111533717233624545154899427251010010010000100100005007240351491230215466154611406561408910100200100082001000815301122381110201100992549100100100228052273932797002454222839100000111718160153700100001001538115413154121537615425
102041545111733517233624591153839472251010010010000100100035007230801491232615425154231398671407610100200100162001000815459122121110201100992626100100100227662268232789002463922803100000111717160153840100001001548615448154551548115521
102041541611533617633724553153919473251010010010000100100025007222091491250615524153901399771415610100200100082001000815496122001110201100992551100100100228302268332818002454222753100000111718160153480100001001542515573154871548515443
102041538211633917334124601154589523251010010010000100100025007219631491240215479155011392061413910100200100082001000815469122201110201100992552100100100227002277132731002443722802100000111717160153190100001001547815401154041535815356
102041552711533417632724522154409458251010010010000100100005007220301491237615401155321401661418410100200100162001001615437121551110201100992552100100100227532272932855002459822741100000111717160153620100001001550215499155131537915507

1000 unrolls and 10 iterations

Result (median cycles for code): 1.5562

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | bb | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002415548117293148297239841552194752510010101000010100005072865214912476155451551114150314228100102010000201000015488154941110021109272810101022366222573233802404722291100000640216221538810000101554515607155521549715593
1002415587116290148302239731561896012510010101000010100005072824614912516155201557414089314309100102010000201000015493155131110021109270210101022251222873236302387522260100000640216221545010000101555715526155911555715581
1002415475116295148297240221557796892510010101000010100005072963614912516155791552814187314368100102010000201000015534155281110021109267510101022345222703230302402722254100000640216221549110000101557415586154321561615567
1002415602117294145294240241555997932510010101000010100005072714614912455155741554614089314323100102010000201000015571154841110021109268210101022322222823232802401622275100000640216221541010000101553815575155791556815656
1002415574116295148297240001559696922510010101000010100005072750614912449155731553714171314332100102010000201000015496155521110021109267410101022213223013232602393922316100000640216221538510000101560615662155511559415524
1002415582116294148297239701558895842510010101000010100005072879814912475155781554914187314328100102010000201000015527155501110021109265110101022277223483228102393922245100000640216221549510000101566115463156251558015679
1002415531117296148294242621565895392510010101000010100005072912514912480156641553414052314262100102010000201000015524155071110021109274510101022269223023220602399722268100000640216221542910000101559815538155191554215546
1002415544116297147298239691554296242510010101000010100005072559414912558156761560414211314270100102010000201000015474155091110021109274210101022297223323224702406322260100000640216221545810000101555315524155601557715482
1002415531116291147294239381556896062510010101000010100005073220814912455155681554614124314232100102010000201000015574155331110021109264910101022268222793226212400322315100000640216221544010000101562515580154971560415486
1002415593117296147299239931559696252510010101000010100005073012614912454155441562314146314343100102010000201000015536154911110021109272810101022287223323219402401822312100000640216221539510000101562715661154631558115626