Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e1e20222b3a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
20051047800001401900201032093022625200010001000100010005282845833110471048699377820001000100010001000104744111001100010000102871551140111561141130439985571073216221041100043321000100010481048104910481047
2004104781100168141010103389211242520001000100010001000528164584411048106370137812000100010001000100010484411100110001000010230150113360171210611193661055000073216221041100031431000100010501065104910471048
20041059810101921210001031211012826252000100010001000100052816458441104810476993781200010001000100010001047441110011000100001034006311173120810711263541033900073216221041100030261000100010471066104810481048
20041048700001442210601032169559242520001000100010001000528244583411047104870137842000100010001000100010484411100110001000010250040113760241210311262961126300073216221040100033321000100010491047104910481049
200410487111015523104010506964327252000100010001000100052824458401105810606993781200010001000100010001048441110011000100001029804111444021010911163571065572073216221041100048351000100010491049104810491049
200410487000113923005201034139332252520001000100010001000528044584811048104769937822000100010001000100010464411100110001000210200063112601181411811211831076370073216221034100031301000100010481047104910481049
20041047800001782200701032095592925200010001000100010005281645841110481046701378020001000100010001000104644111001100010000100000491104000010711243661064700073216221025100035281000100010501047104910491047
200410468000016022006410329943328252000100010001000100052828458361104810656993780200010001000100010001047441110011000100001030804611202013011011192441086361073216221039100033361000100010491050104710651048
200410478000012700070103399332272520001000100010001000528204583811048104969937822000100010001000100010474411100110001000010180048112101231613211184141014770073216221039100031331000100010481049104810481048
200410478110017300090103110907622252000100010001000100052836458401104810477013781200010001000100010001047441110011000100001012605511182017610711163641013100073216221041100027211000100010491049104810471048

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1854

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
5020971998538000000005848411632316471843792547157925507704063610132401001000061480127280971496855307182271843653497365456501004020010000702001000071830351140201100991001000030100100000100109290156509106662881191942231090813691371032610251117166740536100494491210000401007199171848718327172871839
502047192155910000000583847172031087162083060716622550785406401014340100100006147342727441149686960717947169865215036554850100402001000070200100007188935114020110099100100003010010000010010915114853410675289139264224109431374123103261015611716654051687890291510000401007190171730719067182971795
502047188053700000000542846169622967171682640713872550690406521011940100100006770522725205149685840717557168965138036557550100402001000070200100007178835114020110099100100003010010000010010907111455101064727412944664510974141101251010261015112716424050888289093310000401007182371752720067186671853
5020471900536100000006108351704110471799818307143425506754064410141401001000067691627192711496863407179071952652750365553501004020010000702001000071844351140201100991001000030100100000100108789166539106302871191328401091712261372052610158117155240560104095294810000401007182471811718577188271704
5020471865539100000005338171720217671746829717159525507554064810131401001000067501527177481496868307169671940652320365366501004020010000702001000071748351140201100991001000030100100000100109301149503106472921390440391092112961221312261015111717704056089696693510000401007148871920718957199771960
502047173753700000000578866172802487178480240716712550810406921014840100100006778822716692049687020719007183165179036557650100402001000070200100007159235114020110099100100003010010000010010978116447210687291118804834109271309126106261015111716624055692494892410000401007182971789718117183071894
50204719015372000000057890316722156720997924471652255078540676101404010010000680108272603804968663071874717756527073656085010040200100007020010000718523511402011009910010000301001000001001097201514831068028299455437109751244131033261015111717784057692292698910000401007186171885718737175771776
5020471780539100010005418501720114071849801387155825507454064410136401001000067808627102120496885307177171876651810365602501004020010000702001000071720351140201100991001000030100100000100109400179488106612731395268341094713561350032610151117158640580970906102910000401007192171997719967194072017
5020471777538000000006288121704196718518333471512255076040648101404010010000678269272068914968506071761716436537003654515010040200100007020010000716603511402011009910010000301001000011001090741425031066927121898323910927122101292392610158117162740548856816103010000401007145271878718677178771938
50204717715392202000054380917281288716407812071643255076540616101304010010000678711272474114968870071802720186540003655925010040200100007056610000717953511402011009910010000301001000001001094613164550106502751487698411095313281222272610151117166940588101499096710000401007193671912717767174471965

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1830

retire uop (01)cycle (02)03l1i tlb fill (04)l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f191e1f2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f60696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)cdcfl1i cache miss demand (d3)d5map dispatch bubble (d6)daddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
500297222453913010005130860172031127182381465714472550680405541013540010100006143082730294004968951717557173965265036567550010400201000070020100007184535114002110910100003001010000010110131139518106542661091186401093813081311370025200118507117176740588968102290210000400107191071751718087183671759
5002471688538010000053708272736112071740795737164825506904054610145400101000061328927273630049687637190771686651860365599500104002010000700201000072079351140021109101000030010100000101092411495141065925811952804210943138812410600252007850107719194059291085494210000400107197772012717267183671829
5002471912538010000051108451720110471849800547173325507854057410124400101000061409427242450049663237187471901652280365502500104002010000700201000071820351140021109101000030010100000101090111435081066526313920782910976135613010500252008850812715884057691688099810000400107189771787717957189571858
500247186953801010005780883152021127171481260716352550735405661013840010100006113362726554014968722717707186565206036547950010400201000070020100007188935114002110910100003001010000010109151150506106462578936823710969134614610300252007850787168840568102899696610000400107172771886719957174772092
5002471786538010000048508282712210871658818737175525506604056210144400101000061250027285350149688077185071867652370365529500104002010000700201000071547351140021109101000030010100000101091111405221066426110926802810971142712110400252007850711717154056899895890210000400107185071896718977190271730
500247188853801100005740836173621607178982944716862550655405501013840010100006112522729875014968850717337173565358036557250010400201000070020100007191035114002110910100003001010000010108991145522106202931491240471095712613142102002520068507127150040516107097891810000400107196671802719567169572070
500247180153801000004740853171221167170182261715572550735405901013340010100006133582727775004968840716847183965225036541150010400201000070020100007190735114002110910100003001010000010108972149513106412611691254471094913511129123002520057106107174340524984100897610000400107165171890719037187271901
5002471916537010000053208241712211271845790557158125506554063410129400101000061371227185820049687487171871783654990365391505704018310083700201000071830351140021109101000030010100000101090011495211066626299145239109691297127130600252006850108714864062093093699210000400107185972029718977192371739
50024717085370101000510082516083112718158161027174225507154063010144400101000061164227243120049686357177371865652080365492500104002010000700201000072044351140021109101000030010100000101090911465131064225413906804410937134813310300252007850677162140536854107895810000400107185871839717597180971861
5002471866539010000058608321728211671537831637159525506554052610140400101000061041727236440049685717164571916652490365501500104002010000700201000071847351140021109101000030010100000101089711604961063324910917423710956107711210500252006850107716494054496296285210000400107187271937720227197671918

Test 3: throughput

Count: 8

Code:

  ldr x0, [x6, #8]!
  ldr x0, [x7, #8]!
  ldr x0, [x8, #8]!
  ldr x0, [x9, #8]!
  ldr x0, [x10, #8]!
  ldr x0, [x11, #8]!
  ldr x0, [x12, #8]!
  ldr x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3675

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1602092956222022000715880917041061482937982234017531823205025160150801648000080100800004007591292965164492617929322293169329039331160100802008000080200800002935835118020110099461008000010080000100809813741156338522368911989465069862217091244876573236545110117112928624800645685559380000801002939029278292722937129349
1602042941022030000670082817361131442947680535717821748198625160152801558000080100800004008141289098050492613729437294809413039527160100802008000080200800002936435118020110099481008000010080000100809865639952998542366213984425153857837671294894503155035110116112949829800555145409280000801002929729209293512940129350
16020429456220211006948835173611614429329804323157316472146251601538014880000801008000040076612974900684926279293412943193930394151601008020080000802008000029267351180201100995010080000100800001008096751400565184736692129666646698605974412952225540530351101161129358468006956953210680000801002953929272294482908629299
160204294882222000071478431720125128293788464131791171720042516014280156800008010080000400792129604903949263232943829246923403927716010080200800008020080000293153511802011009936100800001008000010080958373825304855297029968484913862337011274657500219055110117112929425800556175539380000801002927429374292302926429553
1602042944822010000664183117201462642934580737518321871201225160148801548000080100800004007721299186067492626829297292649293039598160100802008000080200800002955935118020110099591008000010080000100809381943351128522070615989544425856817931284679559119045110116112921334800556095139880000801002957529383293592926329457
1602042937822020000673285117201311082952381533217121682225625160172801718000080100800004007671296434057492615329400292809198039230160100802008000080200800002923335118020110099541008000010080000100809784038855388510968699423445308574381513248915820360551101171129490358005664354110080000801002939429455294022952029334
160204294522222000063668451632118922925081135415612008200725160141801758000080100800004007791300129061492655729406295209200039359160100802008000080200800002937035118020110099591008000010080000100809743540058928540169199765451498556275211552445261190251101161129366358005655859510580000801002964729300293822937129350
16020429410222100006633834171214110829391801338154515772047251601618016080000801008000040077813173421674926337292642947894120394151601008020080000802008000029405351180201100994510080000100800001008094519372520084823727109737249768616381312152935116190751101161129410318005057057910880000801002938829282291962933229299
16020429307220100016455822172011814829436913361168417772050251601438017480000801008000040073412951430634926153293942933591910393931601008020080000802008000029431351180201100993810080000100800001008098137390570084774673109745047888571982513647945457363351101171129560318006351755410380000801002923829355293592926229457
160204293182212000067088261736109682919780933416891687204825160151801518000080100800004007691301195054492632629259291959232039616160100802008000080200800002957235118020110099541008000010080000100809671940952858546865713996885043862487161325214571919035110116112929632800626235789780000801002947229519294592950629441

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3677

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)daddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
16002929710220100016570832172810912829377813355179817902387251600708007080000800108000040038313119350514926310293692947092273923416001080020800008002080000292073511800211094310800001080000010809000417544085289665119127652038565081111846044647183050206160672940340800745235538880000800102950329478293772941729436
16002429574220100007201809172812016429479788378172816852158251600728006180000800108000040031113169130574926263294652956791963929316001080020800008002080000296143511800211093010800001080000010809291840755158520967111919784861866018141404998523900350206160462952036800536176108380000800102940729370293582943429338
160024291692191010068018331728114108293427973631883157722192516006180070800008001080000400302131512505649260412927829188916939418160010800208000080020800002918035118002110942108000010800000108091803985960850626379948136499486648818139519849301733502041604329451368005262855810180000800102944329464295662940529374
160024294082211000065618141720127100292018293681747186422232516006380071800008001080000400362131611506049263192955329409945439399160010800208000080020800002925135118002110939108000010800000108093416394596885396727109285048628625487413447185806003502041605329550338005260761211580000800102933729580295942946629585
1600242935822010000686783117361151562925682540516701834208725160062800798000080010800004003541303187063492626529291293859369393801600108002080000800208000029394351180021109331080000108000001080932038956008519266511925785113859917581384538510600350203160432933137800585845009280000800102932429353294562950629509
1600242926422100000627582917281011122944189437216431819217425160062800708000080010800004003301303454056492638529288294789318393831600108002080000800208000029443351180021109301080000108000001080926040050978527566299507850478602675812650234614036502041603429297418005752157110280000800102943129300297752943029216
1600242904722200000678182517121181922935686436515691715234125160076800758000080010800004036321293681051492627029298292959126393711600108002080000800208000029360351180021109411080000108000001080918040554368536568789204654408603678713249294837004502041603429343308007160158010080000800102931129268293482943029491
16002429439221000107002837172812010829476822379175918352125251600848007380000800108000040033312978590674926552294262935193773946816001080020800008002080000294213511800211093710800001080000010809461736757758560367412931745095860068091364667552800350204160432938232800706265409980000800102931529280294382937929401
16002429452221100007049850174410596293147953971632172919652516006080072800008001080000400329129746605249263982931629249928439300160010800208000080020800002932635118002110940108000010800000108093404305056851416681192050539586186783124482148061801502041604429447428006767161011280000800102941429431294252942929433
1600242923622000000716886017441221322929880738117511699220525160061800618000080010800004003091300758063492643829350295789359393241600108002080000800208000029448351180021109301080000108000001080912036754688522666899235455288598181113348435699033502031606629575398006957757810780000800102943629553295942949829536