Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (post-index)

Test 1: uops

Code:

  ldrh w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)0e1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)l1d tlb access (a0)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
200510408000104000601025108583125200010001000100010005282845824110401040699377320001000100010001000104044111001100010001019179103300002710235383064073116111037100028251000100010411041104110411041
2004104070007500010102501372025200010001000100010005282445824110401040699377320001000100010001000104044111001100010001013057102700003110373962356073116111037100019271000100010411041104110411041
2004104080006900010102510151721252000100010001000100052824458241104010406993773200010001000100010001040441110011000100010131781058422383910365783564073116111037100026171000100010411041104110411041
2004104070006013002410251388927252000100010001000100052836458241104010406993773200010001000100010001040441110011000100010170661037901203510233882548073116111037100021271000100010411041104110411041
2004104080001121910401025155346292520001000100010001000528444582411040104069937732000100010001000100010404411100110001000101919410495219126910484162964073116111037100028241000100010411041104110411041
2004104080009717105010250667172520001000100010001000528244582311040104069937732000100010001000100010404411100110001000101606510435116123810254572764073116111037100026231000100010411041104110411041
2004104070006401030102517111092725200010001000100010005282445824110401040699377320001000100010001000104044111001100010001000061103201003110234362156073116111037100030221000100010411041104110411041
2004104080006321002410250138212520001000100010001000528164582211040104069937732000100010001000100010404411100110001000101307310752138183110634893058073116111037100028271000100010411041104110411041
2004104070106819182010251524152025200010001000100010005282845823110401040699377320001000100010001000104044111001100010001017168104211019104010573984064173116111037100023271000100010411041104510411041
20041040800063000312102519536212520001000100010001000528244582311040104069937732000100010001000100010404411100110001000101718010441016182810333982356073116111037100030271000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1876

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e181e1f2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
502097233254020100446079717121112719028200371668255077540608101104010010000614505273291004968754719897180565270365636501004020010000702001000071876351140201100991001000030100100000100109152149476106332751292948201099011311341030261025811718054050011071156109410000401007189171988720197204371945
502047182954031000446078417041136718668020371763255069540616101324010010000613732272615614968809718267190765203365479501004020010000702001000072040351140201100991001000030100100000100109021144468106252481190674291095412311231060261015811717084049210201090110310000401007187671898719947195871876
502047185853820000480080716962112720458032171372255076540620101244010010000613941273657114968856719747189365241365510501004020010000702001000072020351140201100991001000030100100000100109212149466106202671287048221099912331291050261015011715674049611311105111610000401007208071995718827185971884
502047195953811000481077417201132719597841071592255073540592101194010010041614632273200314968743719637203465363365683501004020010000702001000071853351140201100991001000030100100000100109121155483106232499906483311012117112210100261015811717994050410481034106110000401007186171768720297185371942
502047170253710000434083517281132718357981071676255071540584101314010010000615019273243314968805719227192765392365494501004020010000702001000072062351140201100991001000030100100000100109151174507106532571288946291097413521351060261015811714934048810141059101610000401007180472065720457199272105
502047189053820000459081617121140719757961071568255073040668101154010010000615357272280414968916717687185365300365591501004020010000702001000071971351140201100991001000030100100000100108821140492106462621388086251098312631211390261015811717514047211101134113810000401007200371808719297209471965
50204720665392010047509001672214072043762107160125507904061210133401001000061483127341501496897072073717996523736555350100402001000070200100007210935114020110099100100003010010000010010878115947510647262889748171097613321301050264115811717824051610281001106810000401007185672070718207175271972
502047206753820100472081216961116718667871271511255075540608101234010010000614791271553914968876719137183065308365738501004020010000702001000071961351140201100991001000030100100000100108991163504106462641290428351096012131261060261015811718154056010221180107010000401007198371800719167210572058
502047193454010000431081716801112716597800071652255064040612101334010010000614634272632004968764718547219165268365452501004020010000702001000071801351140201100991001000030100100000100109131163501106382711288448201094113311191270261015811717504052411171031108910000401007183771940719547194572003
50204717325391000046107961696112871894773107149525507404062010118401001000061447527301721496875971736719686543836559650100402001000070200100007198235114020110099100100003010010000010010875116049610617269118864632109481263126105026101581171712405209981110112210000401007184771670717257194671891

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1952

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)cdcfl1i cache miss demand (d3)d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
5002972349539300000004583408541744512071811840627172125507064052710146400101000061270427320470496896971859719556563936568150010400201000070020100007219435114002110910100003001010000101095623140542106702541396270411097313031241070025201985711718344054011911210111510000400107205772085719867203771863
500247194953920000000432085217204156720768002271839255069540554101434001010000613135273282204968774719687182065707365677500104002010000700201000072163351140021109101000030010100001010933111765181068628213967783210953132513713170025200885611719004052811271196110810000400107213371946721287187171966
5002471897540300000004220820174411167180879602715802550630405261014140010100006122262726026149688967198971753653223655475001040020100007002010000720863511400211091010000300101000010109072186502106342748931841610925117413427300252006856771522404721040952116710000400107200671951719697198972114
500247202153833030000515084816963140721737991371647255069040590101454001010000612810272748004968772720547173065384365649500104002010000700201005171996351140021109101000030010100001010913101445211065926912945803810935124613112300252008856771811405281069103099510000400107190272164721097200571920
50024722135392000000041208381736312471867805327188225506604051810138400101000061345527314020496889371873721366548036575850010400201000070020100007208535114002110910100003001010000101094612130489106592871495072291093814291321340025200785106718684058410861197117910000400107186671893718997189372095
5002472088540200000005710862173647671927819387157425506604051410125400101000061263427203000496863771690719626571436579850010400201000070020100007206735214002110910100003001010000101092823153486106722781095936341095112521272730025200785116719834055610761063118010000400107208571982719947189271866
5002471625539300000004160874180031167210881062717442550685405141013940010100006159922732368149687657197971963656363656955001040020100007002010000718723511400211091010000300101000010109041614850710642274894586221094913151372030025200785813719144054410711078114510000400107198471951718357188271955
5002471924539222000004210822168851207192383122716702550665405381014440010100006136102729000049689347199471843654273657185001040020100007002010000719713511400211091010000300101000010109291614349110667262139417423109451313131035002520078585717714056011591128112610000400107181371980717147194071913
50024719405392000000045908361728512072011834327175025506604055410135400101000061369327279111496878371867721376556736570450010400201000070020100007196535114002110910100003001010000101092625136515106782811196448271094512241292130025200785117718314054010651112116210000400107196572032719887193872046
5002471970539410400004520847172042807193582532717452550680405301013140010100006144132729273049688137196671985656913655465001040020100007002010000721963511400211091010000300101000010109731715049410677269119677827109561395129107002520068558718454054810621091109910000400107191471952718357193871884

Test 3: throughput

Count: 8

Code:

  ldrh w0, [x6], #8
  ldrh w0, [x7], #8
  ldrh w0, [x8], #8
  ldrh w0, [x9], #8
  ldrh w0, [x10], #8
  ldrh w0, [x11], #8
  ldrh w0, [x12], #8
  ldrh w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3691

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1602093005722120100714180417921121042959182380515881904226525160155801608000080100800004008151307370168492637629537295489412039422160100802008000080200800002959235118020110099351008000010080000110080948373935529085260681129145055298574883112556185189181351101161129509268006656068611780000801002949629431295282945129508
1602042927621920000728182517361181202958980776815471710212125160162801508000080100800004007781290544060492639129612296359302039464160100802008000080200800002943135118020110099341008000010080000010080909173965396085818637149068054918644670512452435129160351101161129474388007165460212280000801002966229520294992950329406
1602042953322010000702084617281141362945381667414301965208225160167801528000080100800004008061305237046492634829435295219341039393160100802008000080200800002937935118020110099421008000010080000110080912164115554085157655169287249678628577112350344947156351101161129552208005265064810180000801002951429526293312951529770
160204293292201000071778291696891322939379175716141668218125160157801708000080100800004007951297506061492622029745294999548039561160100802008000080200800002952335118020110099501008000010080000010080910173795200085448666119218046218662782012747605117133451101161129470298007658759011480000801002969829375295762956629441
160204296262221000065688171696101108293598056811785177522392516015880171800008010080000400876129875205649265012950029365934703935516010080200800008020080000293123511802011009949100800001008000001008092403735504084752659119287451118595075813652224773001151101161129377228005968063312580000801002943129497295162964529545
1602042940122100000699483717201131922947278765114491863204525160169801728000080100800004007941305931058492647129543294119564039455160100802008000080200800002951335118020110099321008000010080000010080922174125288085469674159193850248601076612552085115150351101161129525348006465865412380000801002954229572295012939629519
1602042959322010000672782717761211522945682470117841856209725160166801568000080100800004007761307413063492636229368293379439039466160100802008000080200800002942735118020110099421008000010080000010080893164035506085103644129093449268643068612754425834160551101171129544318005668564710280000801002957429427294952937929570
160204295582213300072188241704102116294207977611663173324712516016180162800008010080000400790129420706049264032956929583928603938416010080200800008020080000295963511802011009945100800001008000001008092934338562508538462711927705537860286751145212534533035110116112959038800526706169380000801002960029505295222955229594
1602042939922020200661182216801181602965181873015731916212925160168801678000080100800004008041305739065492653529466295699273039557160100802008000080200800002939435118020110099411008000010080000010080970194285209085140661129148053378603375412652065661024451101161129546288005661959511280000801002945929617295952944929575
1602042951122210000666383015361251362958479480419362189218225160158801728000080100800004008351297004058492656829486294159394039399160100802008000080200800002956235118020110099451008000010080000010080918184185066085037676158821304765858876851314521532313005110116112937533800716336599080000801002953129624296542954529475

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3681

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)d9daddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
1600292992322020110007276815172812913629489802360175818762285251600748007880000800108000040033613007271624926383293302961894130393011600108002080000800208000029331351180021109301080000108000001080947373805021854486547888445190857417641325082463250035020916016229503318005953259010380000800102949429600293822940629438
1600242943322020000006509846175211815629280809363192519382190251600758006380000800108000040037713066630474926461294152956894390394491600108002080000800208000029475351180021109231080000108000001080957384045380853087222394176495785704808147514455743630502021600222956833800675565698980000800102943129530293282960229479
16002429334220202010069658261736130116293678324131841164222422516005980070800008001080000400293129345907349263602948729628937003942516001080020800008002080000294523511800211094610800001080000010809503637153968556768189487049588626384613247465176370118502031600232940224800596306189980000800102940429391294002944029223
16002429560222200000067818431584125100295778203951718179520162516006980072800008001080000400371130159706249264152942329406942603937516001080020800008002080000294823511800211095010800001080000010809543635756968555164411944785512858687971265159586637035020216006229489358005955963411580000800102947529425293732951129383
16002429330219202000070818621704127124292648243971684170322672516008780080800008001080000400279129844505249264562928429327930203938516001080020800008002080000293433511800211094110800001080000010809753740157158534764910956485350861708421425257561536035020216206229416338005057257210480000800102955929496294042939629363
1600242955422120101006853818179212610829300810382166118392274251600688007380000800108000040032713047340664926315295222960894550393071600108002080000800208000029552351180021109531080000108000001080950343595148852086561092674525286244796133477251773633502061600222942633800585546089780000800102952129435294092951229432
160024295132202001000677785218001281282941481237716701693216525160068800748000080010800004003611293436055492644329492294609402039373160010800208000080020800002946135118002110952108000010800000108097338384536385547677992166511486084752133493251233703502021500262943530800656005229880000800102940029281294692937129387
1600242945622021112006518835171211714429372831393183717082268251600688008080000800108000040032712957430634926361294872937693090394611600108002080000800208000029462351180021109511080000108000001080958373635691852656478915444673859217931385049521437345020616006329392408006058358510980000800102951829387295262941329369
160024294422202011010664682416961371282943983437116371970215725160072800688000080010800004002941299302059492631229497293739550039466160010800208000080020800002933435118002110936108000010800000108097334404540485726664892878517886176772132501054283709502021600362946931800655845789280000800102930429335295112946929352
16002429413220200100069948231736115120294578083791686171920932516007580072800008001080000400322130019405549261912943129441939503943516001080020800008002080000294033511800211095810800001080000010809603738854098520666014948744892856467831345031533236045020216002229379228005561148810580000800102955329421294202947229357