Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (post-index)

Test 1: uops

Code:

  ldrsw x0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 23 | 29 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20051041810100007202100004010259215816252000100010001000100052828458241104010406993773200010001000100010001040441110011000100001016024610274024627103827244480075616661037100015151000100010411041104110411041
200410408101000041016000040102511384262520001000100010001000528324582311040104069937732000100010001000100010404411100110001000010260158104620281229103647441720075616771037100028231000100010411041104110411041
2004104081010000760200000501025135147142520001000100010001000527964582411040104069937732000100010001000100010404411100110001000010280147106541120251039312282413075616771037100022351000100010411041104110411041
20041040810100106602600005010250414101925200010001000100010005281645824110401040699377320001000100010001000104044111001100010000101260541031001263110322842948007571666103710001191000100010411041104110411041
2004104081010000510120000701025051241625200010001000100010005282845828110401040700377720001000100010001000104044111001100010002101662391046000039104039439800075716661037100022151000100010411041104510411055
20041040810100003900000060102501212202520001000100010001000528404582411040104069937732000100010001000100010404411100110001000010126187105291122032103820425480075616771037100032211000100010411041104110411041
200410408101000057028000060102586121218252000100010001000100052820469041104010406993773200010001000100010001040441110011000100001020006910515016013101323211400075716771037100022131000100010411041104110411041
20041040810100008402800008010250514162025200010001000100010005284045824110401040699377320001000100010001000104044111001100010000101260611028720045103621233400075716661037100026151000100010411041104110411041
200410408101000074020100051210361761214222520001000100010001000528244582411040104069937732000100010001000100010404411100110001000010260075103930141827104530329400075616771025100029231000100010411041104110411041
20041040710100008000000050102503641925200010001000100010005280845824110401040699377320001000100010001000104044111001100010000102000541029000027104329323640075616661037100028191000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1746

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502097218453810000007170848107122100718318132171337255078540616101274010010000613398271864349685117203571896650693652685010040200100007020010000718343511402011009910010000301001000001001090211454951061826688834224109351322118108261035122715654046486688886010000401007161971637717737160571850
5020471576537100000059808641069611367173677411714802550730406081012740100100006760802717582496883872027719346525236543950100402001000070200100007193435114020110099100100003010010000010010910215846610642257159054823109191183113109261025122717664053685690489010000401007169871674718347167371873
5020471480537100000056807901073621167189278611712612550790406481013040100100006759462709355496854971884716476521736547550100402001000070200100007166835114020110099100100003010010000010010877114748010650265129296626109411302142103261025122715964053694292889010000401007167471844716157181271763
5020471875538100001055208131068011007175980010714642550735406521012940261100006729992713538496872672005717336526536544850100402001000070200100007176835114020110099100100003010010000010010930214947610661260109484636108411293145105261025122715734051699891689010000401007178471941716407199371444
502047194353910000004970830106801100716587841171448255075540644101264010010000676737271047249687517181171670652303653565010040200100007020010000717273511402011009910010000301001000001001089311535111066125379275224109051263121108261025124715364054492891288410000401007177171693717737190171685
502047170553810000005380809107841132716507911171431255080540608101344010010000677095271099349686977190471847652203652685010040200100007020010000717723511402011009910010000301001000001001092711464751062627098954832108841202131103261025122714934050488093090810000401007172371672715717161871779
5020471715539100000053108261069611327168181511716212550710406641012640100100006764492716425496865671973718196511736542450100402001000070200100007179735114020110099100100003010010000010010857115446210679262129307421109001102128106261025122715694049294290899810000401007173371577718917168671795
5020471820537100100056108091073611127188278611713362550805406161012340100100006766502715377496851271923719566533236548750100402001000070200100007156235114020110099100100003010010000010010914113950510637284128784838109021332124107261025122714634054091894889410000401007178271764718387191571681
5020471627537100000054907961069611287143180711714522550790405601013440100100006769922716885496856071845716806498736553450100402001000070200100007181735114020110099100100003010010000010010884115146010704304149253629109021273125103261025122715314050092481697210000401007169271743717537186771983
502047178453610000105360819106881136716827891171615255074040580101434010010000677153270617949686187195871712652033652945010040200100007020010000716463511402011009910010000301001000001001094521714531065626378723229109291342120103261025122715374052489485485810000401007168871598717487176371605

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1609

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500297178953810000204998342520128071796804017131825505954048210151400101000063023627235594968465716007179965043365344500104002010000700201000071820351140021109101000030010100001101096311265041062024610914122221089211421331042520465066713724051691478492410000400107152871630715717175571642
500247173153710001004428502552119271616826017138625505954047410111400101000062917227091024968582717587174065142365430500104002010000700201000071718351140021109101000030010100000101091011504821066425689051521610894122211810122520465046715354053684289287810000400107163971642718027174971697
50024715235371000000504851255212767182980810713132550640404861011540010100006279072716217496841471686716026512336570150010400201000070020100007149835114002110910100003001010000010108931151504106612436932156191090812321241672520665144714474045285489288610000400107166971475715637175771665
500247183753610000005418542576131271636803107123125507004048610134400101000062872527099064968590717947159965027365254500104002010000700201000071458351140021109101000030010100000101090411414881066324869181681910922120212113202520465145715844050087487881810000400107159671524717057166571682
50024715575371000000445830257612847180879300715312550656405581012040010100006295452712806496860171641716926499136534150010400201000070020100007174135114002110910100003001010000010109011137490106212538918138191093311611181382520365145714044047683887682410000400107153771546714957170471653
50024714835371000000520822244012287166383901711902550595404621011340010100006289482710935496855671586715886506336525650010400201000070020100007147735114002110910100003001010000010109281143488106522597907112171092410611261142520465144715604049291892889210000400107156071795716297174971531
5002471489536100100049582624641180716437960071196255059040470101354001010000628991270695749684597149671464651393652085001040020100007002010000716503511400211091010000300101000001010895115248710645255989970191091712911201292520465044713724048478275090610000400107169871622715417168671555
50024717265361000000464835343211327165076700712802550645404981011840010100006301462717098496858971552717516503736533150010400201000070020100007151235114002110910100003001010000010108861144495106452609908112201090812221191262520465144716684046485887288610000400107155471598716207176871579
50024714375371000000535841259213207167681210714032550525405101012340010100006298702708528496849471508718086511036536250010400201000070020100007160635114002110910100003001010000010109081177495106652519929170321089111721231192520465144714324051290277294010000400107171071738716047159371529
50024716745371000000340850253613927158080900713262550580404861011940010100006299732718223496858571775716826501536535750010400201000070020100007174435114002110910100003001010000010108922143485106302449896116251087711711181372520465143714114050486687088610000400107172071578716767163871655

Test 3: throughput

Count: 8

Code:

  ldrsw x0, [x6], #8
  ldrsw x0, [x7], #8
  ldrsw x0, [x8], #8
  ldrsw x0, [x9], #8
  ldrsw x0, [x10], #8
  ldrsw x0, [x11], #8
  ldrsw x0, [x12], #8
  ldrsw x0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3671

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602092988822010000107092083017761161602945080134618362060214625160173801798000080120800154008371298815158492639729362293109490069554160135802248002080752800202931235118020110099551008000010080000010080918323935489084986670109139249678567166514151915458163511151170160029367448007453162810380000801002932829376295222938029445
1602042949322210001006803081517041131802950877836617031752217025160147801598000080100800004007851290811156492626029143293289459039444160100802008000080200800002937735118020110099201008000010080000010080954364205167084937663109144246528540484413245475057342500051101161129123278004669460610780000801002935729303294052947729357
160204292322191000100667108121720127100293517913401773164223092516015680161800008010080000400750129853217049263472947929455932403945816010080200800008020080000293793511802011009930100800001008000001008091216401515008526763879248049568603475013348555071182300051101161129336398006058864210080000801002935429366292752946429583
160204294802201100000680708321736134140293757913481702167023702516016080163800008010080000400814129881216449263042907129236939803966416010080200800008020080000293983511802011009936100800001008000001008093618366535408501061411913824809859308341195426523516330005110116112950924800716596308680000801002933229404294672941229381
16020429414220100010068100804174412214029145790349178416852167251601578015180000801008000040078413014001634926377292712940291900394261601008020080000802008000029407351180201100994410080000100800000100809283141556550858306491191892523286220827125455858883401110005110117112946431800655866139680000801002945629521295472967729571
160204293082212120100656708301696119140294058063551629160021262516014980164800008010080000400775129846214549264242919129185943303928016010080200800008020080000295003511802011009936100800001008000001008120036423540808496167689313845258623067912550284992340400051101161129362368005063064511380000801002946729458293742948429447
16020429447219220010067650812172811525629302799363160316852123251601638015980000801008000040074712944231564926646292492939992710393221601008020080000802008000029579351180201100993910080000100800000100809913541555040850286536898344817858499081404855537235050005110116112936028800575486148580000801002946329423294172948329498
1602042941722120221006822081217441212602939580838618091622196625160164801548000080100800004007941297278160492636329537293569287039418160100802008000080200805042947535118020110099451008000010080000010080921343925443085069657119144646278570584912444135570350800051101171129264438006764960910280000801002946029457296352933729245
160204293092202002100677107961536115100294138043611761179822292516016080154800008010080000400756130857815849263622943129544948003931016010080200800008020080000292823511802011009943100800001008000001008093636391489108525363888684652898570976912249685055362300051101171129274248006561051510980000801002952729562293832925629483
160204293002192000100724208091680130156294058103821693174221342516015080155800008010080000400755130400515249263152952929201933303933216010080200800008020080000293833511802011009931100800001008000001008092636414529908515464399304248808580581313650074913360300051101161129347358005957958212980000801002945629389294992932529382

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3678

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600292993022100063858341720111128294538134121740192023069116038480085800008001080000400292130023001714926570293952939195353966816001080020800008002080000295853511800211094010800001080000010809380422568508503270799541185133858858021344708546836502030011500112932628800486015619380000800102952429370296172963529467
16002429374221000653783717121191202928481142417671670209425160068800798000080010800004002791298007005349262362957629597954239773160010800208000080020800002956335118002110943108000010800000108091604305912085102693689810849558572681813948255271035020000116201129677348007560860911680000800102925029445295142933329525
1600242949222200069098241736128200295228154401708187920512516008180073800008001080000400373128673711684926233294232927192623973716001080020800008002080000293943511800211094710800001080000010809500418503708472169099408246978612184413050465486335020000115001129372418007462455310580000800102954029285292402939629500
160024293992190007231825172010812029356827520190618172253251600718007480000800108000040038212954611168492639229336296119183395671600108002080000800208000029234351180021109581080000108000001080936042057190855407166922765985855837221324677498803502000011600112970131800636725609780000800102938629625295012939429454
1600242954722000066088121712121962951584744018901997213225160070800668000080010800004003551292295005249261382923129296950139735160010800208000080020800002946735118002110936108000010800000108089203945746085414669119322646978631178012948285869035020001116001129451298006754064711780000800102924029242294432939729551
1600242914122100068558681776137108292867944441722192021562516008780071800008001080000400373130466401684926271294592952594703100821600108002080000800208000029512351180021109591080000108000001080903045057510855827495964284928858517701245310571734502001011600112947824800746476399880000800102931129474295362942229430
1600242941722101068718221720108112294978044821773190123722516007380072800008001080000400362131518301744926402294132951994783999016001080020800008002080000294753511800211094510800001080000110809340394565408579769389167449408630484713649555406035105000116001129624228005956062010880000800102944029369294772950029390
160024295432200007221835172812711229685805447177116612036251600658007780000800108000040033912963300065492641829453293309191397271600108002080000800208000029608351180021109601080000108000001080924042254780849136789956765116861388911344946516404502000011600112936726800716095818780000800102940429239292802935429384
16002429448220000734882417281311242951779848719181866225125160078800788000080010800004003261299273117349263472951029469925139670160010800208000080020800002945735118002110958108000010800000108088504005385085314651118982448598622278612250645049035020000115001129207328006270963510280000800102940629316293382934029391
1600242944322000069998481720122124293078034771763171320882516007580069800008001080000400376129811301664926300295232948392743958616001080020800008002080000295643511800211094010800001080000010809420429506408560969911947765331861487291204958504708502000041600112930640800655375989680000800102948529502293272950429478