Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (pre-index, D)

Test 1: uops

Code:

  ldr d0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01)cycle (02)030508090b0e0f1e202223243a3e3f404346494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op ld/st (7d)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst fp/simd load (98)inst ldst (9b)9da0a1a2a3a6a7a8a9abacafb5b6bbdcache load miss (bf)c2c3cfd5d6ddinst fetch restart (de)e0? int output thing (e9)eaebld/st retires (ed)gpr retires (ef)f5f6f7f8fd
100510568111101105371001010252333241925200010001000100010005073045826110151040104082438992000100010001040104011100110001000010160184107322281449106646653790175516661037100053441000100010411041104110411041
10041040800000195161001410252037051525200010001000100010005071445824010151040104082438972000100010001040104011100110001000010106038107420311472107136652580075516561037100039371000100010411041104110411041
10041040800010112000002010251037012825200010001000100010005069045824110151040104082438982000100010001040104011100110001000010200048105920201447105830651550175516551037100042401000100010411041109010411041
10041040800000184000010102503200232520001000100010001000507224582311015104010408243898200010001000104010401110011000100001020004110733030854105836554550075616551037100031301000100010411041104110411041
1004104080000019714100181025932062025200010001000100010005075445824010151040104082438972000100010001040104011100110001000010140263107150201062106236447470075516551037100028241000100010411041104110411041
1004104070000011021610030102514372122252000100010001000100050746458241101510401040824389720001000100010401040111001100010000102800521050000048106230752520177617661037100038301000100010411041104110411041
10041040800000110200002410251033473125200010001000100010005076245823110151040104082438972000100010001040104011100110001000010400049107412291848108336551610075616551037100032231000100010411041104110411041
10041040800000110221100201025173335242520001000100010001000507064582501015104010408243898200010001000104010401110011000100001038016010540020647105636556390077517661037100038421000100010411041104110411041
1004104080000019318100112102553722222520001000100010001000507384581911015104010408243897200010001000104010401110011000100001000007610693112046104830757800175617661037100044431000100010411041104110411041
10041040800000179190002010251737282125200010001000100010005073845824110151040104082438972000100010001040104011100110001000010240063106330221449107836553340175616651037100051391000100010411041104110411041

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr d0, [x6, #8]!
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1857

retire (01)cycle (02)030508090b0e0f181e1f2022293a3e3f4043494d51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696b6d6edispatch stall (70)71scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a2a3a6a7a8a9abacafb5bbdcache load miss (bf)dtlb miss (c1)c2c3cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
502091223739120020000447081517040112122008812331217922570520503501012410000401001000010000107757946124464655921012168701218381218541151620311554360100302001000010000602001000010000121778121777115020110099100401001000010000110010936011853110709213993736201095711721340155032101761112155750242102011478961000050100121780121726121849121905121902
50204121805913000000044408301736111612192778622121576257049950374101291000240100100001000010784794606477465187201217570121925121774114997031154846010030200100001000060200100001000012175212181111502011009910040100100001000001001095801125311071122412928322010947120212200303210176111218325026298210328761000050100121736121706121847121757122152
50204121759912000000043708251720092121923808331216302570478503621012510000401001000010000107961346073954655303012170601218481220521152940311538660100302001000010000602001000010000121811121910115020110099100401001000010000010010891011051910742207891832181094713331250911032101761112150950236108510829451000050100121893122071121971121926121731
50204121877914100000041308111696110812163580023121495257048750346101311000040100100001000010773634610332465355601217950121852122032114877031153156010030200100001000060200100001000012164712183811502011009910040100100001000001001094211215251070721469123425109401114116105032101761112159050250110710109891000050100121903121922121933121905121913
502041217019121111000458078817281108121860782221215272570475503661012410000401001000010000107814646045894651950012189901219271218801150940311554160100302001000010000602001000010000121758122001115020110099100401001000010000010010899111353410728207993452181091012531221311032101761112165050296108111799741000050100121922121771121727121863122063
502041219559141000000437081217121124121905788231215562570487503501013010000401001000010000107976646078404656832012183001219691217771150400311565160100302001000010000602001000010000121933121784115020110099100401001000010000010010911111351510792211119313621109401304133133032101761112160650254110711559441000050100121793121913122063122005121720
50204121806912100000046008191728110412175979833122635638708735063310162100564400111239111821138695467131747195890123903012428312446311620102261168876619833708101211115868612111331118812426812451130150201100991004010010000100000100109261127529107122168937783610925118312906180389512914212423550584114110599181000050100124940124436124939125201124929
5020412436593611000021430082717360144121982801341214092570460503441013210000401001000010000107872246025924656028012191801219101220361151210311541360100302001000010000602001000010000122011121575115020210099100401001000010000010010947011853810720208994572151097012821230310032101761112149250252107810428851000050100121784121626121783122024121924
502041218139120000000439083117680108121820821321215242570493503321012810000401001000010000107748046011704655681012192801218921218881151600311549660100302001000010000602001000010000121770121774115020110099100401001000010000010010908012053710725213109147621109481304120109032101761112160850254108110589331000050100121912121815121819121926121885
502041219419131000000455086718001108121877794321214662570523503801013410000401001000010000107974846064144657596012199701217671219161151310311551960100302001000010000602001000010000121918122065115020110099100401001000010000010010932010052010742206891436111091712421260012032101761112171250266115410309771000050100121933121680121867121860121802

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1879

retire (01)cycle (02)03050708090a0b0e0f191e1f2022293a3e3f4043494d51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)696b6d6edispatch stall (70)71scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a2a3a6a7a8a9abacafb5bbdcache load miss (bf)dtlb miss (c1)c2c3cdcfd0d5d6dadbddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
5002912194691200000000042808411696212012177180432121419257041850260101291000040010100001000010807064611101465435612195801218141216901151280311538560010300201000010000600201000010000121760121841115002110910400101000010000010109231109530107132081392072361091411631161191703140017820012912163550266106511729621000050010121815121870121947122076121848
5002412191991320000200045608251736111212198478722121741257041850252101281000040010100001000010784834612968465500412186701218881219351153210311553860010300201000010000600201000010000121978121816115002110910400101000010000010109191112530107082091189074301089012631301313031400108200121112174950274105010949921000050010121837122031121818122004121894
5002412180891100000000043908401728110412195281222121595257039750274101281000040010100001000010791674612449465677012193401219531218701151830311564460010300201000010000600201000010000121751121831115002110910400101000010000010109191885501071821111945722710932139312203120314009820112912176150272101110799671000050010121916121991122033122000121922
500241218029130000000004530838171208012193680922121630257042150260101361000040010100001000010793294611519466153912186001218211218141151440311567260010300201000010000600201000010000121984121817115002110910400101000010000010108911115536107031851295380391090613431261690314008820010812156450252102510479461000050010121783122012121805122034121892
500241218419131000010004560829169611361218638012212154225704185026810135100004016110000100001079284460925446522371218280121817122003115204031155096001030020100001000060020100001000012192112183511500211091040010100001000001010936111453510712204149407045109251504119009031400108200151112165250262108910239231000050010121750121862121768121841121826
500241217609120000001004420845161611321220028082212154225703825025410128100004001010000100001078807460222546558331218330121895121880115106031155066001030020100001000060020100001000012194912188611500211091040010100001000001010900011053610689206810197235109241222124038031400782001115121508502509929649471000050010121934121966121936121934122021
50024121895913200000010440082517842108121929790221215972570382502721012710000400101000010000107881646092294662530121991012187212190111513503115625600103002010000100006002010000100001221721219705150021109104001010000100001101086921115471071721489177226109121264125269031400108200111012163650254102211489001000050010121900122002121850121834121901
500241217389132012020004400826170429612189379822121729257045150288101301000040010100001000010787174610374466048012203701219411222001152580201155426001030266100801008260750100001004012206612266331500211091040010100001000011010859411553010717211119281182810921129513023303140098200131212183450290105111749681000050010121867121842121854121937121908
50024122051914200000000501088217122112121914813341215952570385502541013710000400101000010000107795246081794662537122007012181412190411514003115668600103002010000100006002010000100001219131220311150021109104001010000100000101093221095301071320110934864010940136512720160314009820015912164850244106210439681000050010121915121883121927121987121899
500241219939132001000003990813172801161217527843212159225704095027010117100004001010000100001077169460545646649881218250121934121849115248031155476001030020100001000060020100001000012190012182311500211091040010100001000001010910010754510699206109124623109421272124031403140088200141012162750262106410879041000050010122048121901121842122028121876

Test 3: throughput

Count: 8

Code:

  ldr d0, [x6, #8]!
  ldr d0, [x7, #8]!
  ldr d0, [x8, #8]!
  ldr d0, [x9, #8]!
  ldr d0, [x10, #8]!
  ldr d0, [x11, #8]!
  ldr d0, [x12, #8]!
  ldr d0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3667

retire (01)cycle (02)030508090e0f1e1f2022293a3e3f404346494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)6067696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a2a3a5a6a7a8a9abacafb5b6bbdcache load miss (bf)dtlb miss (c1)c2c3cfd5d6ddinst fetch restart (de)e0e7? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
80209294582191010068310829174413011229336791334163716442038251601408014480000801008000040070712872781532938329154293491910531930316010020080000200800002946129416118020110099411001008000080000010080914041147190852216391591074535785329754119476146950005110117112935639800455835469080000801002936629480292312941129229
80204294232190000069250812179212814029388795400158816342285251601528013380000801008000040072112863931472936329380294511932931942916010020080000200800002928129374118020110099501001008000080000010080909040953280851856341192978460685556753135444649000035110117112928437800345515529180000801002931629266294962936229294
8020429297219000006330082317121176829140815299153915912083251601368013880000801008000040073312911771432928629445292221912831927516010020080766200800002960529378118020110099351001008000080000010080926037153220851506931690544524385284876139446952720005110117112936138800355575628780000801002923829391292772930429383
80204294092190000165960817171213511229220837377161318222140251601398015180000801008000040074912912201402937729215293751916631950216010020080000200800002948629411118020110099301001008000080000010080936042153990847277011493432447385849832123457252290035110117112910826800515306259080000801002931929372292852931829237
80204293202200001067100856166412310029280819324167618871952251601408014480000801008000040076312987281462926729347295191937831917716010020080000200800002937229380118020110099391001008000080000010080934041156200850116681296176517685906896126447847840035110117112938423800445625339080000801002935129239292232945729389
80204294012190000066600849173695104292618173591524188821302516015480140800008010080000400749129557615229341293462953219222319227160100200800002008000029538293781180201100995410010080000800000100809510414531108532465410952805679855917381204833538903351101171129489338004259262310480000801002937629333294612936929195
802042940221800011656608581744109116295068383401687169521742516014380143800008010080000400728129880114329359292562926319362319293160100200800002008000029383292251180201100992310010080000800000100808970392530108487369999361344738856508681224614482100951101171129131318005658354710880000801002928929274292982940229290
802042926921900010667508231712124184293158183731741180621582516014580136800008010080000400733128570814629432291902939119078319262160100200800002008000029434293201180201100994910010080000800000100809450391450208460867610895544877855118791334843487500751101171129389418004561255410280000801002938029454292482928829384
80204292642190001066590833171211311629390805340185716021988251601408014680000801008000040072712908701552925529328293491918231940716010020080000200800002937129350118020110099451001008000080000010080934041951690849816901390578509686038809141467751350075110117112960924800365946178780000801002958529404294972931129109
80204294182180001070210829176813612029295799356195216541935251601538015480000801008000040079713029681462961329363292881917531923016010020080000200800002936029217118020110099441001008000080000010080930040557680849316331194448522385948746118494353050005110116112948425800485225999180000801002944529216294882957929478

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3762

retire (01)cycle (02)0305080b0e0f191e2022293a3e3f404346494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)5f6067696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9fa0a1a2a3a6a7a8a9abacafb5b6bbdcache load miss (bf)dtlb miss (c1)c2c3cfd0d2icache miss (d3)d5d6dbddinst fetch restart (de)e0e7? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
80029304852243000108525853170413514030060780199216622024245025160071800688000080010800004004301323035016930037300493007719884320166160010208000020800003021130144118002110928101080000800001080980513125803867895721188242663687903757126679760395303502000011601129953298007978171615380000800102998930191300453007930225
8002430202225300010868580616641231203006279822761906205425352516006780067800008001080000400351132445400662971229826300401984031996416001020800002080000300683011211800211094110108000080000108096136387547987316621890748729487061811131637850833505502000011611129783318006570467214480000800103025229978302783011229859
80024301442262000108783836174412314430403802211316281972281325160062800748000080010800004003071329203005630218302223022520121319957160010208000020800003001130295118002110939101080000800001080955173835829867966401595052648488464748128676158931820502000011601130378358005968868012780000800102997930317301663035930080
80024300532261100108692819171211710830143791229814662169276825160081800858000080010800004004111325491016730120302612988320160320131160010208000020800002995829793118002110939101080000800001080929153196496873446231091146671188225800139709358921803502000011601230029368006968768913480000800103017730059302713022230106
8002429985225200010895180517041211483006280620471769205725982516006680071800008001080000400415132802701662997629889300402008021198561600102080000208000030277301151180021109581010800008000010809233534454628723762012974766994871847341197188532033010502000011601129564248005670065814080000800102997029979299843022330132
80024300322262000108974790169612814030254806213218171821232325160070800768000080010800004003241321228016930153303453016320133320043160010208000020800002982430147118002110931101080000800001080956323425601868116581388346689487624765126702456303734502000011601129983338005970570313080000800103003230053303603029829996
80024301522242000107966788163210514030079824213818011924280325160078800798000080010800004004181313565016530324301612983120159320156160010208000020800002994029784118002110941101080000800001080980303416495880206031393642710188038779132738054773163502000011601129973338005465566913380000800103021230101301263011130345
8002430026224202010832277117361211002993981519931718212825182516007280073800008001080000400381132146601533026630383303732004732003916001020800002080000302833020211800211094810108000080000108098351330616187572678996098707487701767122639146195438502000011601130133408005760663214580000800103002729980301492992030122
80024300252233000109223858171213012830017791221915792087276125160068800808000080010800004003251330631007030146301433018020015320085160010208000020800002993330010118002110935101080000800001081128573226110871106741291682727787450802132730652945103502000011601130115348006068167614380000800103025930328302923044630069
8002430286226320010861080817041431043013182320761945205025862516007280069800008001080000400421133707500622997730608301122002232031816001020800002080000301102976611800211094210108000080000108095054363608287447646788846683387561796129694956435203502000011601130168288007464369712080000800103019829967299083016330259