Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, D)

Test 1: uops

Code:

  ldr d0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 1e | 20 | 22 | 2b | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c3 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10051055800007417103121025161557192520001000100010001000507624582410151040104082438982000100010001040104011100110001000210200069103821178271040486244800731161110371000413601000100010411041104110411041
100410408000074251022810251113020192520001000100010001000507624582410151040104082438982000100010001040104011100110001000010140054102201012401053244317200731161110371000353401000100010411041104110411041
100410407000052190010102501411424252000100010001000100050762458241015104010408243898200010001000104010401110011000100021024074510240700241024366317200731161110371000353301000100010411041104110411041
100410407000062120024102512141241225200010001000100010005076245824101510401040824389820001000100010401040111001100010002101600451027201712381044246273200731161110371000333001000100010411041104110411041
1004104080000731200101025131737122520001000100010001000507784582410151040104082438982000100010001040104011100110001000210220034103933150241067366295600731161110371000363601000100010411041104110411041
1004104080000671600101025013111202520001000100010001000507464582410151040104082438982000100010001040104011100110001000210201207210429100321025415274800731161110371000333301000100010411041104110411041
10041040800006915101010259183142025200010001000100010005077045824101510401040824389820001000100010401040111001100010002101900681039211710301051367284800731161110371000423201000100010411041104110411041
100410408000060131020102510143212125200010001000100010005075445824101510401040824389820001000100010401040111001100010002102000541045101312271047476255600731161110371000333201000100010411041104110411041
10041040800008228002010250151182125200010001000100010005076245824101510401040824389820001000100010401040111001100010002100000791044311312241040306295600731161110371000333901000100010411041104110411041
1004104080000611510041025121205212520001000100010001000507704582410151040104082438982000100010001040104011100110001000210160073103831164291040365264800731161110371000383601000100010411041104110411041

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr d0, [x6], #8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1890

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0a | 0b | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502091219449140000000062208211077601081215288215412145225704815039810151100004010010000100001076184460382746497881217481217901216941150193115353601003020010000100006020010000100001216731217711150201100991004010010000100001100110220115550107052011292748411093314451262030321017611121561502609308528681000050100121979121794121915121732121635
502041218149110000000461608441068801801218828118312154925705295035410140100004010010000100001077819460554546549991219361218271217941150593115456601003020010000100006020010000100001217431217631150201100991004010010000100000100109230118551107132071190866401091113131181060321017611121438502748928649021000050100121655121641121625121893121733
502041216539120000000057308141072011161217778185012138225704935039010133100004010010000100001078071460714746503661216741218331217691148863115510601003020010000100006020010000100001218481218191150201100991004010010000100000100109080117536106852221590848321091112951231050321017611121456502728268607811000050100121647121902121760121831121939
502051218109130000000059208251071201441218317966312166325705055036210130100004010010000100001076905460395246524081217451218021216321150023115283601003020010000100006020010000100001217001215911150201100991004010010000100000100109802119543107201959923424310958130712510120321017611121565502688628667011000050100121664121740121624121824121782
5020412186091211000000570080110744111612178981850121477257047850352101161000040100100001000010776394602212464669512180512191812161711518331153286010030200100001000060200100001000012174812171311502011009910040100100001000001001091221235451071421312910504710911128511924703210176111215325026091810307551000050100121772121693121887121714121835
502041218309130000000053708061070401121217248337212141325704905033610113100004010010000100001079011459813846482461218091216441217791149913115440601003020010000100006049610000100001217801216881150201100991004010010000100000100109140120549107182141492772381092511051291080321017610121340502548488727811000050100121777121658121716121745121889
5020412171591200000004609083910688012812168480640121501257049050382101381000040100100001000010770184607842464825712177612170712182311496331154206010030200100001000060200100001000012173012190911502011009910040100100001000001001090501205631072220712892504110919118512710120321017611121349502549048488301000050100121783121771121747121799121808
502041218909140000000056008291070401361216448084212286563870934506701021110052440971103311060114023646622494703773123665124495124531116354231116997658733368111136111586709011128111591243141244503015020110099100401001000010000010010907011854110714217109064819699109181194122101013872129322123722505628919207631000050100124401124577124414123672124607
5020412388293200011000234832568051071201361240287845212163021070481503421015710050429561099510975112250846557494701686121940121786122012114978311534560100303251000010000602001000010000121681121980115020110099100401001000010000010010932011952210718203129817437109311275134004032101761112176750262107910489701000050100122048121775122041121890121988
502041218479130000000044808020174401001219628143012166925705085034810152100004010010000100001079561460927746565131219111220681220731151363115591601003020010000100006020010000100001218381218051150202100991004010010000100000100109050104555106842101094056371093112441260154032101761112161950278102211469571000050100121951121885122043122006121907

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.1840

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0b | 0e | 0f | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002912207691320000100508264833160821041217637851121559257040950264101311000040010100461000010775474606272464975211218380121848121581115040031155886001030020100001000060020100001000012183812188511500211091040010100001000001010892312151410665194129252622109251433123237314048200491214805026897010029611000050010121924121890121803121984122022
500241219539112000200047708031696210812177179311215122570403502761012810000400101000010000107681846070094658281112189701218091217541150470311546460010300201000010000600201000010000121834121799115002110910400101000010000110109202112549106982171290078151089113031142073140782009412147150232102410448931000050010121814121970121861121888121762
50024121936912200000005070775172821001216757791121489257036750240101261000040010100001000010770794601788465221811219970121781121786115099031154436001030020100001004960020100001000012198312178111500211091040010100001000011010911212054010696188988676241088812131252043140382003412155050252100011348811000050010122056121629121926121807121796
5002412174391320000000502077117122108121785793112147425704005026810127100004001010000100001076791460280646525331122018012159812182311494303115425600103002010000100006002010000100001217071217051150021109104001010000100000101095531045231069820689617216109721294120369314041700741215045025291110749001000050010121802121886121873121749121862
5002412184091220000000433080317043100121613744112135125703405025010123100004001010000100001078402460709246503180121802012204412184911526603115491600103002010000100006002010000100001217501216971150021109104001010000100000101091641175221073620110899803210917118512740331404820044121448502489949359451000050010121855121766121806121601121883
500241217779144010000042708091712310012179178911215472570412502381012710000400101000010000108188046171994659027112183801220351218721149720311545060010300201000010000600201000010000121890121882115002110910400101000010000010109062115516107222031191676211090412831272053140382006412138950250100910598761000050010121837121837121895121824121815
50024121897912200001004150811172021081215597761121223257039450266101551000040010100001000010789154605103465344411218140121895121891115116031155916001030020100001000060020100001000012174612207811500211091040010100001000001010904611251010730196887674161092913441173043140482004712157150230100210839071000050010121759121842121728121874121730
5002412175991320020000423081516882108121828776212138625703675024810112100004001010000100001076485460809146576121121887012199012192411503503115539600103002010000100006002010000100001216141218741150021109104001010000100000101091321085001068721010872681210913131311420133140482004312145450248108510658281000050010121799121885121814121691121809
50024121973913200000004720808168831081218187811121331257037050256101201000040010100001000010792394605913465537511217850121997121781115033031156656001030020100001000060020100001000012183412189111500211091040010100001000001010927311254010688198890370251088513151124013314038200431214625024896510018551000050010121894121902121814121953121721
500241218529134000101044508711744368122013800112152925704185025410136100004001010000100001077745460680446480881121827012228712183111517703115480600103002010000100006002010000100001218381219431150021109104001010000100000101087061105251069420689147415108831213117203314038200431216485024492010179671000050010121830121945121764121812121830

Test 3: throughput

Count: 8

Code:

  ldr d0, [x6], #8
  ldr d0, [x7], #8
  ldr d0, [x8], #8
  ldr d0, [x9], #8
  ldr d0, [x10], #8
  ldr d0, [x11], #8
  ldr d0, [x12], #8
  ldr d0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3664

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a5 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802092998122000000100711082817601211442916487946320872109218925160145801398000080100800004008101297851156291242911129242190553192271601002008000020080000292512948711802011009921100100800008000001008091404384923328496777416942305416857907341474641517400005110117112922632800445414317580000801002920629180292452933529082
8020429334218101000006566846179211429629227886501210020432186251601528013680000801008000040080912898101462944229153293051937531942416010020080000200800002939229128118020110099111001008000080000010080984042054370852517628940365333859127421385000504100305110116112925236800394684387480000801002929429357292692930429295
80204293502180000000064248381736107104292358714082077210818392516014880143800008010080000400745128805913829196292882917819333319142160100200800002008000029047291301180201100991110010080000800000100809080399507008486679710960244762856407661305140504300305110117112939736800435004406680000801002917629314292272932529172
80204291572200000000069618171728107100293448404622114223820912516014680155800008010080000400742129693315329296291832927819267319302160100200800002008000029341291191180201100992110010080000800000100808930440567108527071813950384917862588021294515551000905110117112934432800424424486680000801002923329250293002946029355
80204292802200000100068838431696113124298528564522079201621222516015480146800008010080000400798129503904129196291472952719310319308160100200800002008000029274292881180201100991510010080000800000100809380484576030850517929930804781852687641524974486900605110125112918633800505014339080000801002933229600293252931029292
8020429434218000000006791820173610810829231854438216719882089601604608013780000801008017840164812877020562920329052293661931031939916010020080000202801922933529306118020110099231001008000080000010080928041250720851197739916764974861568441364862518003305110117112938134800384034495180000801002923029268293332923229265
8020429304218000000006801837175210615229314856404208222102100251601418015580000801008000040073312908230412925229391294001923231915216010020080000200800002921529407118020110099191001008000080000010080956194325102085447780139437857168596782013252825503190605110117112949032800454684166480000801002917229267291732937929462
8020429393221101000007049847174412113629240855431191622752024251601468014780000801008000040074712849450392921129389291721923731919416010020080000200800002951729088118020110099321001008000080000010080958214855434085671767109538047738596183414152495071190505110116112934823800405044836980000801002946329321292242922129419
8020429357219100100006611822173610315629286842447195419161954251601428014880000801008000040071812934990422917529168292701953231906816010020080000200800002938129378118020110099221001008000080000010080963194185278085517764148683050408582879411653115560191805110116112923349800434364628080000801002915529445293712937129531
8020429449220111101117143855168013692293378504151998219521822516015580341800008010080356401711128330204729300291102929619113181920916010020080000200800002931729361118020110099301001008000080000010080962194645057085142721139203454518599984414946605564194405110117112943547800514894507780000801002927529333291982924529131

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3679

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a2 | a3 | a6 | a7 | a8 | a9 | ab | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c3 | cf | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002930008220100016706818172012414429603809579184917752109251600588005980000800108000040029512944115129402294932942019449319500160010208000020800002936429492118002110935101080000800000108094920409587485301666139084847968602790413753525502341350209160432929940800436326288480000800102932829403295052964229698
80024295972192000064648121720119112295687965331926189521712516004680060800008001080000400267129645938293432950329492191643193261600102080000208000029504294971180021109411010800008000001080935373515062854076151090548464286249790127542253163637502031604329474248005158462810580000800102954829610295442952229512
80024293712202221066547941736122352293967986141760163621052516005080055800008001080000400319129725453294962955529396194393193961600102080000208000029420295381180021109451010800008000001080953384045266854456481090342522286178779123519855143404502041606429473258004065759612480000800102960529325294402944029580
8002429607220210016363828168010814029549762523174019692263251600518006380000800108000040026813102904429431294872948019366319552160010208000020800002959129548118002110940101080000800000108091140413489585259668891413846848578377713755935189373316502061603429385328004258465810380000800102947829739294322954529466
80024294792212100067678071768119112294517965601543190822752516004380061800008001080000400330129280249294302942529513194713192451600102080000208000029338292271180021109371010800008000001080943364064958852636121388274532585449818125503954003208502041603429539278004760957110480000800102946029540293952929629423
8002429343222200006160822167212512429517782534156819042191251600488005480000800108000040032612954854629250293682949219392319197160010208000020800002938529444118002110920101080000800001108097037376550685333695138527453268595174713754675464383750207160352941130800435805739980000800102947529440294762942629542
80024295182202000070798111664115112294107735161709178522272516006580061800008001080000400253130430138294702923429262195763193751600102080000208000029327294761180021109431010800008000001080916403916093859636961292848516186020738118500451483403502071606629480408004556860810880000800102934129478295592963729416
8002429560220201006867832174412110429273794522150519282390251600608005680000800108000040031013060474929310294472935519569319420160010208000020800002943929390118002110937101080000800000108093635369526785439689895642518086060721127566652083744502061607429503248004759665210580000800102931129444293752980029369
8002429376220200006853814170411413229379766464166819612106251600498005980000800108000040034012951214629394293822940519478319461160010208000020800002940529398118002110952101080000800000108097333436576384562679894880454485825747126477652243706502071603429467338004859659611980000800102938229501293382973229502
800242939522020100684979816961259229431748543153018342139251600538005780000800108000040027513010032929402294662956519432319482160010208000020800002949329317118002110939101080000800000108094337364582085545629988774575086176730139538754353637502061606929579358004067560210880000800102944329436293992944629442