Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (post-index, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200510407000015600020102511111114252000100010001000100052840458241040104069937732000100010001000100010404411100110001000101001581027111401910283658320073116111037100024261000100010411041104110411041
20041040700100580001010250111232520001000100010001000528404582410401040699377320001000100010001000104044111001100010001016015210271216623102853818560073116111037100028261000100010411041104110411041
20041040800011701010101025401111252000100010001000100052844458241040104069937732000100010001000100010404411100110001000100800411027111001910223558320073116111037100024251000100010411041104110411041
20041040700100143710101025612214252000100010001000100052840458241040104069937732000100010001000100010404411100110001000101501501021108010102042483202373116111037100024271000100010411041104110411041
20041040700000328100010251201082520001000100010001000528564582410401040699377320001000100010001000104044111001100010001014002410071122817101624710240073116111037100028281000100010411041104110411041
20041040800000581110101025313382520001000100010001000528364582410401040699377320001000100010001000104044111001100010001012026910371131615103444716720073116111037100034371000100010411041104110411041
20041040800000501400101025011117252000100010001000100052844458241040104069937732000100010001000100010404411100110001000100000321008010013100818510320073116111037100024231000100010411041104110411041
2004104070000075160000102510131925200010001000100010005284845824104010406993773200010001000100010001040441110011000100010000141102121101223102241612560073116111037100036431000100010411041104110411041
2004104080010062810128102512133172520001000100010001000528404582410401040699377320001000100010001000104044111001100010001000004110131110019102241412560073116111037100024261000100010411041104110411041
20041040801101401400001025800014252000100010001000100052848458271040104069937732000100010001000100010404411100110001000100000561014000023101536611560073116111037100024231000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1753

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502097208153911130000059808451074411567166781143714422550810406121014040100100006145342712818004968845718287180765408365469501004020010000702001000071733351140201100991001000030100100000100109082145494106762599912110291091312561282822610025111716144061199287093610000401007168971709717517188171907
502047174353801000000058608081071211167180281443716072550770406481012540100100006740522718196004968759718887173665221365542501004020010000702001000071756351140201100991001000030100100000100109152164516106542498899903410917120513322126100151117184140564886104488610000401007167271812717987187171750
50204714795380200000005690831107203164716918093471330255072540556101304010010000674643272201900496870171765717596524536540250100402001000070200100007170035114020110099100100003010010000010010923612153410661255993482331092913371202282610015111714254050488498682610000401007179871695717937192271799
5020471761537011000000579082710720292719258044371398255072540604101304010010000674724271707200496853971683716956540436557250100402001000070200100007185935114020110099100100003010010000010010892114551410678264893046421096211861292282610015111717014053291291290210000401007183371987717617174871625
502047173753701000000053208391074431047159878733716372550760405881013340100100006768492718739004968718716567183165080365375501004020010000702001000071702351140201100991001000030100100000100109142128486106512627929108421094913741213442610015111716904055688895490010000401007179371711719947179071713
50204716575370100000005500827107122100720208053371628255073540668101344010010000676659271640400496877171672716456508936537450100402001000070200100007176335114020110099100100003010010000010010917113051710649245891424311092811661351642610015111715454058893490083810000401007178071826718247180771829
5020471777537010001000518081810704212071590813437142725507904061610131401001000067604927147280049686957177171883651933653885010040200100007020010000716733511402011009910010000301001000001001091721475261068127810930182271093912961312042610015111715824049295290887410000401007181271671717697187271574
502047181553701000000055908211068017671877791527160825507304059610139401001000067250427163650049686857165671746652593654245010040200100007020010000716843511402011009910010000301001000001001095221475071067328218981124351094712131201012610015111717634053294283485810000401007176471940717687181571788
5020471723537011000000569082610728110871721795237155525508054061610137401001000067634727179540049690357181771757651853654615010040200100007020010000716763511402011009910010000301001000001001093011555171064727018952104421094713361323172610015111716074055285084484210000401007180471696717987175571580
50204717335370200000005340833107281108716567994371524255081540636101324010010000677337272308600496853871979717336521936544450100402001000070200100007184735114020110099100100003010010000010010920113550110638274789588291090812351213232610015112718054055688493289010000401007183471601718297176671873

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1933

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002972305538000044707851696511271867796417169425507104055010139400101000061228027269710496873072038718056526536551250010400201000070020100007205135114002110910100003001010000101088511235681064225612926162221094514851331030025207852571724405241054102899710000400107185871774719517188471939
5002471930538100046508211704110472013792307164125506904052610137400101000061309027284580496877171980719006535336561550010400201000070020100007211835114002110910100003001010000101088111404851065826413919782610904127512210300252048555718074054410971115110610000400107197971831720477210172048
50024718905401100419082917842208718278044271661255069540570101324001010000613178272889014966122719187191265323365584500104002010000700201000072152351140021109101000030010100001010915115450810626254991270211095413841301050025204854271727405321011111192810000400107190572022719977186072045
50024718945381000440083417283268718198144271677255067040550101254001010000612928272852304968989718257186665425365493500104002010000700201000071999351140021109101000030010100001010891216050510693265139337426109301286114103002520585347161740516897109099010000400107202472014718467202871858
500247181354010003960840172051287193382951716852550720405341012740010100006138102728971049689757194971853652173654405001040020100007002010000719713511400211091010000300101000010108841125500106712647906801810944133512410300252048544717294054411191097112710000400107204272009719637200271973
500247182654010004520829172028871959799417163625506654053010126400101000061231627339200496878871928719266533636563750010400201000070020100007187735114002110910100003001010000101093221355091063426912904681910924128612610300252038554716584056810611069113910000400107186371947719737204372109
500247207553900004460812172801087167576740715662550675405341013140010100006119092727396049690037181471990652383655805001040020100007002010000720643511400211091010000300101000010109171150507106702638906922310933128612110300252048543716124054410331083119710000400107225272137716917209972047
500247189153911004430831176841087197980341718472550715405301013640010100006139722726163049689287195271950654873655845001040020100007002010000720923511400211091010000300101000010109922144486106592608956802410940146513310300252048525716944049212211144115610000400107203972012720017201671875
500247195154010003950800175211367194979240718362550620405261012640010100006124492724064049688967184071861655663654445001040020100007002010000718713511400211091010000300101000010108811146494106562579906122510944122412110500252028544716644053610731043105210000400107201371977720677181472055
500247179453910104510822174431087213278832716782550625405421013040010100006128232731574049687707195471803651333655875001040020100007002010000720193511400211091010000300101000010108971135496106432478898762010928133411810300252058544718714050810771107105210000400107182071939720627199472155

Test 3: throughput

Count: 8

Code:

  ldr w0, [x6], #8
  ldr w0, [x7], #8
  ldr w0, [x8], #8
  ldr w0, [x9], #8
  ldr w0, [x10], #8
  ldr w0, [x11], #8
  ldr w0, [x12], #8
  ldr w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3676

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | 09 | 18 | 19 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602093004422212007002818164813114429271793333179917252029251601778016880000801008000040074112904681714926275294272948994120394041601008020080000802008000029206351180201100993510080000100800000100809402042350488471267817884122473686646782128540445900951101161129767298005660861712480000801002936929422292102954529288
1602042949422200006599818170411696292307973661925208521472516015880173800008010080000400775129902616349262112927229483915203941016010080200800008020080000291543511802011009936100800001008000001008093203936145857807011393266529585918780124514954740351101161129504428007847352711880000801002938429350295742937429255
160204295152190000718981117289888294598213541806172321632516015680166800008010080165401628129970115949264212933029311923903929616047180200800008020080000293633511802011009927100800001008000001008091203735803854537381591954548385761768143521655053051101171129510338007252654911680000801002966929410293292954129603
160204295132210000702381916961051002925881239317741895204858160478803368013080229801614016781293578167492632329691291329242561993731601008037280173802008016529304351180201100993210080000100800000100808660420555685260625108905047688567578013345824827375131125112945622800525474999080000801002950629381294272932729437
16020429461220000064368111504138104291848213971719182618792516016380169800008010080000400826130181316049261292948329345931103937716078680200803428020080000295393511802011009936100800001008000001008088803935873859246881389540501385854799129492450410751101161129430248005654851810680000801002943229265294532948829454
1602042953622200006999799176010992294468053611757181922492516016580151800008010080000400804130210217149262722961229317921503943716010080200800008020080000293763511802011009930100800001008000001008091303905180856286281493136546585467760121449841863451101161129114228007955955913280000801002925429301291162911329530
1602042944921800007047817179213013629144783361168117042147251601568017080000801008000040080213042521484926311292932948493560396301601008020080000802008000029545351180201100992310080000100800001100808890370518685347683119194851308628183613551534754045110116112954138800726004809680000801002957329351293602935329347
1602042947222100006790793153612084294148103571827195619612516017780149800008010080000400816129857016649262912945229429928703938216010080200800008020080000294893511802011009932100800001008000001008093004145493855446591087532512586508837117457050420351101161129574298005743352711080000801002941729264291122947329225
1602042927021900007018813170411119229478799364183118622298251601468015880000801008000040083913049251574926377291622929489960392411601008020080000802008000029308351180201100991610080000100800000100808830387603185866651119243854688622181415352464964095110116112934426800595084929780000801002981829408295352940129322
1602042958522300247528791171210510029261807344203819492068251601608015880000801008017740077613102711704926398293492932993240393151601008020080000802008000029330351180201100992810080000100800000100808780422579785187686139093854468655380012949474691035110116112933527800626125259580000801002966829657297572926129470

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3675

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160029298752221100074340784100760991962954178862117691837217925160082800618000080010800004003841305451006049265902926829485942703941216001080020800008002080000295103511800211092610800001080000108092338332580908578961513903525573863579371435355535652000502011601129577258006459155212380000800102942529457293462942429466
16002429690220330007416264803100696132188296418195991718205321162516008080085800008001080000400356129778700774926341296202973195620189640160010800208016580020800002968935318002110922108000010800001080987594286403308563862412879326084863177941334894558856000502011601129823378008258454210080000800102942029534294632962129647
1600242944321930100765808101007201023002955081976820391734221525160087800798013080010801734011591303128006949264272959929478956303953216001080020800008002080000295903511800211092610800001080000108092756354527908554061811924745794869317681215460497473050502011601129866378007953053611280000800102954529564296022945029440
160024297832203300072980855100648981642951779059918862075207825160074800678000080010800004003211307414007149265222969429631960403951816001080020800008002080000294173511800211092910800001080000108094149331567608606964716869505488865357231375560489051290502011601129916268005760050110380000800102947429603293282945329384
1600242965622230100765308201005929332429563764680209620682239251600738041280000800108000040037212932520054492637229732293419516433961416001080020800008002080000294213511800211091610800001080000108103461395522808545769712878466945864297401225297520049630502011501129513288007256453511180000800102923029520295472967429591
1600242962822133300730707651006161242562935675367817362058232025160076800798000080010800004003321307477017849262952951029762951003967216001080020800008002080000293723511800211092710800001080000108095852427626008596964111889425641864398721285330525653030502012501129458268008559555610480000800102970029527296622980929741
16002429449222400107035084310053696922965178668921292133215325160084800688000080010800004002961310611007249264252960429447932103948816001080020800008002080000295923511800211092210800001080000108093366329570708621559911915905380863128151235439491570060502011601129592418005752752511480000800102947829711294642961929653
16002429540222400007180084810060811496295378466141825194023482516007480070800008001080000400351130326000684926448296892971496230394961600108002080000800208000029533351180021109271080000108000010808841734652660852606311193034595086113736128556949770390502011601129889348006050648610880000800102941529402297222973329588
1600242947522220000699108021006881182282945580561218732037208325160068800818000080010800004003361307322006449265152936329551965503960416001080020800008002080000295833511800211095110800001080000108089728352556608570860013890265504857977671435027527019050502011601129549208007358151010580000800102950029445294812932529498
1600242957622020000757508801006641112162946277667116261808211725160072800668000080010800004003331297606006849263162941329438943403942316001080020800008002080000293303511800211093510800001080000108096053411516208557362716916345897868766751325685473852330502011601129756408007052648310780000800102947929652294752976529178