Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (post-index, 32-bit)

Test 1: uops

Code:

  ldrsh w0, [x6], #8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e1e2022242b3a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
200510408121073221005121025041164225200010001000100010005280445824110401040699377320001000100010001000104044111001100010000100890881073716428291031708335670073116111036100040341000100010411041104110411041
2004104081110433020050102502153225200010001000100010005282445823110401040699377320001000100010001000104044111001100010000100790771041004214271051454276262073116111036100042391000100010411041104410411041
200410407101070240003010251936521252000100010001000100052836458241105010406993773200010001000100010001040441110011000100001039719810611312820361050458272660173116111037100033411000100010411041104110411041
20041040811129422100301025223882325200010001000100010005282445824110401040699377320001000100010001000104044111001100010000103880811057802624441061506385671073116111037100023121000100010411041104110411044
20041040811109500003010250244172520001000100010001000527924582511040104069937732000100010001000100010404411100110001000010267141104140286361056578384074073116111036100025201000100010411041104110411041
200410408110090260002010251223632252000100010001000100052820458241104010406993773200010001000100010001040441110011000100001008701001037002320341051528364871073116111037100043241000100010411041104110411041
2004104081010892300020102515341026252000100010001000100052824458231104010406993773200010001000100010001040441110011000100001027190611073702220271047407234074073116111036100040201000100010411041104110411041
20041040711005521000301025252610312520001000100010001000528244582411040104069937732000100010001000100010404411100110001000010077190105440326261018517284861073116111037100029311000100010411041104110411041
200410407101072240003010250115232520001000100010001000528364582511040104069937732000100010001000100010404411100110001000010346155104110148241040407365671073116111036100033321000100010411041104110411041
200410408111092000010102503432025200010001000100010005283645824110401040699377320001000100010001000104044111001100010000102091551043151270331054539417272073116111037100026251000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh w0, [x6], #8
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1871

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e2022293a3e3f404d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
502097194353810000443849172021207173377371805255075040612101284010010000613250272716004968846719657174665263036573950100402001000070200100007177635114020110099100100003010010000010010977313349510666268118955223109361241138203261025811716134052411531040102110000401007189371763719927189872277
5020471898538000004878841680314071729782717672550735406041014440100100006164492718638049686657195272026655160365776501004020010000702001000071780351140201100991001000030100100000100108971153526106512641290417267109131282125103261015811714544054010451091104010000401007186972178720127183771836
502047198953910000495869169611527182379771614255066040628101264010010000614143272479204968891718377168765179036547050100402001000070200100007168935114020110099100100003010010000010010910315253610685280118823029109121252114105261015811716144043211251053103810000401007195171908718897201971844
502047182953810000486799173611087182480571545255073040616101474010010000614697272384514968689720167166865285036563150100402001000070200100007179935114020110099100100003010010000010010913115849810641258139616813109371272127204261015811717434057310231109113310000401007206171757718177177771775
502047201653921000481862161611167195981271471255076540548101304010010000613400271957304968969718557170465339036575350100402001000070200100007182235114020110099100100003010010000010010927214451010644264158986832109401280140133261015811716984048410681030105410000401007189571915719057184371809
502047169954010000409831271211407183380871575255076040604101364010010000613166272557604968468719727176765312036554150100402001000070200100007192335114020110099100100003010010000010010896215348310631275149877226109221172123105261015811716654054011231081116610000401007189771868717697199571812
50204718635381000045884516961116718607837144925506954058410118401001000061472927206580496869971918718786527103654285010040200100007020010000718593511402011009910010000301001000001001089321615171065326612918343510984126311811526101581171644405401041104495010000401007210171869717997189071787
502047181753820000460784175211607189578871658255082540656101194010010000615427272198304968887721297195065358036554250100402001000070200100007186335114020110099100100003010010000010010906017149710600265129072634109461362127115261015811716784047210801140110810000401007196371965718337209171860
502047167353810100459810269622167188378371555255069040628101254010010000615740272921804968682717757194765245036551050100402001000070200100007189335114020110099100100003010010000110010922415449010687267119784225109681312125135261015811716594054011691083106810000401007194072052718867177771900
502047186054010000440842160021567192978071562255075540588101364010010000615731272777204968844717057191365259036545550100402001000070200100007167435114020110099100100003010010000010010937314750110672256168928214109201232124105261015811717254053610541061111610000401007210471891721217199871950

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1819

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e2022293a3e3f4043494d51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3branch cond mispred nonspec (c5)branch mispred nonspec (cb)cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
5002972096539000002519861252802927178384273715572550690405181015040010100006313612712595049687560717347194365524365519500104002010000700201000071936352140021109101000030010100002101090801335351067326013920112401093814761200019002520056511716454054483093682210000400107182871857717847170671799
5002471800537000000506831241612567164179563714422550725405301014240010100006295182721492049686760718327188765487365522500104002010000700201000071710351140021109101000030010100000101090411685121063928612922706010912134713312570025200165117149340580970946100410000400107195471840718827190271796
50024717795381000024798502504129271967812527157025506454051410130400101000062883827122200496876307175071892652403654915001040020100007002010000717663511400211091010000300101000001010896116754510635226889111241109291377136164002520026522716414056098484290210000400107204971972718057193371984
500247177453810000049384625681248716818245471807255070540538101344001010000630136271826904968593071731718706529036537250010400201000070020100007183735114002110910100003001010000010109081179501106632751091080421092413561231813202520026511718464054499695289810000400107175471959718397191871696
5002471919538000001494875256002647198281572715892550675405421012640010100006297022720379049685320718177199365226365615500104002010000700201000071891351140021109101000030010100000101089401515241067126812896703510913128712803612002520016522717984058482292896610000400107222371852718487197771736
50024717635360000005428482488126071761799527158325507054057810148400101000063047527256380496868807174271882652523656125001040020100007002010000718323511400211091010000300101000001010918015951610653266149268038109341136133061000252001651171681405441044106096010000400107203371698716817199671755
5002471816537000000463874251202527169881574716172550685405431013940010100006310712723257049687110719137164765259365766500104002010000700201000071875351140021109101000030010100001101089711615221067528210917116421095512271321013002520016511717504051296499487410000400107177171816717517196271962
50024718175380000005298362536021671648818537167725506254051810130400101000063016727263880496873807191171995650103654505001040020100007002010000718573511400211091010000300101000011010936014449810664260119258049109391275141005002520015711717064058087698689010000400107177271945718727181471716
500247170953900000256186026080308718698297371723255071540494101354001010000632890271657404968683071907718606506836541250010400201000070020100007166535114002110910100003001010000110109070142547106582511491168551091613371280020025200257227165640572862960100610000400107180471770717477187371927
50024719215381000005128872536023671770823557172325506504046210137400101000062929727137000496875907193671897651293656405001040020100007002010000718833511400211091010000300101000011010919116050910667279109001064410925132612311411002520016511718044055284689891210000400107198571867718317176371883

Test 3: throughput

Count: 8

Code:

  ldrsh w0, [x6], #8
  ldrsh w0, [x7], #8
  ldrsh w0, [x8], #8
  ldrsh w0, [x9], #8
  ldrsh w0, [x10], #8
  ldrsh w0, [x11], #8
  ldrsh w0, [x12], #8
  ldrsh w0, [x13], #8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3688

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f191e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)int prf full (71)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
160209301482214000006944821173611112029542819473184517212143251601768017080000801008000040078013023031624926416293702946993390395481601008020080000802008000029254351180201100993910080000100800001008096170413578285187693119577453148580776613947035950513451101161129532358005859957710280000801002942129575294462938129371
16020429530219400100692185517921051522930577749318061876224325160170801618000080100800004007721304242068492625929555293339392039333160100802008000080200800002942635118020110099501008000010080000100809557242151818506072399243845108680166911552754908491451101161129379308005061458410280000801002931729276293112945629433
1602042914522030300067978231720122104291938134071622183520612516017080153800008010080000400745129847606149265582940429354933603933316010080200800008020080000293843511802011009941100800001008000010080964514475492846177311491512253808576983013545634337503951101161129365348005657065511580000801002953329450293512936229398
160204295742203000107215846168811910829341824453160716801960251601578015380000801008000040076412980240584926507295272937692550393721601008020080000802008000029503351180201100994010080000100800001008097031407527985214647159528647878566475313248365705170851101161129379228006961270210480000801002946629528294242923329413
160204295532332000007217848177613012029283813441156517952063251601698015480000801008000040076112985611574926535294022952294680396061601008020080000802008000029546351180201100995210080000100800001008099254400546884857668149348053958619174612248684611443451101161129579408006465458211080000801002935929693293122943829330
160204295502203000006985837176811620829612801499152016072262251601638016280000801008000040080412986991664926590296442933594630394531601008020080000802008000029356351180201100996410080000100800001008098874464540485384672119264052168544073513753284692496051101161129319288005260459511880000801002950929700293592947129309
160204294302213000007373835168811915629363827469177516432245251601698015480000801008000040082313024131684926665294012939493340393031601008020080000802008000029465801180201100994610080000100800001008100853459572185619650119155647938573172211950804719533551101161129543358005456965210680000801002962329436295232945029585
160204295242213000007511827175210613229401821440172118182308251601598017180000801008000040080113007281714926453295612954393910394031601008020080000802008000029322351180201100993910080000100800001008093956414591985523677109243451688610571112254865186180351101161129329218007270253411580000801002937129332294092954529490
160204294952211010006829834178411710029752819446177017142240251601698016580000801008000040077912985430664926401293902951694890394111601008020080000802008000029529351180201100993710080000100800001008095850384569685368683128953651938603571513352615149700451101161129942268004762864811380000801002949329546293132944829762
160204295732203300007077862154410614829620815458167719572016251601458018180000801008000040079113064771654926572297442939894370395851601008020080000802008000029487351180201100992810080000100800001008099953456574885234729129804053528625776513553075216706851101161129597338005460960011280000801002945429241295472937029493

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3682

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f181e2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9abacafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c3cfd2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0e7? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
16002930029223550201063578091696105112295727653662114171921562516008180077800008001080000400333129911106149262730295622957393163941116001080020800008002080000292453511800211092810800001080000010809738039955378588467616926725614860589151355102531088095020001160001129477228007257747511480000800102942129422291802920329486
16002429691220420000076038151720109152292627864122042188221172516007080066800008001080000400354130869406249264020294492955793573940916001080020800008002080000292483511800211093010800001080000010809778139950788549169512902385528862637231355570538886065020001160001129530188006553949011380000800102937729498292912923829497
160024296622224010000664980417121041322946674738919512041213625160072800798000080010800004003431297234062492613302954929524938739504160010800208000080020800002963135118002110948108000010800000108093944377582985362624128754253208648076712853935553720105020001160001129552268006449550911280000800102926629407294062960329525
1600242950022340100106899781175210980293857944042073176722112516007380070800008001080000400353129339115749263020294542942990043935516001080020800008002080000294153511800211092410800001080000110809215237054228570966113875485783862267351255009540753035020001160001129502348005655752014380000800102938229519295052944329631
16002429236220400100069297851728117140294858014012035169321192516006180068800008001080000400302129499507949264350294902929892563991416001080020800008002080000292323511800211093110800001080000110810027139053158576669811890425237863497941355500510469195020001160001129380178006256453311380000800102938529607293512939329585
16002429497220400100072587781656125140292927503551969196120402516006880069800008001080000400390130333316549262270293822936195033951316001080020800008002080000293713511800211092310800001080000110809858638258608557365713883385403858285981155201555487005020001160001129517218005754751310980000800102947229415294272936829499
16002429615219515000073357501688103132293947654272029191821342516006980076800008001080000400337129482005949261380292302962293543938216001080020800008002080000293363511800211093610800001080000110809638837856048529462217867405810861207771315343539990035020001160001129187258006752647610880000800102962629666292642936729387
16002429450221500000073737981712109108294477873782151175520892516006880072800008001080000400263129995715149264130295802944794353933716001080020800008002080000292973511800211092310800001080000010809728543456098571766415867325507867877931244589536272005020001160001129518348005660852510380000800102959729545292752944529399
16002429396218500000069197861600110100293597983672044188421692516007180079800008001080000400388131200506849263680295732935193293954416001080020800008002080000294553511800211092710800001080000010809347139251238537266412876645470859897841335587462469095020001160001129304318007050660310680000800102956329418292452954329515
160024295232213000000722776417528812029261758369176719602154251600818008580000800108000040034812929181544926289029453292059356395401600108002080000800208000029524351180021109221080000108000011080994653626210859756471890530534386604754140470152516704502000116000112942420800685425189780000800102921829582293242920029459