Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (pre-index, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6, #8]!
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 20 | 22 | 23 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20051054700055201004121025190121142520001000100010001000528484582710411040700377320001000100010001000104044111001100010000103000801047101386191017729165500073216221028100045421000100010411041104110411041
2004104080004924008001025001616252000100010001000100052844458241040104069937732000100010001000100010404411100110001000010106050103150106151019305113100073216221037100043291000100010411041104110411041
20041040800061161000121025000019252000100010001000100052844458241040105769937732000100010001000100010404411100110001000010000055102160226121019306113900073216221037100054541000100010411041104110411041
200410408000641210000102540101125200010001000100010005284045824104010406993773200010001000100010001040441110011000100001015004810464030127104936694100073216221037100072401000100010411041104110411041
20041040700047000000102560641825200010001000100010005283645824104010406993773200010001000100010001040441110011000100001036003910342026814103848673100073216221037100036381000100010411041104110411041
2004104080016314000001025900124252000100010001000100052840458241059104069937732000100010001000100010404411100110001000010220050102830201212103750473400473216221037100037351000100010411041104110411041
2004104070104726100001025270201425200010001000100010005282845824104010406993773200010001000100010001040441110011000100001020120641013000091011247163100073216221037100043421000100010411041104110411041
2004104071003914100001025111119252000100010001000100052832458241056104069937732000100010001000100010404411100110001000010120032102840266151027245324160073216221037100031291000100010411041104110411041
200410407000400100241025023319252000100010001000100052840458241040104069937732000100010001000100010404411100110001000010120033102210266171030305176300073216221037100032311000100010411041104110411041
200410407000573410020102591342025200010001000100010005284445824104010406993773200010001000100010001040441110011000100021020120391020203010141031354105500073216221037100039361000100010411041104110411041

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, #8]!
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1919

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502097207754020000100446082717202128718738156271792255081540616101464010010000613366272976649688267199671843653683656215010040200100007020010000720733511402011009910010000301001000011001085861444791064626410886442810936130913234900261025811719034054811711123113210000401007198371901717607195671832
502047203253820000000436083217122120719467954371885255070040608101444010010000613957272209549690537179471808654593656645010040200100007020010000719493511402011009910010000301001000011001092731334871065527812923423310875136713430300261016311719394049611071094114410000401007174171978719267193272092
5020471974538330000004160826172038871910784337172625507854056810137401001000061466327232134968779721227200365406365565501004020010000702001000071878351140201100991001000030100100001100108843155485106352411287646351089612881343330026101631171738405121114968105610000401007208772013721067205471939
50204720235383000000045607811704312071885804547153425508504061610134401001000061506527312054969040719547191065272365674501004020010000702001000071910351140201100991001000030100100001100108686150493106522694688482341090412481213030026101631171731404809391039101910000401007190171880718647197771953
5020471824539200000004590802170421167193083152717322550725405721013240100100006148742720767496882871687718806520236556950100402001000070200100007193935114020110099100100003010010000110010919215849810608274138831342510938111712720300261016311716214049210361141111110000401007204971895720057206872031
5020471916539220000003810827170421087192381053717222550665406721012340100100006155382721136496879971893716476541136553250100402001000070200100007180735114020110099100100003010010000110010925315548110631259886244181089211991372350026101631171751405321033108788810000401007189072053719057199272034
5020471845538200000004370775173621407182580262718722550835406241013540100100006149482725718496878872159719546532836564350290402001000070200100007199135114020110099100100003010010000110010878315348210623268988978231089913181172340026101581171793405169561082102110000401007212072015719607204172067
502047195554021100000464082617362108718947608171751255082540616101414010010000613512272963349688067179072069655703656085010040200100007020010000719253511402011009910010000301001000011001085321684831064126312921722810888129812720500261016411717274050410061140111110000401007183972010720197209071995
5020472036539212000004490807170429671788777627157325508054065210129401001000061439827297244968891717987209865316365556501004020010000702001000071937351140201100991001000030100100001100108972157489106242621193111240109461297118226002610158107162040496952105698910000401007185071812719587201371918
50204718815382100000043008261760296717637834271561255075040664101374010010000614662273307049687547197171817654353656665010040200100007020010000718333511402011009910010000301001000011001090121324781064225210883483310919127713324300261015711718614058810291071100410000401007190071967718147195371972

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.1955

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 20 | 22 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50029724835390000000455814107360100719397942071491485066540550101274001010000613816272600414968775717037195265342365524500104002010000700201000071958351140021109101000030010100001010908015549010655265893042273108921232119005252018511717974050810091185105010000400107187171916718247194871915
5002471968540000000040082610728011671828792407164225506304050610132400101000061424827314481496863771782719326522236570050010400201000070020100417188235114002110910100003001010000101092311575161064427079404846109111381126007252018521716324056810811032101810000400107202171955720157201172179
50024720385390000000423826107440132719598104171701255067040570101324001010000611261272520814968830719957209465333365481500104002010000700201000071998351140021109101000030010100001010930014650410639274992738277108751324126005252027123717334052010591096100210000400107171171939718297169771924
5002472084539000010045384910744010871767809307156425506304054210122400101000061298827291161496894371774718016524436539450010400201000070020100007202435114002110910100003001010000101093901665181061425910909485710884118413000525203711271681404809641013105110000400107199671822720227189971950
50024720965390000100449831107520100717357833071810255065040498101254001010000613306271912014968807719297185365506365570500104002010000700201000071711351140021109101000030010100001010912015250610663255991648731087611321290072520171237157540480114198399610000400107181271904719477186171889
500247212553800000004278191062409671826789317158225506754052210111400101000061239527251501496872671958720496551036561750010400201000070020100007184435114002110910100003001010000101087401475211065428899404856108901243118005252027112717134054010431028108910000400107222671808719837177972236
5002472219538000000048983310672014071828796307159625507084051810127400101000061244327292861496882371889719166538036550750010400201000070020100007171835114002110910100003001010000101090701595011065127288804611010910124412400325202712171648404961071981106210000400107190972038718957174471860
5002471902539000010043881610696013271729789307159125506654053810127400101000061338427170931496885971939717776525136564950010400201000070020100007189235114002110910100003001010000101090901654851063425088934459108901224126009252018511717194054410291095110710000400107193572142719347213271958
5002472004539000000045381910720096719117713071646255062040510101324001010000613618271615014968688718497201465223365612500104002010000700201000071917351140021109101000030010100001010896015051810649269888080303109291253125103252017132719034053210091130105310000400107203771849716877197171964
5002472110538101000041682610696114071710805217159925506204054610137400101000061214927220331496876071821718926514536530250010400201000070020100007187035114002110910100003001010000101092601595061065324611906805310896129413003925203711371742405449141113106710000400107193371761719187184371824

Test 3: throughput

Count: 8

Code:

  ldrsb x0, [x6, #8]!
  ldrsb x0, [x7, #8]!
  ldrsb x0, [x8, #8]!
  ldrsb x0, [x9, #8]!
  ldrsb x0, [x10, #8]!
  ldrsb x0, [x11, #8]!
  ldrsb x0, [x12, #8]!
  ldrsb x0, [x13, #8]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3641

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160209298562201000107087798168884802908676541121862098192925160163801508000080100800004007391280948175492601929243293888986392931601008020080000802008000029153351180201100991110080000100800000100808861840653728557272519918445016857328081315048537414035110111611102933030800503864487780000801002903729302291802913529259
1602042898822010010063378001688107842920578842621412092184725160150801558000080100800004007891292368053492593428967292969003392361601008020080000802008000029136351180201100991010080000100800000100809011837355378569772215891162548185960746133505146971604511010161192917533800703323649380000801002937929057291332917529251
1602042958221810001070298021680961362931679538621692152186925160146801638000080100800004007861287115053492632029352292039167393671601008020080000802008000029084351180201100996100800001008000001008087618382509284610689148663652608546772612954134859170115110121611122933423800763894097580000801002920829336293842913329405
160204290362171000006569785169693152291737834162018235718852516015480159800008010080000400795127893405449260032932829086913339168160100802008000080200800002903635118020110099810080000100800000100809231837554588546672515893264340854157721224754453218035110101612102936634800603053887880000801002911329095292562902729273
160204290772181001006223809171281112291957603902102199118912516013980172800008010080000400785128824605749261982919529066914639172160100802008000080200800002904435118020110099710080000100800000100809291944357048529173017948245121861167891324769559817005110101612102923042800594143728780000801002913229114292472909129141
160204293482171000006550778167295124293187713592170214919602516015680166800008010080000400788128901216449263832917129394914339273160100802008000080200800002924335118020110099141008000010080000010080916163625249853047141686882509185895764117502446631903511071610102918624800623683348380000801002932629361291852940929291
16020429067219100010682180516889010829171809397204622131889251601598017080000801008000040074412799670564926171292342906791993930316010080200800008020080000291833511802011009941008000010080000010080883143805101849327151789834443986309779115539547691904511010161092905523800693623399180000801002911729270290922921829148
1602042911621910000067227951656861082909577442722021960203525160454801728013080273800004007221288504159492614029173292169273392701601008036980000802008000029215351180201100998100800001008000001008097283855037853097491990078514885792818140493150201703511011167142907326800633373728480000801002907729148293042931929110
160204293392201000006808805170410092291207733622013217219632516014980154800008010080000400771128141616349261062922529190903139203160100802008000080200800002903635118020110099510080000100800000100809201737453618471172816854324920862518451285391512818345110121611122919938800573613618980000801002951429266292212909029210
16020429318219100000698879517049811629041798371228220691932251601468015680000801008000040076712786050694925860291852905492483904816010080200800008020080000292233511802011009981008000010080000110080981174196197851566621689222499386596703127496751140055110111612112915727800493923938480000801002913729170291112933029346

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3683

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 20 | 22 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9e | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002929637219200000068827811696841122945075944820382210185425160080800658000080010800004003311299274063982644429335292379186039059160010800208000080020800002917535118002110910108000010800000010809073243150940848257031390042537385969742131558649345503250203163229429258006143338210680000800102916729230292762896029280
160024295712212002000708978317208810829324777438199520762116251600738007480000800108000040034712894381574926182294312941090050390151600108002080000800208000029195351180021109910800001080000001080894334174979085068663109358452338572279512752485344340305020316322907430800523403528980000800102909929286291382933429183
160024296342193100010720681417209896293227604351993180218042516006880074800008001080000400324129158706649260722920029107946703927416001080020800008002080000291633511800211091410800001080000001080933524055529085069669989144582085688658137472556433301005020216322907036800573303589680000800102932429295293232897429333
16002429464219200000062917851592771082907375236218032112176125160053800608000080010800004003171287987062492605729256293109040039051160010800208000080020800002909435118002110910108000010800000010809353539954073285512692139063256018543883313252764876160305020216232933944800733893679780000800102941729369291122908429358
16002429463219201000067098301712829629103762378209122532098251600828007080000800108000040032713001500584926111290562917489420391391600108002080000800208000029215351180021109710800001080000001080950344115814085573725119384847588595969812947344744330305020316322902129800553552999080000800102899529183294062898329109
160024295952182000100656478716961209629111783400208422151896251600678007080000800108000040035312952200504926173292072895390040390031600108002080000800208000029228351180021109610800001080000001080897353925626085050670168735659338576981912245924764470405020316332927733800603813808480000800102953029277292552934829133
1600242940921831030007280798173610310429353796453199521301904251600628006880000800108000040032512893560574925965289342916790990390911600108002080000800208000029087351180021109710800001080000001080934543855228085233710128819047168579487212945785079370305020316232925020800663403488280000800102919429250294252927329297
16002429549219110010069078021720110108293008075212160210919072516005880067800008001080000400287127802205849263802924129030911703912916001080020800008002080000292473511800211091010800001080000001080913334155814085190670790126523986387750117526746695303050203162329275258006439636210180000800102919229183290762905329143
160024292762183000100720179617049810029257764457198820671829251600738006780000800108000040033312908750664926167293352914292130390571600108002080000800208000028848351180021109910800001080000001080878333885084084973709148675057998531464111846924498363405020216332929632800574363759180000800102926029377290832929929098
16002429462220200000065967751760111144293667684242046180917692516006880065800008001080000400325129413206649260822923929149892003932616001080020800008002080000291623511800211091510800001080000001080931534065052085078705118473850508597675011247865154540005020316232897337800583663509080000800102934029387292402893329278