Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASALH

Test 1: uops

Code:

  casalh w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 4.001

Issues: 3.006

Integer unit issues: 0.000

Load/store unit issues: 3.006

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 63 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | atomic or exclusive succ (b3) | atomic or exclusive fail (b4) | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
740093414225624211110121053123400001220803009300930092302012149310683362534045102530101002300610006012340372727217100110001000030094110102002073012999130201492710585179353531756238073536443015544932956190961661517604300010013416534129340413407934097
7400534040255201600100700526133976102199930063006300623132514931062336883410572530061002300610016012340382713217100110001000030094210052004043015999151201496610724379833622752237743536444816574633019190761654817347300010013415434049341563405934091
740053406325519160010101053253386500220003006300930062311931493099933599341397253006100230091001601834109274621710011000100003003031006200407301199917120149641065137941351285623761358344408524233039190371668817636300010013408433758340683409034097
7400534126255161900100810535833974102200030093009300923138514931013336233416972630061002300610026018340542729217100110001000030066310082004073011999161001503810809379423565949238583583443911494433111191581647917543300010013410734103341203418634090
7400534099255181400100910527333888002207330063006300322949314931013336303405882630061002300910026012339792729217100110001000030066210042004053011999171201497210615379493561755238123523444315475232992189871669417831300010013408334175340943413734112
740053416225618170010071052773393200220103009300630062310001493104633559341438263006100330091002601233960272121710011000100003006621007200408301099914120149991063427925353974323863358144368564432988191111655317400300010013415834077340503405834148
7400534096255201600101010529533978002201330063006300622984914931049335923404882530091002300610026012340512726217100110001000030094010462006216301099915120150111077027971352155223868355644416474833042192051654017669300010013407734143340623402434099
740053410025617210010051053443388900219703009300630062299051493102033653341588253006100230061003600634047272221710011000100003006001007200407301099917120149901063137938354865523789353544427484932975186761647717720300010013416134193341863415034118
7400534138256151800100910527234011002204130093003300622908714931040337093414072630031001300610036018340312777217100110001000130034210062006073007999171201497610489279133542649238843657443915515033059191141675017744300010013411834078340753419934072
7400534097255191411100900526433879002203630063009300622980714931098336683410982630031002300310026018340092729217100110001000130064210102004063015999150201491810599279843553115023862354444428465133025190371658717702300010013411434071341103413234038

Test 2: throughput

Code:

  casalh w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0061

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9e | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | atomic or exclusive succ (b3) | atomic or exclusive fail (b4) | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020870071524000005300835110700463011411304010010100300001010030000505132506080496698170049700613600434010020200300002020060000700613461120201100991001000010100100000010030038704410052200081261832300669998202914326400113101171170026100006630000201007006370062700637006270062
502047006152500000506210311132700472211417314010010100300001010030000505132504660496698170040700613600434010020200300002020060000700613461120201100991001000010100100000010030046785110061200101392443300799998203012398400013101171170026100006630000201007006270062700627006270063
50204700615250001050626241107004631113133040100101003000010100300005051325056604966981700497006113600444010020200300002020060000700613461120201100991001000010100100001010030039564510052200101382439300759998202310357803013101171170026100006630000201007006270062700637006270062
5020470061525000105068143211070046241813304010010100300001010030000505132504040496698170053700613600434010020228300002020060000700613461120201100991001000010100100000010030035602910045200101362632300389998201816325200013101171170026100006630000201007006270062700627006370062
502047006252500000507294211470046241129304010010100300001010030000505132504460496698170049700613600434010020200300002020060000700613461120201100991001000010100100000010030026543410053200102322434300489998202212346800013101171170026100000630000201007006270053700627006270062
50204702685270000050531245110700462511310304010010100300001010030000505132504660496698170049700613600434010020200300002020060000700613461120201100991001000010100100001010030060423810060200102351436300799998202912417600113101251170026100006630000201007006270062700627006270062
50204700615250001050491045110700461911214304010010100300001010030000505132504620496698270050700623600434010020200300002020060000700613461120201100991001000010100100000010030040805610063200102393241300639998202414314800013101171170026100006630000201007006270062700627006270062
50204700615250000050421227110700462411112304010010100300001010030000505132504450496698170049700613600434010020200300002020060000700613461120201100991001000010100100000010030045704310079200142443844300949998202912538400013101171170027100006630000201007006270063700637006270062
502047006152500000506618531147004633161030401001010030000101003000050513250404049669837004970061360043401002020030000202006000070061346112020110099100100001010010000001003001740531005720008027242530051999820228295600013101171170026100006630000201007006270062700627006270053
50204700615240000050461428111270046401121330401001010030000101003000050513250588049669817004970061360043401002020030000202006000070062346112020110099100100001010010000001003003762461006220010053048300739998202314496800013101171170026100006630000201007006270064700627006270062

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0074

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 23 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ab | ac | af | atomic or exclusive succ (b3) | atomic or exclusive fail (b4) | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002870074525222015070134610402070059358107334001010010300001001030000500632511110496699470062700583600564001020020300002002060000700743591120021109101000010010100000103005314584610091200153582441301029998213093352142113071171170039100001313530000200107007670075700757007570076
5002470075525103005057924100207005924943334001010010300001001030000500632507760496699570062700743600564001020020300002002060000700743591120021109101000010010100000103004515565110066200091332855300629998213192454141112701171170039100001313130000200107007570075700757007570075
50024700745251110050361127100107005922866334001010010300001001030000500632507151496699470062700743600564001020020300002002060000700743591120021109101000010010100000103005413505110093200101306593007199982138113270131012701171170039100001313130000200107007570075700767007570075
50024700745251000050539191002070059421056334001010024300001001030000500632510531496699570063700743600564001020020300002002060000700743591120021109101000010010100000103004617505410081200112350513008299982138134488141012701171170030100001010030000200107006670066700667006670066
5002470065524000005042239100107005929844334001010010300001001030000500632507971496699470062700743600564001020020300002002060000700653501120021109101000010010100000103009314807010069200109383452300669998213393766131012701171270039100001313130000200107007570075700757007570075
50024700745251110050681125100107005928842334001010010300001001030000500632507111496699470062700743600564001020020300002002060000700743591120021109101000010010100000103004114684210066200112341842300719998213293362131012701172170039100001313330000200107007570075700757007570075
500247007452511100505292610020700594210105334001010010300001001030000500632505851496699470062700743600564001020020300002002060000700753591120021109101000010010100000103004215484410062200093303059300789998213192858140012701171170039100001313130000200107007570059700757007570075
500247007452410000506092410024070060318178334001010010300001001030000500632508140496699470062700743600564001020020300482002060000700743591120021109101000010010100000103003515485110053200091372041300519998213093256140012701171170039100001313030000200107007570075700757007570075
500247007452510100504711261002127006029845334001010010300001001030000500632509000496699470062700743600574001020020300002002060000700743591120021109101000010010100000103005615765310060200092353646300629998212693754140012701171170039100001313330000200107007570075700757007570075
50024700755251110050489361002070059289815334001010010300001001030000500632507260496699570062700743600564001020020300002002060000700743591120021109101000010010100000103004715643410086200092503847300719998213493972142012701171170039100001313330000200107007570075700757007570075

Test 3: throughput

Code:

  casalh w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 21.1408

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 63 | 69 | 6a | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | 79 | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | atomic or exclusive succ (b3) | atomic or exclusive fail (b4) | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402102114211574010000024026000211395370433014210030039100300395002858841014920832802100892114221320004230142101021430042102146008421006515052110201100991001000010010000110030039028172403220028001402144062999911401810011172774190170020988413053000010101210066211423210090211410210093
4020521007815830001100240270002100614503430142100300421003004250028586881149208332021006521141113201385301420010214300421021460084211408150821102011009910010000100100000100300420281724030200280014023442739999114014100111727741001700211227101053000010101211410210078211410210079211423
40205211424157300000002401400021006146034301421003004210030042500285884211492083280210076211409132013873014200102143004210214600842100891518211020110099100100001001000001003004200172403520028001402544058999911401202011172774210170021122710053000010101210077211414210066211411210079
402052100891583000000024025000210063306343014210030042100300425002858655114920832802100892114081320138530184101021430042102146008421142429632110201100991001000010010000010030042028172403720028001402944059999911401412011172774150170021123001023000010101210090211412210066211424210066
40205210076158300011002401500021140640634301421003004210030042500285868601492083170210078211410132013853014210102143004210214600842100891516211020110099100100001001000001003004200172403220054501403944374999911401802111172774100170020988201003000010101211424210090211400210077211424
40205211412157300000002402600021139346034301421003004210030042500285884101492083190211416210089132000453014210102143004210214600842100891493211020110099100100001001000001003004202802403220028001402244061999911401802011172874090260021145010023000010101211409210090211424210090211422
40205211423158400010002403800121138240034301391003003910030039500285976501492070090211421210089132000313013910102133003910213600782114081495211020110099100100001001000011003003900172402420026221664244278999911404112011173374142272220985410023000010101211411210090211411210079211424
40205211408157400000002403910121140740634302651003003910030039500285866801492083330210092211423132013903014200102143004210214600842114121484211020110099100100001001000001003004202802403620028001402744061999911401902011172774160170021122710023000010101210066211398210077211411210066
402052100781583000000024015100211395450343014210030042100300425002859163014920699602114092100891320004230142001021430042102146008421007615172110201100991001000010010000010030042028182403420028001401644058999911401300011172774030170020988410003000010101211423210093211474210083211411
402052114091573000000024015000211411066343014210030042100300425002858865014920834102100892114221320138530142101021430042102146008421141215082110201100991001000010010000110030042028172402920028001401544062999911401212011172774150170020988201053000010101210066211411210093211424210082

1000 unrolls and 10 iterations

Result (median cycles for code): 21.1408

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 22 | 24 | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 63 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | 79 | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | atomic or exclusive succ (b3) | atomic or exclusive fail (b4) | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40030210089158300000240360102113824063430049103003910300395028611441492070092114212100761020138430049001003330039100336007821142314952110021109101000010100000103003902802403120026122014015440579999114018120000651741531833209855101003000010011210079211403210066211411210079
40025211421157400000240260102100744663430049103003910300395028586671492069962114222100761020004630049101003330039100486007821142315082110021109101000010100001103003900172403120038115014022440539999114020120000651740831833209853101053000010011211453210095211427210090210099
400252100781583000002402601021006140534300491030039103003950287665014920831921007621141510201377300491010033300391003360078211410149521100211091010000101000001030039028172405620026520140254404799991140200200006517415318332112010053000010011210077211411210090211409211400
400252114151573000002404801021005046534300491030084103003950287648414920833021007621142310201367300490010033300391003360078211412148421100211091010000101000001030039028172402620026101401644051999911401900000065174153183321119013003000010011210090211409210079211413211458
400252114111573000002401300021005040034300491030039103003950285848214920834421008121142410201389300491010033300391003360078211415150821100211091010000101000001030039028024032200261101401844057999911401812000065174153183321119813053000010011210090211424210090211416210075
40025210092158300100240270002100614663430049103003910300395028588211492070012114142100651020004430049101003330039100336007821009229313110021109101000010100000103003902817240312002610501402644062999911401210000065174063183321120113053000010011210082211400210090211425211446
400252114251574000002401401021007446634300491030039103003950285886314920699821139821008110200057300490010033300391003360078211415149521100211091010000101000001030039028024031200263901402144051999911401700000065174053183320985301053000010011210093211416210066211425211463
4002521141416400000024027000211393455343004910300391030039502858638149206996211421210472102013913004921100333003910033600782114081508211002110910100001010000010300390017240242002611014033440589999114017120000668740931833211196131053000010011210079211400210079211411211491
4002521142315730000024026000210077455343004910300391030039502858667149207012211424210076102013673004910100333003910033600782114101484211002110910100001010000010300390281724034200267014014440519999114020100000651740331833209855101003000010011210077211414210088211424211475
40025211412157300110240270102100740663430049103003910300845028766111492083432100782114261020137830049101003330039100336007821141015082110021109101000010100000103003902817240322002612014014440579999114023100000696741231833209869131053000010011211413210093211422210082210093