Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPALB

Test 1: uops

Code:

  swpalb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
7200534068256220011010100600053393396902902520002000200010000849311673365934089310200020003000338305339117100110001000020002210061000072005523149101061017941353085724127354144421047463296619023170321850320003410034000339813403434123
72004340872551500170001007000527433888029026200020002000100021049309603376534173310200020003000336935333117100110001000020003310051000042006532149811078227973351584524152351044371244453298718979166241848720003414334054341383411034133
7200434069255160018000100210053623393602890720002000200010000949310693372834069310200020003000338735389117100110001000020002210021000052004320150461069218035362544824213356744441648453301718795167741832720003405034138340553405634082
7200434071256200018000100610052453394502896520002000200010000049309673371134027310200020003000338745343117100110001000020000310031000062008530150271080908014361054924242361744341245413295218965169441849120003412134149340613405734111
7200434058255150020000100800053353401002899520002000200010001849310913390834364310200020003000338445393117100110001000020000210041000052007322150071066518013354564624207357444341251493307519092169731854620003413934082341693410334130
7200434129253140019000100710053733392602897120002000200010001549310063368634131310200020003000338695316117100110001000020003010041000032006420151691068138045367274724201356444401343503295019045168691836820003403734133340403414034117
7200434098254120016010100710053023401812903420002000200010000949309583365634030310200020003000337605388117100110001000020002310041000032003430149811095737954355455124245358044391451473303019115168751819620003412434161341353414134062
72004341682571000150001005000538033981029044200020002000100001049310393376234095310200020003000337685386117100110001000020000210061000042005400149171071217971349064224225361444351248443293619150170471854520003408134171341723402534042
7200434184255160013000100900053393396402896220002000200010000549310543363234075310200020003000338235390117100110001000020000210021000052006300150271072917999354085124176354844421251443303219277167701821920003407834136341613412834183
720043413725616001900010041005296339870290522000200020001000084930909336483414131020002000300033825533611710011000100002000221004100004200423015070111132798535567492422536204435650523300719029170441842220003414234132341413409834136

Test 2: throughput

Code:

  swpalb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0059

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | c3 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30206700615250220505602020120700441211912423853010010100200001010020000592121224860496698070047700591366267301001020020000102003000070059533112020110099100100001010010000010020043031611005610004253663320041283133420213101161170024100001010420000101007006070060700607006070060
3020470059525000050661311013207004429312194238630100101002000010100200005921212124514966979700517005913662673010010200200001020030000700595331120201100991001000010100100000100200320396410056100024533507720067152143300313101161170024100001010420000101007006070060700607006070060
302047005952500005068024201447004441161242364301001010020000101002000059212122465149669827004770059136627030100102002000010200300007005953311202011009910010000101001000001002003203455100761000620662220043154048410313101161170024100001010420000101007006070060700607006070060
302047005952500005065033201007004429214174238530100101002000010100200005921212248504966979700477005913662673010010200200001020030000700595331120201100991001000010100100000100200280325810064100010529224120062160125320113101161170024100001010420000101007006070060700607006070060
302047005952500005072037001007004431215204236130100101002000010100200005921212124704966979700427005913662673010010200200001020030000700595331120201100991001000010100100000100200240274410064100012617284220055154053300113101161170024100001010420000101007006070060700607006070060
302047005952500005055023206127004418214154236130100101002000010100200005921212248504966979700477005913662673010010200200001020030000700595331120201100991001000010100100000100200190275610054100082121263420056148049420113101161170024100001010420000101007006070060700607006070060
302047005952500005057140201007004412113134236230100101002000010100200005921212248504966979700477005913662673010010200200001020030000701075331120201100991001000010100100000100200050435610047100022628163220061176129323313101161170024100001010420000101007006070060700607006070060
302047005952400005060129201607004423113134238530100101002000010100200005921212124804966979700477005913662673010010200200001020030000700595331120201100991001000010100100000100200200317010050100022527185020040172083530013101161170024100001010420000101007006070518700607006070060
30204700595240000504312710907004418219134236130100101002000010100200005921212124504966979700477005913662673010010200200001020030000700595331120201100991001000010100100000100200370365110114100055624283420064164042250213101161170024100001010420000101007006070060700607006370060
30204700595250000507224101207004442224234236230100101002000010100200005921212124804966979700477005913662673010010200200001020030000700595331120201100991001000010100100000100200106426510050100011523143820052155052570213101161170024100001010720000101007006070060700607006070063

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0059

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 23 | 24 | 29 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c3 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30026700585430000115052253100012070043361171042465300101001020000100102000058829122552010496697570043700551366289300101002020000100203000070055529112002110910100001001010000010200320386410062100080836264720082038144410011270041512700231000060120000100107005970056700517005970056
3002470055525000010504712400001007004323116164246430010100102000010010200005882912255601049669757004670058236628130010100202000010020300007005852911200211091010000100101000001020025032411006110006182424340120062025155410001270031551700151000096120000100107005970056700597005670059
3002470055525000010507023110001307004544121204247230010100102000010010200005882912255501049669787004770055136628130010100202000010020300007005052911200211091010000100101000001020032030601005310009280383720068032133270001270021522700201000066120000100107005970056700517005670059
3002470058524000010504312610008070040191282842465300101001020000100102000058829121991010496697570046700581366289300101002020000100203000070055529112002110910100001001010000010200000506310060100071736424020070139047390011270011512700151000096120000100107005670061700567005670059
3002470055524020011506103310008070040251131342470300101001020000100102000058829122555010496697570046700551366289300101002020000100203000070058529112002110910100001001010000010200350356610062100091140244420098127135270001270011521700151000060320000100107005670059700567005670059
3002470058525000011505728100010127003523120164246730010100102000010010200005882912249201049669707005170098136629330010100202000010020300007005953311200211091010000100101000001020024033411006110002191826302004312403737000127001151170024100001010720000100107006370060700637006070060
300247005952500001054578825010018070047295151242475300101001020000100102000058829122552110496697970047700621366293300101002020000100203000070059533112002110910100001001010000010200321245751006610015283030312006512804742001127001152270027100001310420000100107006370063700607006370109
3002470062524000010509018300048171670044201411424733001010010200001001020000588291225511104966982700507005913662933001010020200001002030000700625331120021109101000010010100000102003006264100771000103142244220063146264710001270021522700241000029221420000100107007470063700637006370060
3002470059525000000507604601001807003531220204246430010100102000010010200005882912255111049669797004770059136629330010100202000010020300007005952411200211091010000100101000001020031037621006010003172432302006113603731000127001152270027100001310420000100107006070060700637005170063
300247005052500001150821360100150700472821528424733001010010200001001020000588291225520104966981700547005913662903001010020200001002030000700595331120021109101000010010100001102003604361100831000117243034200671391394200012700115127002410000100420000100107006070060700607006070060

Test 3: throughput

Code:

  swpalb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 18.0048

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | map dispatch bubble (d6) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20205180048134800100100246860100018003300160322201001002000010020000500106462049176959018005118005126177505201003231200200002003000018005127121110201100991001000010010000010020000012024324100000144433447114397012121117167860161798681301320000100180053180040180053180049180053
2020418005113480000000024237010001800360015993120100100200001002000050010711204917695901800511800392617750520100333420020000200300001800512712111020110099100100001001000001002000001214245501000001458434533145410121211171677581617988010131320000100180053180053180049180049180052
2020418005213490100100024447264000018003600160014201001002000010020000500107462049176971018005118003926177493201003587200200002003000018005127211110201100991001000010010000010020000000242981000001466334353143390001117167822161798681313020000100180052180052180040180052180052
202041800511348000001412422100000180033001600222010010020000100200005001065490491769590180039180051461775082010033412002000020030000180039272111102011009910010000100100000100200000121424466100005143123427914357012121117168007161798801313020000100180125180050180040180040180052
2020418005113490000110024347000001800360015994220100100200241002000050010664704917697101800521800392617750520124357120020000200300001800515434111020110099100100001001000001002000001212243061000001431834654144040121211171675241617991613131320000100180052180052180040180052180052
20204180039134900001100243070010018003600160033201001002000010020000500107022049176971018003918005126177505201003609200200002003000018003927211110201100991001000010010000010020000012142465310000214791346841433400121117167638161798807131320000100180052180052180040180040180040
20204180051134800000100243860000018018200159690201001002000010020000500106714049176971018005118005126177505201003352200200002003000018004827211110201100991001000010010000010020000000241731000001431434589144290012111716796216179880001420000100180053180053180053180053180040
2020418003913490000000124375000011800360016008120100100200001002000050010664404917697101800511800512617750520100341720020000200300001800622721111020110099100100001001000001002000000024352100000142593425114630012121117167934161798800131320000100180052180040180052180049180052
202041800511348000000002437400000180037001599222010010020000100200005001056320491769590180039180052261775052010032812002000020030000180048272111102011009910010000100100000100200000013243321000011445834759144550121211171678151617988013141320000100180052180052180053180052180040
20204180051134900000000244850010018003600160114201001002000010020000500106439049176971018005118005126177493201003453200200002003000018005154341110201100991001000010010000010020000012132436010000014361342701428201201117167605161798681310020000100180052180040180052180052180052

1000 unrolls and 10 iterations

Result (median cycles for code): 18.0044

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | branch cond mispred nonspec (c5) | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002518004613490001100267140011800292421647802001010200001020000501000171491769641800471800442317752420010820200002030000180044273511100211091010000101000001020000121426777100001451677136742166971212064010150516331798639662000010180045180045180045180054180045
200241800441349000100026748000180040122164780200101020000102000050100017149176964180044180044231775242001082020000203000018004427301110021109101000010100000102000012142674210000621674436764167741212064010072316331798639662000010180045180045180045180048180091
200241800801349000000026749001180029040164788200101020000102000050100003149176968180048180044231775192001002020000203000018008027351110021109101000010100000102000012026712100001011671636737167391212064010116316331798676762000010180110180046180054180045180077
20024180446134900000002675110018003300216478820010102000010200005010000304917696418004818004423177528200100202000020300001800442735111002110910100001010000010200001202670210000916771367471673612120640101143163317986310662000010180049180045180045180045180102
20024180044134800000002675210118003412416478820010102000010200005010000314917696418004418003923177524200100202000020300001800442735111002110910100001010000010200001215267411000001673536748167461212065710142316331798631010102000010180045180049180045180045180048
2002418003913490000100267121001800330221647882001010200001020000501000030491769641800441800392317752420010020200002030000180039273011100211091010000101000001020000121526743100001241670636743167361212064010108316331798636662000010180045180050180045180045180137
20024180039134900000002672400018002902416478820010102000010200005010000314917696418004418004421117756920010020200002030000180044273511100211091010000101000001020000002670310000171673936744167131212064010083316331798676662000010180049180045180045180045180115
200241800441348000000026721000180029040164812200101020000102000050100003049176959180044180044231775242001002020000203000018004427351110021109101000010100000102000012152675810000641674036746167451212064010110316331798636662000010180045180040180040180045180094
2002418004413490000000267010011800290441647882001010200001020000501000030491769641800441800392317752820010020200002030000180044273511100211091010000101000001020000120267461000011516745367811671112120640101043163317986766102000010180050180045180046180045180072
20024180044134900000002677600018002900416478820010102000010200005010000314917696418004418004823177524200100202000020300001800442735111002110910100001010000010200001215267691000012316704367151671412120640101103163317990310662000010180046180040180045180045180049