Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPALH

Test 1: uops

Code:

  swpalh w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1i tlb fill (04)mmu table walk instruction (07)09l2 tlb miss instruction (0a)0e0f1e1f223a3f464951schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)5f696a6d6emap rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafbbl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)cfd0d1d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
7200533116246030311100600601132917102786120002000200010001649300183268633251310200020003000329415184117100110001000020000010031000042004422157491188118377391414523114394144403436393241816420148381551120003273032834327413293132865
72004330192463101001005005777327720028038200020002000100005492987732646332173122000200030003297752511171001100010001200020100310000112003222163141183318152392204622833383044403041433253616811151251574420003317232626331423297632985
7200432918249000200100600589333111002764820002000200010001449297393281632891313200020003000329165236117100110001000120002010021000042003322162121188328447398513723123408344393038353261416495149501602420003279332797329223276332974
7200433252247210200100701584932907002801620002000200010000349298763279033176315200020003000327855217117100110001000120000210021000042004322162751207928434401014122954382644423141363247717386153951681320003305033096330563322633048
7200432866246020200101101598033055002770920002000200010001449296533262932868313200020003000328865257117100110001000020002210061000082004422161871204808435388713823381380344372741433245016817149171683920003306432776327743286933004
72004329052491101001006005770329950028009200020002000100018493002232701328023152000200030003297552631171001100010000200022100310000112005202163931214518514409813823055392744382942463242317029152151632220003285533031329033306132950
7200433008249020100100800571832816002778920002000200010000849299953275332996313200020003000327065242117100110001000020002010051000052004522163011150418487390914023110391644313540363259217411150591615720003296933137328883284232949
7200433222247010100101401600233015112779920002000200010000449299623278232996314200020003000328815189117100110001000120000010021000032002322162321219718421402503823038393344362838363248416549151301613220003294232819327933286833132
7200433071248021000100601575433051002805320002000200010000649301793293732855313200020003000327585225117100110001000020002210041000052002322157791154018296380503223018385244453236403243316495155281636720003299432871330553281632873
72004330012480201001006015696329810028116200020002000100001349297513273032981314200020003000329535236117100110001000020002210031000042005422162991206508529401103823002398544362536413253616493147001596020003285132960329113301332848

Test 2: throughput

Code:

  swpalh w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0059

retire uop (01)cycle (02)03l1d tlb fill (05)l2 tlb miss data (0b)0e0f181e1f202223293a3e3f4043464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafb6bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
302067007052500000514703610021070044221081815423623010010100200001010020000592121224850496697970047700591366267301001020020000102003000070059533112020110099100100001010010000010020033065811020110014253361482017521311735300131011611700321000010101020000101007006870068700687006870069
3020470067525001105168042100130700474810119234236530100101002000010100200005921212248614966979700477006813662673010010200200001020030000700595331120201100991001000010100100000100200301436641019110008144132140201581137039460113101161170024100001310720000101007006370060700607006370063
302047005952500000515022310013070047241011217423653010010100200001010020000592121224851496697970050700591366267301001020020000102003000070062533112020110099100100001010010000010020039040631017610001344530151201851135156620013281161170027100001010720000101007006370063700637006370063
302047005952400010516013210015167004432101127423653010010100200001010020000592121212481496697970050700621366267301001020020000102003000070062533112020110099100100001010010000010020049057741017510004353938162201711132041400113101331170024100001313720000101007006370063700637006370060
30204700625240000151540301001907004717107109423893010010100200001010020000592121224851496698270047700591366270301001020020000102003000070062533112020110099100100001010010000010020031049711019310001143116141201811137033300113101161170024100001010720000101007006070060700607006070060
30204700595240010051510431002107005230108811423703010010100200001010020000592121224861496698770055700671366275301001020020000102003000070067541112020110099100100001010010000010020031050611019710002263930148201571135044400113101161170027100001310720000101007006370063700637006370051
30204700595240000051451151001416700473210217134238630100101002000010100200005921212247914966979700477006213662673010010213200271020030082700595331120201100991001000010100100000100200750363510170100530421301300201471121127260213101171170027100001310720000101007006070060700607006070051
302047005952500000515812510010470047161031510423653010010100200001010020000592121224851496697970050700591366267301001020020000102003000070059533112020110099100100001010010000010020020027461015310001152732140201631137248320113101161170027100001310720000101007006370060700607006070063
30204700595250000051560451003070044351021813423623010010100200001010020000592121224851496697970047700591366267301001020020000102003000070059533112020110099100100001010010000010020035034461017610001043728153201631130246310213101161170025100001010720000101007006370060700607006370063
3020470062524000005173032100190700443110199423613010010100200001010020000592121212411496698270050700621366260301001020020000102003000070062533112020110099100100001010010000010020021032551016910000153526146201661110231320113101160170032100001310420000101007006070060700607006370063

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0064

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e1f20222324293a3e3f4043464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9e9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafb6bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
3002670064524100005046732100015070049250151042470300101001020000100102000058829121359049669847005670064136629530010100202000010033300407005853811200211091010000100101000010102005314315610067100472934446420103135851361410127021511700291000099720000100107006570065700657006570065
300247006452511100507793900009070049210191242473300101001020000100102000058829122568049669847005270064136629530010100202000010020300007006453811200211091010000100101000000102004313374810073100151924305320060036731401411127011511700291000099620000100107006570065700657006570065
30024700645251110050988391000807004925171042477300101001020000100102000058829122580149669847005270064136628730010100202000010020300007006453811200211091010000100101000000102004615367710096100182054246720100152851351411127011511700291000099720000100107005770065700657006570065
3002470064525100005054724100018070041260171342476300101001020000100102000058829122361149669847005270056136629530010100202000010020300007006453811200211091010000100101000000102004115366610076100120730185420074038725341300127011510700291000099620000100107006870065700697006570057
3002470066543110005072872100011070049360102042466300101001020000100102000058829122563149669847005670056136629530010100202000010020300007006453811200211091010000100101000000102004315314210093100122736265920082136843461410127011511700291000009620000100107006570057700577006570065
300247005652510000507975410001107004930012842474300101001020000100102000058829122568149669847005270064136629530010100202000010020300007006453811200211091010000100101000010102002514214710064100140724304720077127730251420127011512700611000099720000100107006570065700577006570057
300247006452511200507282310005070049230151242471300101001020000100102000058829122517149669847005270064136629530010100202000010020300007006453811200211091010000100101000000102005214297810084100140730305420093237737381426127021511700291000099720000100107006570065700657006570065
300247006452511100506373110001070049310131842473300101001020000100102000058829122570149669847004470064136629530010100202000010020300007006453811200211091010000100101000000102004615454910102100222948246520075148833421400127021511700291000009620000100107006570065700577005770065
3002470064525101005058826100090700492201310424753001010010200001001020000588291225690496698470052700641366287300101002020000100203000070064538112002110910100001001010000001020023145054100551001331024264420066139835341470127021511700211000099720000100107006570065700577006570065
30024700645251101150838251000153670049230712424753001010010200001001020000589761225671496698470056700641366295300101002020000100203000070064538112002110910100001001010000101020051144272100841001821442425120112043751451340127011511700291000099020000100107006570065700657006570065

Test 3: throughput

Code:

  swpalh w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 18.0052

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f2223243a3f43464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)7bmap int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafbbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd1d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
2020518006214360000100024838120001180041808160098201001002000010020000500107170149176976180056180056261775012010033662002000020030000180056272911102011009910010000100100000100200000121324261100000014175343581434101212001117228182325221798721001020000100180057180057180057180057180057
2020418005613950000000024336010011800411208159867201001002000010020000500106563149176961180056180056225177552201003225200200002003000018005427271110201100991001000010010000010020000012024277100000014452343951448601212001117227439225221798721010020000100180057180057180057180057180057
202041800561349000000002429212100018004312301601992010010020000100200005001071470491769671800471800472617751220100322820020000200300001800472731111020110099100100001001000001002002524122424243100120014273344241454912121223011171677250160017988766620000100180059180059180059180059180059
2020418004713491101000025415120100180043124016029820100100200001002000050010754314917697818005818004726177512201003808200200002003000018005827201110201100991001000010010000010020023240024001100120113946339331414712121222111171675140160017988766620000100180059180059180059180048180059
20204180047134910100000240421201001800327311160296201001002000010020000500107767098177008180058180058261775122010035432002000020030000180058273111102011009910010000100100000100200242312024398100120014438344641435512121222311171678810160017988860620000100180048180059180059180059180059
202041800581349100000002583612001018004374015979220100100200001002000050010737714917697818005818005826177512201003293200200002003000018005827311110201100991001000010010000110020023241202438610012011448434508144601212023011171679410160017987606620000100180059180059180059180059180048
20204180058134911010000257371200101800431211316017120100100200001002000050010849514917697818005818005821017763820100365820020000200300001800582720111020110099100100001001000001002002324122425272100120114941348751447912121222111171678770160017988766020000100180049180048180059180059180059
2020418005813481011000024451120000180043831116013820100100200001002000050010640204917697818005818005826177512201003341200200002003000018005827311110201100991001000010010000010020023240242451610012111452234556142601212022411171675620160017988766020000100180059180059180059180059180059
2020418005813491001000024064120000180032641116064620100100200001002000050010798804917697818004718005826177501201004019200200002003000018005827201110201100991001000010010000010020022230242409010012001404434110139941212022011172777880180017991766020000100180059180048180048180048180059
2020418004713481011000024197120100180083122816047020100100200001002000050010767414917697818005818005826177512201003825200200002003000018005827311110201100991001000010010000010020025240023915100120114008340961399712121223111171677271160017988700620000100180059180048180059180059180048

1000 unrolls and 10 iterations

Result (median cycles for code): 18.0048

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)0e0f191e1f22233a3f43464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)7bmap int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafbbl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)c2branch mispred nonspec (cb)cfd1d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
200251800441349000000267820101180024144164780200101020000102000050100017049176959018009518005123177527200108202000020300001800442735111002110910100001010000010200000121526768100000016703367451674512120064010113216221798636662000010180040180049180040180049180049
200241800481349000000267480000180033022164792200101020000102000050100003149176968018009418004423177519200100202000020300001800442735111002110910100001010000010200000121526741100001016751367691678200006401010121622179867106102000010180040180049180049180049180045
200241800481349000000267860001180034002164783200101020000102000050100003149176959018064418005823177519200100202004820300001800482735111002110910100001010000010200000121526780100000016711367391677800016401010321622179867106102000010180049180040180049180051180113
200241800391349000000267530101180029000164788200101020000102000050100003149176959018012318006023177529200100202000020300001800482735111002110910100001010000010200000120267031000000167713674516746121200640100983162217985810662000010180050180049180045180045180045
200241800441348010000267460101180033020164792200101020000102000050100003049176959018009618004623177528200100202000020300001800482735111002110910100001010000010200000120267141000000167783673816714121200640100762162217986710602000010180077180049180049180049180045
200241800481348000000267080000180029020164792200101020000102000050100003049176968018008818004523177519200100202000020300001800482730111002110910100001010000010200000002673910000001673936779167711212006401013821622179867101002000010180049180049180062180049180049
200241800481349000000267760101180029040164788200101020000102000050100003049176968018003918004823177524200100202000020300001800482730111002110910100001010000110200000120267451000020167343678216778120006401011421622179867101002000010180049180049180049180049180045
2002418004413490000002674900011800330201647922001010200001020000501000031491769680180594180064231775282001002020000203000018004427351110021109101000010100000102000001215267791000000167113674316745120006401014621622179863101062000010180049180040180040180040180049
20024180048134800000026745000018003302216478820010102000010200005010000314917695901800441800442317751920010020200002030000180039273911100211091010000101000001020000012152673810000001673736778167680120064010066216221798586662000010180049180049180049180049180049
20024180039134800010026776010118002900216479220010102000010200005010000304917696401800671800472317752820010020200002030000180048273911100211091010000101000001020000001526741100000016746367681676901200640101082162217986701002000010180040180040180049180049180049