Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPA (32-bit)

Test 1: uops

Code:

  swpa w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03mmu table walk instruction (07)09l2 tlb miss instruction (0a)0e0f18191e1f2223243a3f464951schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)5f6061696a6d6emap rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafbbl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)cfd0d1d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
72006329362461411711001038010060953260801276682000200020001000020049299463293232795310200020003000327965301117100110001000020002210021000042002222160431216318400371095122997396544371240433259017338155121585320003289133056329013281032841
72004328442461301500001019010059943274701276162000200020001000010049298383305833094310200020003000327765291117100110001000020002210021000022002222164631205518145397684722950397644391843443231016373152791708520003282432909329863269132789
72004329372461501400001005000060763276210277562000200020001000030049300483278533039310200020003000330705253117100110001000020002210021000062002222161981195508159369954422745371544391443423240816459149681588520003283232650327873304932815
720043280024515013000010020100610932634012785620002000200010000100492984932768328033102000200030003306752831171001100010000200022100210000420012221636311492184284030104923083394044451144443246817040146411591020003285432832331213278933291
72004328112451801500401026010059733278711277722000200020001000010049302553293832801313200020003000327405275117100110001000020002210011000052001122162091167718411401354522993406044401743403248816631150491565520003269532737327793273233142
72004327742451501300001002010060473268710276132000200020001000030049298913305133160310200020003003330225274117100110001000020002210041000042002122163221201408342401274322943377044401544433238016697144511624220003283632916328233279032677
72004327632481601500001005000056903278500279542000200020001000000049298563282433079310200020003000326675288117100110001000020002210031000042002322163051211408437403174222937402744411943463248916809145031594820003275332863327483281033053
72004332032461501500001005010059253267011275732000200020001000010049300243279233008310200020003000326105259117100110001000020002210011000022002222163851207318220397364522880402544441349453234216529144751568020003272232740327613274532820
72004327962451201500001004010060863284410279462000200020001000000049299573274733103310200020003000328365267117100110001000020002210031000042001222160761207208318387674722996383044381750463236116385142031573620003260532699329153284432756
720043263624612014000010190000571732634102807420002000200010000200492994532667329713102002200030003280052671171001100010000200022100210000420011221633511377184573944104222760379644431448513248917631142461544620003327132724328873275732639

Test 2: throughput

Code:

  swpa w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0338

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f202223293a3e3f4043464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafb6bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
302073043122700000000415123207101926113630303169118536540533010010100200001010020000592121217731492733830350303861326612301001020020000102003000030388270112020110099100100001010010000010020313243523401052010060191221370404204629299254353632332131011611303251000043312520000101003033230303302923039130361
30204303112283333000039442022310069603027821811852523997301001010020000101002000059212121760149273903036630351132656630100102002000010200300003032627011202011009910010000101001000001002021818306343104931009625621332418204593244194142571703131011611303471000044282020000101003029530297302713027730308
30204302702273332000039542416810160551630294208127506039613010010100200001010020000592121217431492724830374303231326480301001020020000102003000030356270112020110099100100001010010000010020231182973341042410089291017350350204444223234122781802131011611303011000039222120000101003027130357303033027330299
3020430319227222100004000212461016050843022820113160613987301001010020000101002000059212121774149272483027330233132643930100102002000010200300003025927611202011009910010000101001000001002025120323415104151008422817156344204277232184232791802131011611303891000027231920000101003025930358302943023730296
3020430381228222000003877222321040529230260208128586639373010010100200001010020000592121217891492726430303303351326528301001020020000102003000030305270112020110099100100001010010000010020215202433411046210048221120734346204724246203933331842131011611303071000038222320000101003028430353302643025330356
302043026522722220000392628322930849403028013911360773946301001010020000101002000059212121753149273633027230354132656330100102002000010200300003029327011202011009910010000101001000001002029513213413104591007818621354382204474261193462901404131011611303421000026231720000101003031830284302743030630293
3020430293226100100003998231861006324302812201086265401430100101002000010100200005921212176614927282303543040913265593010010200200001020030000303252701120201100991001000010100100000100202761529938710502100952312187644732057312276234273321461131011611303811000025182620000101003034230376303993030830328
3020430350227111000003852142271010458483034420511469704019301001010020000101002000059212121757149273323036330393132653130100102002000010200300003040227011202011009910010000101001000001002023113330370105241004110726026466205219292133212531411131011611303921000047282420000101003031430336303273026430308
3020430349227101000003997212471005948303491881404569403030100101002000010100200005921212180914927342304133035329266903010010200200001020030000303572762120201100991001000010100100000100202491628542110534100792210243181067204356251163523761321131011611303111000054273020000101003035230345303703038630308
3020430348227100000003881182421040786830403237139377340573010010100200001010020000592121217891492730230342302981326492301001020020000102003000030380276112020110099100100001010010000010020212143193601048210074132424360468205937288213753171411131011611302791000042312620000101003035230361303023039130414

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0266

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e0f18191e1f202224293a3e3f4043464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafb6bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2c3cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
3002730325227201202000038611922510324756302291817831103958300101001020000100102000058830121774149272023027630297132654330010100202000010020300003028326611200211091010000100101000001020206312163081029510054146321226314204397238183382502531127010161313302421000020161420000100103024030266302463024630259
300243029322730300300003735201731003872303152101191673985300101001020000100102000058830121771049272193028430296132652630010100342000010020300003029526611200211091010000100101000001020266202473831023010092217015458249203845226161852211931127012161111302631000024191320000100103025130264302733028830245
3002430284226201202000038571620510032643026315995890399630010100102000010010200005883012178304927221302693025913264903001010020200001002030000302142661120021109101000010010100000102023620224272102731009325611647030020263322714239236200012706161112302951000016151420000100103025830266302493028430304
30024302892272022010000387220223108331563025916989971393830010100102000010010200005883012179304927179302493025013264723001010020200001002030000302772661120021109101000010010100000102019620258328102761008512441001831820378321918356280205012701316612302541000030181620000100103027030301302633026930300
3002430276226202000000037331416710120197630208200873843959300101001020000100102000058830121787049271563028830341132649830010100202000010020300003026026611200211091010000100101000001020219192443271029010029189020418330203285213182682761951127012161112301951000017121320000100103024930234302463024730269
3002430266226202200000037851816310104357230198190114980395030010100102000010010200005883012179104927180302303025713264933001010020200001002030000302362661120021109101000010010100000102025222307332102961009811591761835120307521121211218190012701116512302541000024151120000100103024930248302583023930223
30024302842262020020000374018182103234443026016413808539773001010010200001001020000588301217720492721230267302691326488300101002020000100203000030266266112002110910100001001010000010202532422232510292100832149128132238203755222182142621921127012161111302581000024191120000100103022130252302223025230231
3002430208226202002000037912021210128401563019621267295394030010100102000010010200005883012179504927192302723025413264703001010020200001002030000302222661120021109101000010010100000102022121251307103421004922621608834520311418919249167201112704161111302751000019161420000100103028830250302673029030258
3002430295227101101000037401124810803913630214229882533945300101001020000100102000058830121806049272153029530275132646530010100202000010020300003025826611200211091010000100101000001020234192573901029210034144814420268203254206162542382002127012161111302511000021171320000100103025330262302403024930308
30024302582262020000000379013177101364888301881978801023950300101001020000100102000058830121783049271503027830254132645030010100202000010020300003027026611200211091010000100101000011020227212403451033710096213518818288203533217152942192020127012161213302921000015181320000100103028130270302543030030340

Test 3: throughput

Code:

  swpa w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 13.0061

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f1e1f223a3f43464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)7bmap int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafbbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd1d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
202051300619741111011449312012113004705011011320100100200001002000050010699914912698101300601300612612750620100344820020000200300001300611988111020110099100100001001000001002002322022241341001201142953423214245121212231111722772122522129946130020000100130062130062130062130062130065
20204130048974111000143311201251300460501100182010010020000100200005001065681491269810130060130061261274932010035032002000020030000130048198811102011009910010000100100000100200222212232430810012001422734282142771212122301117227805225221299331313020000100130049130062130062130049130062
202041300619741111011403512012313004600011029020100100200001002000050010746514912698101300601301002111275062010040002002000020030000130048198811102011009910010000100100000100200242512023849100230013752340261372612121222111172277042252212994613131320000100130064130062130062130169130062
20204130048974101000138961201231300460561104512010010020000100200005001074061491269810130068130061261275062010035112002000020030000130061197511102011009910010000100100000100200222212222424110012001434933994140731212122301117227636025221299460131320000100130067130062130062130049130062
20204130061974111100143461201231300460661100062010010020000100200005001071241981270630130060130061261274932010032202002000020030000130061198811102011009910010000100100000100200232412222423510012001445334759142401212022011172275272252212994600020000100130062130062130062130062130062
20204130061975100000141681211191300330651101332010010020000100200005001062201491269810130047130061261274932010034772002000020030000130061198811102011009910010000100100000100200262412232447410012001418634143141081201223111172276742252212993313131320000100130062130062130062130062130062
202041300619741011001425012012313004605611014020100100200001002000050010651914912698101300471300482612750620100351620020000200300001300611988111020110099100100001001000001002002524122224325100120214198343971411212121222011172275222252212994613131320000100130062130062130049130049130140
20204130061974110100142531211231300461551100062010010020000100200005001063781491269810130060130061261275062012433952002000020030000130061198811102011009910010000100100000100200252612024363100120114434344901426112121222011172276392252212994613131320000100130062130062130062130049130062
20204130048974111100142751201231300460451100392010010020000100200005001068921491269810130060130048261275062010036842002000020030000130048198811102011009910010000100100000100200242512242409810012011425434116142541212122311117227595225221299331301320000100130062130062130065130062130062
20204130061974111100141621201221300460661097882010010020000100200005001072021491269680130060130061261274932010034782002000020030000130048197511102011009910010000100100000100200232412024342100120014219343401435712121223011172274322252212993313131320000100130062130062130062130062130062

1000 unrolls and 10 iterations

Result (median cycles for code): 13.0076

retire uop (01)cycle (02)0309l2 tlb miss data (0b)0e0f1e22233a3f43464951schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)7bmap int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafbbl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)c2cfd1d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
20025130061974000016646101111300331221148542001010200001020000501000171491269680130051130052231275322001072020000203000013005219821110021109101000010100001102000000152666510000016822366301666712006401000521622129934106102000010130043130053130049130043130043
2002413009397400001668500128130027000114854200101020000102000050100017149126968013005113004223127528200107202000020300001300481982111002110910100001010000010200000015266361000001669936670166711212064010001216221299280062000010130053130053130053130049130053
2002413010197400001667610107130027020114855200101020000102000050100003149126962013005113005223127532200100202000020300001300521988111002110910100001010000010200000121526679100000169023663916635121206409999216221299381010102000010130053130053130053130053130043
2002413004597400001664900951300320221148522001010200001020000501000031491269730130051130052231275332001002020000203000013004219821110021109101000010100000102000001215266711000001669436668166410120640100312162212993410602000010130050130049130056130053130043
20024130048974000016695101011300330221148612001010200001020000501000031491269720130051130052231275282001002020000203000013004819881110021109101000010100000102000001202666910000116884366431664412006401001321622129928106102000010130053130053130049130043130043
200241301429740000166530012613003800211485520010102000010200005010000314912697201300511300522312753220010020200002030000130052198811100211091010000101000001020000012152663710000017043366451664512120640100332162213000410602000010130049130043130049130053130043
20024130115974001116646001271300270221148612001010200001020000501000031491269720130051130052231275322001002020000203000013005219881110021109101000010100000102000001215266471000001698436676166800006409999216221299286002000010130053130043130053130043130049
200241301569740000166790010713002700011485520010102000010200005010000314912697201300411300522312753220010020200002030000130052198811100211091010000101000001020000012152664510000016896366301663501206401000721622129938100102000010130053130053130053130053130053
20024130047974000016647101281300370041148652001010200001020000501000031491269620130051130042231275282001002020000203000013004219881110021109101000010100000102000000152664510000016816366461664512120640100522162212993866102000010130043130053130053130053130049
2002413007197400011667600127130037022114861200101020000102000050100003149126972013005213005623127528200100202000020300001300481988111002110910100001010000110200000121526680100000169633664416645121206401001121622129938100102000010130053130053130053130053130053