Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPB

Test 1: uops

Code:

  swpb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 61 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
7200533166250130300010020060353269300276842000200020001000010049297993290232715310200020003000326975240117100110001000120002210011000022001122163731190018378395905922941403544531241453237617826151541592520003307433170331123258532607
720043290524502000001003106102327380027912200020002000100005049295183262732909310200020003000330985258117100110001000020002210011000022001122164431224108479379704222959405344571342493239816250152581707220003269232720332513264833164
7200432838246000100010030061173302500275422000200020001000010049298093256232920310200020003000330845273117100110001000020002210011000022001122163231208218253377204222987407544421844413229416079151821681520003310832652327723271532615
720043270924601020111003006123326170027669200020002000100002049295523262332750310200020003000326825290117100110001000020002210011000022001122162031202308523392204423586369544521144383230916473142461575520003279332612331363312832713
7200433028244000001110030057883271800277132000200020001000010049297373308032922310200020003000327195303117100110001000020002210061002122001122156311140218487405904922873391644441647453236817723151721678020003308632820332193278532644
720043286624400000011003105632327670028064200020002000100007049296133259832734310200020003000327495232117100110001000020002210011000052001122163651223708472378304823280397644451643513230216791145201560520003281732935326933318432569
7200432717245100101110030057813263300275402000200020001000010049296153267433199310200020003000330985285117100110001000020002210011000022001122165911215018287410604622914382544442550413247217535143431580820003281632576327103269432710
720043282524900010011004006120325720027595200020002000100002049295423302032758310200020003000327565227117100110001000020002210011000022001122155451136218322389824823393383544451743433244317563149501632220003292932980329983315732502
720043275824600000001004005968328150027472200020002000100005049296403266032651310200020003000329935240117100110001000020002210011000022001122163621147108512408304622836373744432340373227116287145411573420003283332750327083269132634
720043269624500010011003006028329680027738200020002000100002049300693296032822310200020003000326455245117100110001000020002210011000022001122157511104918401409614622852406944481243503248216598147071701420003275232682327583278132711

Test 2: throughput

Code:

  swpb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0062

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302063006222500013393329119203003724158111352625301001010120000101002000060528141135310492698530059300622602332626030334102002000010200300003006276112020110099100100001010010000010020047012452100531000331443043200701314441641310116113006210000131020000101003006630063300663006330053
3020430062225000033802191210300502733518135222530100101022000010100200006052814112691049269853006530062260293262703010010200200001020030000300647611202011009910010000101001000001002006401916410049100140137264320073418137981310116113005910000101320000101003006330063300633006330066
302043006222500003375424112030050285312013549253010010100200001010020000605501411242104926982300593006526027326273301001020020000102003000030065751120201100991001000010100100001100200350227601005110009414405120069228454145131011611300621000001020000101003006330063300633006330066
30204300652250000338522301803004723223231351125301011010120000101002000060558141112610492698230062300622601632627330100102002000010200300003005275112020110099100100001010010000010020032028778100661000411402051200832426741471310116113006210000131020000101003006330066300663006330063
302043006222500003402332118030050254243613524253010010100200001010020000604951411237114926985300593006226012326260301001020020000102003000030062751120201100991001000010100100000100200281215059100321000731253053200580305521561310116113006210000131020000101003006330068300633006630063
30204300652250200340643511303005041939371349425301001010020000101002000060495141134010492698530059300622602432627030100102002000010200300003006576112020110099100100001010010000010020041010540100771000210282840200701244511911310116113005910000131020000101003006330063300633006630066
302043006522500003385454114123005020740321351325301001010020000101002000060546141119610492698230059300652602332627330100102002000010200300003006275112020110099100100001010010000010020042019783100551000311443035200733274471581310116113006210000101320000101003006330063300663006330066
3020430062225000033963331903004723323171351925301011010020000101002000060663141114510492698230062300622602632627030100102002000010200300003006576112020110099100100001010010000010020027012350100571000731330552009123226524113101161130059100000020000101003006330066300633006330053
30204300652250000339932011003005008273313527253010110100200001010020000605581411258104926982300593006226027326270301001020020000102003000030062751120201100991001000010100100000100200410247991008810008103605520058223454128131011611300591000001020000101003006330066300633006330066
30204300622250010340332311703003729418151353725301001010020000101002000060550141124710492698530052300652602432627030100102002000010200300003006275112020110099100100001010010000010020051018272100651000201413829200721311381291310116113005910000101020000101003006630063300633006330066

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0065

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30026300712250101100003414425101803005622830191355025300101001020000100102000060019141166600492698230062300622604632630130010100202000010020300003006575112002110901010000100101000001020069162128201005110001012518432005116394213313001270116113006810000131020000100103006330066300633006330063
300243006522500000100034152301011030050249131213533253001010010200001001020000602541411155104926982300653005226047326292300101002020000100203000030062751120021109010100001001010000010200381533366010063100090021129420063144106616414101270116113006810000131320000100103007230072300723007230072
30024300712250111000003409220101503003722817141356725300101001020000100102000060132141126710492698530065300622604432629630010100202000010020300003006575112002110901010000100101000001020035141438501010010018022022622006815593418214001270116113006810000131320000100103007230072300723007230072
300243007122601111000033962231015030047175262713530253001010010200001001020000600971411257004926982300623006526046326295300101002020000100203000030065751120021109010100001001010000010200210128360100591000121182839200694411401100001270116113005910000131020000100103006330063300633006330053
3002430062225000000000340812310240300472181918135242530010100102000010010200006012914112050049269853006230062260483262953001010020200001002030000300627611200211090101000010010100000102003612263710100601000811441851200621392492160001270116113006210000101020000100103007230072300723007230072
3002430071225011110000341383610512300502315161013537253001010010200001001020000601411411287004926982300653006526045326295300101002020000100203000030065751120021109010100001001010000010200310201690100671000451401224200632262451510001270116113006210000131020000100103006530063300663007230072
3002430071225011000000340493110928300563272516135092530010100102000010010200006018614110051049269783007130071260543263013001010020200001002030000300627511200211090101000010010100000102006415205760100711001130193217262008213985513114101270116123008910000131320000100103007230072300723007230072
3002430071225011011100341492610210300562910251913522253001010010200001001020000600941411284004926982300623006526045326295300101002020000100203000030052751120021109010100001001010000110200450183680100801000321302854200871301612550001270115113005910000101020000100103007230072300633006330063
300243006222500000000033721251014030050279191113531253001010010200001001020000601291410727104926985300623006526047326292300101002020000100203000030062741120021109010100001001010000010200451412961010080100080028225020069135349980001270115113005910000101020000100103006330063300633006630063
300243006222500000100034224251014030050241414171351925300101001020000100102000060097141129710492698530065300622604532629330010100202000010020300003006275112002110901010000100101000001020038010356010056100041134244720098139363277000127011521300621000001020000100103007230072300593007230072

Test 3: throughput

Code:

  swpb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0148

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20205301802261100003401141001430185818151355725201001002000010020000500141004114926972303433005227548727810201002002000820030012303565561110201100998310010000100100000100200000039214100501000210572005172000011171600160030332014100200001003034730344303343022730053
20204301562250000003383200026303390141413585252010010020000100200005001423167149272523026630332275516278002010020020008200300123005228111102011009914310010000100100001100200002642001007210002005120046122403800111716410160030153014140200001003034630257301473005330243
20204303432250000013380000003031531414136652520100100200001002000050014165790492702130223301822742962770120100200200082003001230143484111020110099551001000010010000010020026284054110100771001602105200445516314225211171654016003019710140200001003005330053300873034730207
2020430343225000000337820002930241213181364625201001002000010020000500141751004926979301803028927380627595201002002000820030012300592871110201100994510010000100100000100200272742001007310015028020048691629026011171653016003008811301200001003015230060302013026830211
202043021022610100034221600063019501817137532520100100200001002000050014107911492713030200302002727962751320100200200082003001230180519111020110099561001000010010000110020026260011010077100160295200617414290252111716570160030177113130200001003026130181301833021130181
20204300592261000003422160001230165700136482520100100200001002000050014108601492723730121300592744262767020100200200082003001230346472111020110099410010000100100000100200000038232100461000200572005250234000111716300160030049010100200001003020230060300603018130060
202043005922510000034191600003022300171361125201001002000010020000500141380704927120302513020027418627634201002002000820030012301982861110201100999810010000100100001100200262542538610074100140168200627716304126111171630016003020741300200001003022330103300533005330053
20204300522260000003383000083029621401384425201001002000010020000500142363514927266300863034627269627650201002002000820030012303462821110201100991281001000010010000010020000046022210050100000010201644820380011171630016003004901400200001003032130211302013006030211
20204300592271010003418160001302450017136652520100100200001002000050014134231492710030290300592727762763420100200200082003001230210439111020110099501001000010010000110020026254154481006910014005720059551429026111171657016003020740130200001003021130261302013020130239
202043018022610011034181600029300372130136402520100100200001002000050014137351492726430333301262756062780720100200200082003001230086560111020110099110100100001001000011002000004638010050100020056200514820380011171635016003035301000200001003022330053300533032330053

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0245

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002530157226110000341016109301328181813604252001010200001020000501415479492706730147301472738032763720010202000020300003014729311100211094010100001010000010200262734072100641001801942004465142834261640412162330144099020000103006030098301583014830108
200243017522611110034041600130044901813539252001010200001020000501417425492710730146301472738332753920010202000020300003014737811100211093310100001010000010200262700010062100180197200566116034262640454164230144099120000103015830158301573006030148
200243014722610010034111400113004491818135042520010102000010200005014131604927071301533014727381327539200102020000203000030147388111002110910101000010100000102002827340010066100160190200525516280260640284164430144099020000103014830197300603015830187
20024301512261100003404161013014291818136032520010102000010200005014150234926979301473009727291327676200102020000203000030147430111002110915101000010100000102013026344259100661001600942005662162734261640414163430056090020000103015830155301483014830157
2002430059225111000337121003006321515137452520010102000010200005014100844927165301583024727284327532200102020000203000030246479111002110987101000010100000102000003028146100311000200122003372263000640234164330245006020000103005330053301883005330249
200243005222500000033480001930037101513512252001010200001020000501410146492697230148302482748232772920162202000020300003012847711100211098710100001010000010200000302914710011100020043200453222500064004164430245166020000103024730249302493017930249
200243005222600000133722101930233001513821252001010200001020000501410092492716830052302482748132772820010202000020300003005247911100211098210100001010000010200000302901001110002004020033312253000640324164430134066020000103024930209301293024930219
20024300522260000003372200530233115151375825200101020000102000050141955449269723005230052274113276382001020200002030000302484171110021109841010000101000001020000002801003410002005320036332253000640234164430225106020000103024930053300533013830053
200243005222500001033692000302332151413751252001010200001020000501419676492697230108300522728532767820010202000020300003024545911100211098210100001010000010200000382891100311000200392001233203000640163163430049160020000103021930129302493024930138
200243024422500001133732001930124215151374625200101020000102000050141963049272523020330052272843278012001020200002030000302474791110021109010100001010000010200000300107100341000000392003534225300064004162330244160020000103012930249302493024930053