Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPH

Test 1: uops

Code:

  swph w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 61 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
72005333522482619111147010605932590112756820002000200010000004929753327773283831020002000300033213530611710011000100002000022100110000520011220162411123708492366756623004389544561953553239116519145321595520003283532999332283267432745
720043298324718181010050106036327861127842200020002000100000049297233280833006312200020003000328295250117100110001000020000221001100002200112201618312027084594049105023453391744421856483242016318142791559920003270732774327843296432804
720043271224516200010070106087332011127679200020002000100002049296553278132758310200020003000331305277117100110001000020000221001100002200112201622311986183473991125023101399344422552583269317048142311598020003280733022327143282633250
720043268924520200010290106011327351127785200020002000100007049298773288432966313200020003000326375240117100110001000020000221001100012200112201548512073184314061125222973395844421651523259016748143981607620003276032670332233322132690
720043277024521250010500105973326991127651200020002000100000049297643290433228310200020003000327345289117100110001000020000221001100005200112201648811289084014000155423360405544441742533254216326143091564820003295232863329093288432845
720043270724524180010020005952327861127684200020002000100002049297413279032970310200020003000331035220117100110001000020000221001100005200112201626812091084693889135323167402544481848483238016858144361708620003322732542326593260033243
720043324024923210010020105798326761127766200020002000100000049302853282233019310200020003000327405256117100110001000020000221001100002200112201555911842081054016105122779370044371854523238116461144571706820003330133202327193279532963
72004328242462222111525010609733058112811720002000200010000904929688327393301331020002000300032568528311710011000100002000022100110000220011221163991226708473396695023329399144451856553239017907153881610320003280332843328443326733193
720043274424818230010070106080326871127517200020002000100018749298713286233033310200020003000326895337117100110001000020000221001100002200112201635811820084483980124622876399644441154473237617552151871680120003268933249327783272032723
720043287524522220010020006048328121127658200020002000100002049296993276032741313200020003000326835223117100110001000020000221001100002200112211545312163184683805104922764401044511645473245016298142071577120003270332738326103271533223

Test 2: throughput

Code:

  swph w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0067

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020630077225202000343818321014830052810191513564253010010100200001010020000604621411351004926987030067300712602832627530100102002000010200300003006776112020110099100100001010010000010020032141327310072100131134072200677471054152140013102162230055100009920000101003006830059300683006830068
3020430067225222200342514351010030052301211913541253010010101200001010020000604651411373004926987030067300672602732627530100102002000010200300003006777112020110099100100001010010000010020043187752100961002734263280200551451445155160013102162230064100009920000101003006830068300683006830068
3020430067225222000347417350062430052168101613533253010010100200001010020000604621411369004926987030067300692602732627530100102002000010200300003006777112020110099100100001010010000010020035272817610078100092054108120052143938157141013102162230064100009920000101003006830059300683006830059
302043006722610110037489261050300521861120135542530100101002000010100200006045814110421149269780300643006726029326275301001020020000102003000030067761120201100991001000010100100000100200481476531006410021002006720060145134798140013102162230064100009920000101003005930068300683006830068
3020430058225101000347173610160300522313141613524253010010100200001010020000604421411419004926987030067300672602832627530100102002000010200300003006776112020110099100100001010010000010020040159866100901002001312862200662431056130140013102162230064100009920000101003006830068300683006830068
302043006722511011033771026001024301171613231913549253010010100200001010020000604521411406004926987030064300682602832626630100102002000010200300003005876112020110099100100001010010000010020045151735210057100130128083200511451180244130013102162230055100009920000101003006830068300683006830068
302043006722511100034811140101403005217711161355825301001010120000101002000060472141194910492698703006730067260283262663010010200200001020030000300677611202011009910010000101001000001002004915774810073100233032246920062045958205140013102162230055100009920000101003006830068300683006830068
302043006722510110034188271019030052259151213535253010010101200001010020000605811411309004926978030058300672602732627530100102002000010200300003005876112020110099100100001010010000010020041169938100741001020192458200580451046120130013102162230064100009920000101003006830059300683006830068
3020430067225100001365813280013030052368182713521253010310100200001010020000604511411392104926987030064300692602932627530100102002000010200300003006776112020110099100100001010010000010020049293038010082100132020105220058148125575160013102162230064100009920000101003006830068300683006830068
30204300672262022103470152010160300523612101813513253010010100200001010020000604681411382104926987330064300582603132627530100102002000010200300003006776112020110099100100001010010000010020044161025610072100170029069200773371241158172013102162230064100009920000101003006830068300683006830068

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0058

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 20 | 22 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30026300582250001003747427115030046122021241352325300101001020000100102000060246141091700492698130058300622604132628830010100202000010020300003006175112002110901010000100101000001020000032356010079100020131657200851412491550012702162230049100006620000100103005930059300623005930059
300243005822500000036032001403004627171812135502530010100102000010010200006014914109840049269813005530060260463262883001010020200001002030000300587411200211090101000010010100000102002403011110100881000401301854200672355401120012702162230049100006620000100103005930059300593005930059
30024300582250000003592234113203004320261220135192530010100102000010010200006024714111160049269723005830064260433262913001010020200001002030000300587511200211090101000010010100000102002501194301006610004100183620061158240990012702162230055100000920000100103005930053300623005930059
3002430052225000000346252711903004624171110135462530010100102000010010200006014614106500049269813005530058260413262923001010020200001002030000300617511200211090101000010010100000102003309738010074100040152226220083260246670012702162230058100006920000100103005330062300623005930059
3002430058225001000345150114030046252211141354525300101001020000100102000060155141090200492698130058300702604132628930010100202000010020300003005875112002110901010000100101000001020019097340100831000331323073200722517593240012702162230055100020920000100103005930062300593005930059
3002430058225000000341933512030037231828171348625300101001020000100102000060174141096400492697830058300752604432629230010100202000010020300003005874112002110901010000100101000001020025010334010087100060119666200632394412460012703162230058100009620000100103006230062300593006230062
300243005822500000037875017243004602936161353025300101001020000100102000060177141102900492698130058300582604132628830010100202000010020300003005275112002110901010000100101000001020026069360100771000730006620057148137970012702162230058100006620000100103005930053300593005930062
30024300612250000003476300120300433024131413525253001210010200001001020000601821410990004926978300593005826035326289300101002020000100203000030058751120021109010100001001010000010200380129590100871000221321860200592353301290012702152230055100009020000100103005930062300593005930053
30024300522250000003427526150300372617111613525253001010010200001001020000601781411013004926978300553006226054326282300101002020000100203000030058751120021109010100001001010000010200260117460100861000410253671200820472521800012702162230058100009620000100103006230059300593005930059
3002430058225000100343524812228300432526162613545253001010010200001001020000601811410937104926978300583005826041326291300101002020000100203000030058751120021109010100001001010000010200340206550100751000311311850200761462221770012702163230055100006620000100103006230062300593006230059

Test 3: throughput

Code:

  swph w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0197

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 79 | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20205302422271000338421005303042014138162520100100200001002000050014101571492710230052300522752962778720100002002000020030000303324191110201100994910010000100100001100200000021410043100020522004872340111722272252230327001010200001003014330321303303031830053
20204303392260000334621002730317014151349925201001002000010020000500141011414927262300533033227249627777201000020020000200300003021355011102011009913310010000100100000100200004638120100431000205020012472350111716270160030239101010200001003025330163303323032030223
20204300522271000340320000300372131413853252010010020000100200005001422981149272623023330052275386277862010000200200082003001230210560111020110099127100100001001000001002000003821510046100020512004612235011171630016003004910010200001003005330053300533005330331
202043005222500003342200010303172140138182520100100200001002000050014230050492697230053303322727362778520100002002000820030012300525701110201100990100100001001000001002000038391751001210002114200091123438111716320160030198101010200001003020230053303203034030187
20204303432270000334201000300372001351625201001002000010020000500141008404927229303223005227270627676201000020020008200300123024255011102011009959100100001001000001002000038014410012100000522004772353811172229225223007900100200001003034730053301933009730053
202043011222700003360000127303150131513562252010010020000100200005001416365149272493022230330275507275062010000200200082003001230052568111020110099125100100001001000001002000038381111004310002014200091103438111716310160030319001014200001003033330053303333005330333
2020430292226000033760000163030201614137682520100100200001002000050014101711492724230112303322747062774620100002002000820030012300722801110201100990100100001001000011002000038390100111000004920045410353811171700160030329101010200001003033330323300533005330333
20204300522250000334220102630037214013648252010010020000100200005001419228149270823032530262275486275362010000200200082003001230328281111020110099641001000010010000010020000038214100111000201420045432340111722322252230353301010200001003005330323300533023330053
20204302522250000338220001302672170138212520100100200001002000050014236351492712130318303012730062757620100002002000820030012303284991110201100997810010000100100000100200000011110048100020482001246235011171730016003029910010200001003005330053302533017330347
202043019222700003381200003003730161385525201001002000010020000500141213314926972301523005227520627793201000020020008200300123011256011102011009912710010000100100001100200003838214100431000205020049110343811171627016003016910010200001003005330222301273032030283

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0154

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002530147227000113411140010300449018136632520010102000010200005014155651492704730059301472738032766620010202000020300003005938911100211092710100001010000102002626340010063100140094200526216034260640333162230056099120000103014830147300603015530060
200243009722510111341214001430142918181366425200101020000102000050141557614926979300593015727390327627200102020000203000030147290111002110901010000101000010200282834420100621001802972005666142933250640352162230154099020000103014830187301483015830177
20024301482261011134071400930122918181360525200101020000102000050141503304927067301473014727383327627200102020000203000030156378111002110951010000101000010200272634051100491001600932002566162933251640412162230144099020000103014830060300603014830060
2002430157225100013409160093015281818136382520010102000010200005014141120492697930059300592729232763720010202000020300003014737811100211092910100001010000102002627344257100651001601892005265142834260640412162230056090020000103015830158301383014830158
2002430157226100103402161015301320180135412520010102000010200005014108310492708730059301472738132753920010202000020300003019438811100211090101000010100001020026253442010066100161093200526616270260640412162230144099020000103014830197301583006030205
20024301372251011133821400123008291801358325200101020000102000050141508004927067301473016727380327617200102020000203000030059385111002110927101000010100001020025260082100541001601922005254142833260640412162230144099020000103014830060301583006030177
2002430154226101113409160093014201818134902520010102000010200005014108170492711630137301572738032762720010202000020300003015737811100211090101000010100001020027283400100651001601942004861142834260640412162230144099020000103014830148300603014830148
20024301472251000134111610030139918013580252001010200001020000501415482049271073010730157272933276272001020200002030000302363781110021109010100001010000102002726344227100391001601972004980163034250640332162230154399020000103013830148301483017630060
200243015722610000341016000301320181813575252001010200001020000501416941049270373017530059273803275392001020200002030000300593891110021109251010000101000010200252534010010062100140095200255916034261640412162230104000020000103016830060302003006030148
2002430147226101103408160003013291818136132520010102000010200005014107590492697930154301572739732762620010202000020300003014739811100211093310100001010000102002727344245100411001701812005563162734261640322162230056099020000103015530158301583019830158