Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPL (32-bit)

Test 1: uops

Code:

  swpl w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1i tlb fill (04)mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)0e0f1e223a3f464951schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)5f696a6d6emap rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafbbl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)cfd0d1d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)dfe0eaebec? ldst retires (ed)f5f6f7f8fd
720053407225520001111020053173394601290862000200020001000064931034335763401231020002000300033843540311710011000100012000022100510000420044221490910657279603587166242033552444015554803302218728168771835220003420334194341183405834118
720043410425600001001005052703395500289482000200020001000044931009335823409731020002000300033820538811710011000100012000022100310000420044031497910595180163591057242463586443415524603307919259169551852520003411034253340753414134119
720043414725620000001005153643392600289662000200020001000084931063336043416331020002000300033791542811710011000100012000022100510000320023201497210563180183556047242623581443817574803305019107168261851420003408634081341233411634111
7200434121255000010010060544533943102900820002000200010001749309783364434156310200020003000338245422117100110001000120000221005100031220033021494910567179933573050242163585437717565303300419186168151850320003412934139341783402634186
720043412825502000001004052623396500290002000200020001000064930990335993415731020002000300033807540211710011000100012000022100310000320024021492510703179573615048241993547443911565303298819142168011857220003404334132341233403134079
720043401125500000011004052333397300289682000200020001000024930942336433408131020002000300033910539711710011000100012000000100410000520024221504810809180073552153241853523443713544503297019325167321834520003410134141341023409434050
720043410325520000001005052973396700291242000200020001000054930962336523415531020002000300033771544811710011000100012000022100310000420024021484810431179843522152241863618444612474603304618983167481817020003410734165340493408934085
720043416125500000001006053103395100289482000200020001000074931024336773405931020002000300033800541811710011000100012000023100210000420065301486210625379323564044241973588443510635403301019366171191841920003419934046340683416834167
720043401725500000001005053033395310289092000200020001000064930992336503415631020002000300033800540611710011000100012000022100610000220043221492710510179613529048242663511443721465603308219055169381841320003404634043340993401534140
720043417225500000001004153643396500289692000200020001000034931015336443415331020002000300033737538611710011000100012000022100110000320042321492110726180093563050241803571443411504503293419119169821827820003403534153340403408734144

Test 2: throughput

Code:

  swpl w0, w1, [x6]
  add x6, x6, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0055

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f181e1f2022293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafb6bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
3020770079525111100052353171184212070040184149584958525301001010020000101002000060381331982849669857002670055656863662803010010200200001020030000700557911202011009901001000010100100001100202361496147198681025210049233214381752028304411174780013101161169818100006620000101007005670056722197005670057
302047005552400000005246221710514870040230236444960125301001010020000101002000060380331983149669757001570055657033662643010010200200001020030000700557811202011009901001000010100100001100202130516154198681026910054283212601772026107211445460013101161169818100006620000101007006170061700567005670071
3020470055525000000051961178185114470040195251424958625301001010020000101002000060372331982849669757001570072656913662633010010200200001020030000700697811202011009901001000010100100001100202530666205198681025110032223231721112030004431235930013101161169834100006620000101007007470059700747007170056
302047007152500001005283119418443270040211136424960325301001010020000101002000060378332006949669757003870068656863662633010010200200001020030000700557811202011009901001000010100100001100202180473151198681025910058213207221222026007421035450013101161169834100006620000101007005670056700567005670056
302047007452500011105249223210531567004021124340495862530100101002000010100200006037833201174966976700157005565686366279301001020020000102003000070055781120201100990100100001010010000110020247069816019868102191004313322581192024705521274950013101161169818100006620000101007006170056700757006870060
302047005552400000005357117910488870053270245554959525301001010020000101002000060370331982849669897006470073657033662773010010200200001020030000700557911202011009901001000010100100001100201800627180198841029210058183156921422021307511054790013101161169821100006620000101007005670056722117005670056
302047005552500001005207222418468470040183233454958625301001010020000101002000060384331982949669767001570070656973662633010010200200001020030000700747811202011009901001000010100100001100202250631173198681028810042253259281432029005831074930013101161169818100006620000101007006770056700567005670056
3020470055525000000052641220104515670045196145504958625301001010020000101002000060375332050149669757001570055657023662813010010200200001020030000700587911202011009901001000010100100001100202620586153198741030110047193196241532026026711095430013101161169818100006620000101007006270056700567005670056
302047007152400000005243220511845414070040178244474958525301001010020000101002000060375331982949669927003770058656863662633010010200200001020030000700707811202011009901001000010100100001100202050536172198701027610051173257281272029106121026090013101161169818100006620000101007006270067700597005670064
302047007352400000005264220411604316070040222139494959825301001010020000101002122460378331982849669757001570066657013662633010010200200001020030000700557811202011009901001000010100100001100201720502198198681023210036193193261762019703921205580013101161169819100006620000101007005670056700567005670056

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0055

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f181e1f202223293a3e3f404346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9aaabacafb6bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
30027700865241012010524822271018459216700402444455349588253001010010200001001020000599943320837049669757003370055657083662863001010020200001002030000700707911200221090101000010010100001102026126451831988710285100591842322812720245067311181611012701151169837100006620000100107005670056700567005670056
300247005552511100005324223810052108700451925405049585253001010010200001001020000599903319828049669757002070055657253662863001010020200001002030000700557811200211090101000010010100001102027616491811986910273100443032383213920321176313751112012701151169818100006620000100107007070061716737005770056
30024700665241111000537832331012054212700402803455849586253001010010200001001020000599973319831049669767001670073657083662863001010020200001002030000700557911200211090101000010010100000102025136341581986810292100442531974817720305062312359010012701151169818100006620000100107005670056700617005670060
30024700555251101100529532081005222070035263561464958525300101001020000100102000059994331982904966976700157007565726366303300101002020000100203000070055781120021109010100001001010000010202451582217198681027210057234219801612025705138670311012701151169819100006620000100107005670074700567007770056
300247006552511011005321221310041212700402264354549585253001010010200001001020000600013320792049669757001970055657273662873001010020200001002030000700557811200211090101000010010100000102022616641591988010263100252532457814420335070212160711012701151169818100006620000100107005770056700567005670056
3002470066525101100052912218108461127004022973645495862530010100102000010010200006000033207421496697570034700556572836630330010100202000010020300007005579112002110901010000100101000001020268257915119868102561003626323410016120291072317368911012701151169818100006620000100107005670056700747005670051
300247005552510111005241221010053188700461617355149586253001010010200001001020000600033320694149707567001970055657213662913001010020200001002030000700557811200211090101000010010100000102023435411711986810299100352131953412720294075312157510012701151169818100006620000100107005670056700617006270056
300247005552510120005291219610055140700562297434149585253001010010200001001020000599943416571049669757001570055657083663063001010020200001002030000700557811200211090101000010010100000102028016011921986910234100521952472213820358084312957710012701151169818100006620000100107005670069700757005670056
300247005552511001005294325910200651327004120235945495872530010100102000010010200005998933198311496697570028700666570836628630010100202000010020300007005578112002110901010000100101000001020163255115219886102341004914319913017220247169311452911012701151169818100006620000100107006770056700767005670056
3002470055525100000052692208100571207004020484034496022530010100102000010010200005999733198310496697570015700706570836628630010100202000010020300007005578112002110901010000100101000001020253356115719885102971000422321710010620295059211586312012701151169818100006620000100107005670056700567005670056

Test 3: throughput

Code:

  swpl w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 18.0044

retire uop (01)cycle (02)03mmu table walk data (08)090e0f18191e1f22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafbbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd1d5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
20205180053134800000032151210018002901514158753252010010020000100200005008565636149176964018004418004417604161774982010020020008200300121800441436111020110099995451001000010010000010020000302912921232144100020221484216622138226301117161549101600178931106620000100180045180040180040180046180045
2020418004813480000003214601021800241015158806252010010020000100200005008565629149176964018004418003917604161774972010020020008200300121800441436111020110099995431001000010010000010020000302812921632139100004221484214122147226301117161550301600178934000620000100180045180040180045180045180045
20204180039134800000032146210218002921514158759252010010020000100200005008565361049176968018004418004417602261774892010020020000200300001800391431111020110099995421001000010010000010020000302912921532151100020221524213822136226301117221550722522178920206620000100180045180045180045180050180040
20204180044134900000032148210218002921515158757252010010020000100200005008565637049176964018004418004417602261774892010020020000200300001800481436111020110099995401001000010010000010020000302812921532139100002221424214522143226381117161549901600178931206620000100180045180040180045180045180045
202041800441348000100321622102180029114015876025201001002000010020000500856563504917696401800391800441760416177498201002002000820030012180044143621102011009999543100100001001000001002000030012921232142100020221474213822143226301117161550401600178931006620000100180058180045180046180049180045
20204180044134800000032150210218002910151587582520100100200001002000050085656391491769590180044180044176041617749820100200200082003001218004814361110201100999953910010000100100000100200003001292163214310000022145421552216720301117161549501600178931106020000100180045180045180045180045180045
2020418004413490000003213901021800291151415876025201001002000010020000500856563514917696401800441800481760366177493201002002000820030012180044143611102011009999540100100001001000001002000002912921532167100020221744214922152225381117161549001600178936206620000100180045180047180045180045180040
2020418004413490000003216121031800290161415875325201001002000010020000500856536104917696431800441800391760417177493201002002000820030012180039143611102011009999540100100001001000001002000030291292163214310002022145421482214120301117161549701600178931106620000100180040180045180040180045180045
2020418004413490000003217200031800242141415875425201001002000010020000500856564114917696401800441800441760416177498201002002000820030012180039143611102011009999540100100001001000001002000030281292153213910002022151421412214620301117161550001600178931106620000100180040180045180045180045180040
202041800441349000000321482001180029114151587592520100102200001002000050085656350491769640180039180044176041617749820100200200082003001218004414361110201100999954510010000100100000100201053029129213321411000012221464214122139203011171715496016001789401010620000100180045180045180040180045180045

1000 unrolls and 10 iterations

Result (median cycles for code): 18.0052

retire uop (01)cycle (02)03l1d tlb fill (05)l2 tlb miss data (0b)0e0f1e1f22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss st (a2)l1d cache miss ld (a3)a4ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acafbbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)branch mispred nonspec (cb)cfd1d5map dispatch bubble (d6)ddfetch restart (de)e0e7eaeb? ldst retires (ed)? int retires (ef)f5f6f7f8fd
200251800571348111132181160011800240013158754252001010200001020000508565997149176972180048180053176064317753220163202000020300001800391458111002110999530101000010100000102000004638129212321431000002214442137221442404606401549531622178927014102000010180040180049180049180053180053
2002418005213480000321500001180037213141587652520010102000010200005085653611491769721800521800531760643177532200102020000203000018005314581110021109995401010000101000001020000000129221321531000202215542143221472000640154972162217896610102000010180053180049180053180053180050
2002418005213480000321712101180024315151587542520010102000010200005085659971491769681800521800391760643177532200102020000203000018005214581110021109995401010000101000001020000046012922132163100000221614214422149040380640154952162217897710102000010180040180053180053180053180053
20024180052134800003217620011800332131515875325200101020000102000050856599714917696818003918003917606031775322001020200002030000180061145811100211099954310100001010000010200000383912922032143100001221414217022146240006611550121622180612110102000010180053180053180053180053180040
200241800521349000032147000018002431501587532520010102000010200005085658071491769721800521800391760643177532200102020000203000018005214581110021109995361010000101000001020000038012921232136100020221504217022145239480640155182162217891500102000010180040180054180040180049180053
20024180048134900003214800011800370130158764252001010200001020000508565996149176968180052180052176051317752820010202000020300001800521458111002110999540101000010100000102000003838129212321611000002216442143221542404606401549321622178981014102000010180053180053180040180053180040
2002418005213480000321370101180024213015876525200101020000102000050856599314917697218005218005217605131775282001020200002030000180052145811100211099953910100001010000010200000460129221321491000002215042142221430404616401550621622178956414102000010180040180040180040180040180053
20024180052134900003213300001800373131315876425200101020000102000050856580614917695918004818005317605131775322001020200002030000180052144911100211099954010100001010000010200000463912922132138100020221494215822141239460640155442162217893010102000010180090180067180053180053180053
200241800481349001032139210018003310141587682520010102000010200005085658111491769731800501800521760643177532200102020000203000018004814581110021109995401010000101000001020000046381292213213610002122137421642211723446064015495216221789150002000010180053180049180040180049180053
2002418004813490000321510101180037213015876425200101020000102000050856549614917695918003918003917606031775322001020200002030000180052145811100211099954010100001010000010200000383812921932163100020221604213922141204606401548921622178915414102000010180049180053180049180053180053