Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SWPLB

Test 1: uops

Code:

  swplb w0, w1, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 3a | 3f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d0 | d1 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
7200534122256111200100405309338972896520002000200010000100493108533577341533102000200030003385853731171001100010000200022100410000220032221505610744180033535125824272361744411139363300818745168371848420003406934102340433400734023
720043406125512121110050524034016289172000200020001000010049310123363134045310200020003000337615347117100110001000020002310041000422002222150141055717964353274124312354844381441393300419092169231857220003416534069340233401034096
7200434137255151800100505354339422902620002000200010000400493100533669340993102000200030003382353721171001100010000200022100210005219200322014939106210795135945362423935494436840453300719181168481841820003402934085340873405434118
720043408825515160010040534633965290912000200020001000010049310493365834103310200020003000338295393117100110001000020002210021000118200542215034107111797435654482418235554439637343296519160167901832520003403834145340303403134205
72004341022551511001004053063400529020200020022000100004164930984336273411931020002000300033758534911710011000100002000231002100004200322214941106031796035678402420136324443839363299418936168441839120003408234101341393415334168
720043404025513140010050536733992289212000200020001000110049310073359934019310200020003000337955391117100110001000020002310041000162002222148751054917920355993524210365744461437353313318985168401827420003400634122340543403834034
7200434023255131900100505337339422892820002000200010000310493095833626340583102000200030003386453591171001100010000200022100210006811200422214938106721794035315392413135754439656393299719409169751829720003400034016340873402934191
72004340272551413001004153413396628879200020002000100019004930962336573410431020002000300033861539511710011000100002000221003100002200242214915107061796635806332419535794435738403306419123167131839020003406334083340613416134079
720043408325517140010050538133922290322000200020001000110049310863367534060310200020003000338155382117100110001000020002210041000132004222148811054727970353763824174363244301040373296118991167831824820003409134033341163402534308
720043412025515140010050530233973289592000200020001000010049310353356434017310200020003000338115396117100110001000020002210041000462002222152401064318042352333924106354544341138423299618967169331811220003409534115340833408334097

Test 2: throughput

Code:

  swplb w0, w1, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0068

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 20 | 22 | 23 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302067006852511100050488201016287005335116104959825301001010020000101002000060314332045514966988700327006865699366276301001020020000102003000070056801120201100990100100001010010000010020047151025319868100601001510440652006613584514613102131231623698311000013020000101007006970069700697006970057
302047006852511100050681037101512700533511412495972530100101002000010100200006031433204421496698870032700686569936627630100102002000010200300007006878112020110099010010000101001000001002004914110331987210081100241061403820050139956218131001312216336983110000131320000101007006970069700697006970069
302047006852510010050809310013070053351111749597253010010100200001010020000603333320442149669887003270068656993662763010010200200001020030000700567811202011009901001000010100100000100200671617573198721009210010313736542010113994210213200131231632698311000013020000101007006970069700697006970069
30204700685251110005058736001607005324214749597253010010100200001010020000603063320446149669767002870068656873662763010010200200001020030000700688011202011009901001000010100100000100200331483551987210082100212030245020052136943101131001312316336981910000131320000101007006970069700697006970069
30204700565251010005058929101107005323120164958725301001010020000101002000060278332045114966988700287006865700366276301001020020000102893000070068791120201100990100100001010010000010020047153226919872100671000830383057200841361151139142001312216336983110000131320000101007006970069700697006970069
302047006852511100050778360016407005330475495972530100101002000010100200006029733204551496698870032700686569936627630100102002000010200300007006877112020110099010010000101001000001002001415212691986810057100082000642003514184183130001312216326981910000131320000101007006970069700697006970069
302047006852510000050669491062870053161613495972530100101002000010100200006030533204581496698870032700686569936627630100102002000010200300007006877112020110099010010000101001000001002006614147100198721009610021024406020087141865175142001312216326983110000131320000101007006970069700697006970069
3020470068525110000506293610100700532021210495982530100101002000010100200006033033204571496698870028700686569936627630100102002000010200300007006879112020110099010010000101001000011002004513148471986810068100174262286020082141943120140001312316326983110000131320000101007005772235706037006970069
3020470056533112000504971910130700533421217495974630100101862000010100201536030333684751496698870032700686569936627630100102002000010200300007005678112020110099010010000101001000001002002915102551987210067100102130243820077033835173141001312316336983110000131320000101007006970069700577006970069
302047005652411100050551023101607004119312214959725301001010020000101002000060327332045714966988700287006865699366276301001020020000102003000070068781120201100990100100001010010000010020036271665819872100681001810360302004911584918113100131231642708641000013020000101007061670604700697006970069

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0055

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 20 | 22 | 23 | 24 | 3a | 3e | 3f | 40 | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 79 | 7b | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | aa | ab | ac | af | b6 | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30026705985250000005053128000180700603317105003325300101001020000100102000061700331983114966975700157005565708366286300100010103200001010730000700557811200211090101000010010100001020026087411986810040100012124245689200340161279012702152169818100006620000100107005172224700597005670051
300247005052410000052651341001707005351111114958525300101001020000100102045959993331982914966975700157005565708366286300100010020200001002030000700557811200211090101000010010100001020040147230198721004610008002422222004401213014412701151170349100006020000100107005670056700567005170056
30024700555250000005029127100160700506911010495852530010100102000010010200006000333195921496697570015700556570836628630010001002020000100203000070055781120021109010100001001010000102002012141481986810048100012120039200570150308712701151169818100006620000100107005670056700567005170056
300247005552500000050450301001307006251113105004425300101001020000100102000060003331982914966970700107005565708366286300100010020200001002030000700557811200211090101000010010100001020039014439198681005910001113436312006603115021412911151169818100006620000100107005670051700517005670056
3002470055525010000505613210064700403411010495822530010100102000010010200006001733198291496697570010700556570836628630010001002020000100203000070055781120021109010100001001010000102002501232819868100581000121302423200460152295112701151169818100009020000100107005170056700567005670051
3002470050525000110504104610017367004042010104958525300101001020000100102000060000331983114966975700147005565708366281300100010020200001002030000700557811200211090101000010010100001020060141154419872100701000811413242200470913210812701151169818100006620000100107005170056700567005670056
3002470055525000000505210928100130700404801194958525300101001020000100102000060017331982914966975700157005565708106628630010001002020000100203000070055781120021109010100001001010000102001901413119868100551000110242422200270131306912701151169818100006020000100107005670056700517005670056
300247005552400000050371341001207004959112114958525300101001020000100102000060001331983014966970705617005065708366281300100010020200001002030000700557811200211090101000010010100001020041131154119872100571001330440512005301313217712701151169813100006620000100107005670056700567005670056
300247005552500000050491351001907004043111124958225300101001020000100102000060017331983114966975700157005565708366286300100010020200001002030000700557811200211090101000010010100001020024012539198681005310000113836162005202022712512701151169813100009620000100107005670056700567005670056
3002470055525000000504312610016287004049191749582253001010010200001001020000599973319831149669757001570055657083662863001000100202000010020300007005578112002110901010000100101000010200240874119868100451000121251821200480171186912701151169818100009620000100107005670056700567005670056

Test 3: throughput

Code:

  swplb w0, w1, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 18.0048

retire uop (01) | cycle (02) | 03 | 0e | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20205180048134910321642100118003310151587652520100100200001002000050085653611491769680180053180048176049717750120100200200082003001218004814401110201100999954310010000100100001100200003838129219321421000202215042164221410404611171615495016001789690001020000100180053180049180049180053180053
2020418004813490032146210011800720150158764252010010020000100200005008565361149176972018005218004817604961775022010020020008200300121800521431111020110099995401001000010010000010020000463912922132143100020221724214522152203811171715529016001795791001420000100180053180053180053180040180049
202041800481349003219920000180024314131587542520100100200001002000050085653621491769680180052180052176036617750120100200200082003001218003914311110201100999954610010000100100000100200000381292193213910002222164421362213920011171615501016001789390001020000100180053180040180040180053180049
202041800521349003215300001180063015151587532520100100200001002000050085659971491769680180052180048176036617750520100200200082003001218003914401110201100999953810010000100100000100200000391292213214910002022152421382213624046111716155080160017893901101220000100180053180040180040180040180053
2020418003913480032138200001800242001587632520205100200001002000050085659950491769720180039180039176049617749320100200200082003001218003914401110201100999953310010000100100000100200004638129221321411000202213942166221440046111716154980160017893840101420000100180049180049180053180049180050
20204180052134900321350100118007600141587682520100100200001002000050085660020491769590180052180039176049717750620100200200082003001218005214401110201100999953910010000100100000100200004638129219321481000202215642138221492400111716155040160017893901141420000100180053180053180053180056180053
20204180052134900322130100018003730015876425201001002000010020000500856535204917696801800701800481760496177506201002002000820030012180039144011102021009999540100100001001000001002000003812921232165100020221684213522152204611171615524016001789354014020000100180053180049180053180040180053
2020418004813480032176210001800372140158763252010010020000100200005008565811149176972018005218005217603661775052010020020008200300121800521440111020110099995471001000010010000010020000383812922132147100020221444213622141204611171615498016001789270014020000100180053180053180053180053180053
202041800521348003214220000180033201615876425201001002000010020000500856581714917696801800391800521760366177898201002002000820030012180048144011102011009999543100100001001000001002000046012921032161100024422147421442216804046111732154951160017894000101020000100180040180040180040180040180053
202041800481349103215500000180024301415876125201001002000010020000500856599814917696801800481800391760496177502201002002000820030012180052144011102011009999543100100001001000001002000003812922132141100000223764214322152000111716154870160017892740101420000100180053180040180049180049180040

1000 unrolls and 10 iterations

Result (median cycles for code): 18.0048

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | l1d cache miss ld (a3) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bb | bc | l1d cache miss ld nonspec (bf) | l1d cache miss st nonspec (c0) | cf | d1 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20025180053134800321762100180029015141587622520010102000010200005085658071491769590180048180039176060317757320010202000020300001800581454111002110999540101000010100001102010528028129212321391000200221494216422144235386401549221622178915001062000010180049180040180049180049180049
200241800481349003215501001800290001587632520010102000010200005085656411491769640180039180048176056317756820010202000020300001800481454111002110999861101000010100001102000000281292193215110002002216042147221352340640155212162217931710662000010180049180040180049180049180049
2002418004813490032168210018002921515158753252001010200001020000508565451149176968018003918003917606031775692001020200002030000180048145411100211099953910100001010000010200000382912922032140100020022144421662214403406401549521622180183401262000010180045180045180045180049180049
20024180048134900321432100180033201415876425200101020000102000050856580214917696801800481800481760603177568200102020000203000018004814491110021109995421010000101000001020000030012922032141100020022147421622214322630640154942162217892040002000010180045180045180045180049180049
20024180048134800321542101180024115151587602520010102000010200005085658071491769680180039180048176060317759420010202000020302701800391454111002110999536101000010100000102000003029129216321411000000221474214922157234386401550121622178915001062000010180040180040180049180049180049
20024180048134911321722101180033214151587542520010102000010200005085658111491769680180044180048176060317755220010202000020300001800481449111002110999546101000010100000102000000291292123213610002002213442162221382006401550122522178924301062000010180040180050180104180045180049
2002418004813480032140210118003311414158763252001010200001020000508565641149176959018004818004817606031775352001020200002030000180039145411100211099954610100001010000010200000302912921532138100000022155421452213920386401550521624178947001062000010180049180049180049180049180049
200241800391349003214721001800330151615876225200101020000102000050856536114917696801800481800481760603177579200102020000203000018004814541110021109995431010000101000001020000003812921232136100020022146421562214400386401550121622179356101062000010180045180045180049180049180049
200241800481348003214721011800332001587542520010102000010200005085658101491769680180079180048176060317752820010202000020300001800781454111002110999539101000010100000102000003828129220321671000200221534214722141026386401550421622178915101062000010180049180049180049180049180049
200241800711348013219121011800330014158764252001010200001020000508565815149176968018004818004817606031775192001020200002030000180048145411100211099953610100001010000010200000029129219321571000200221574214622147034386401552922522178924401002000010180040180040180049180049180040