Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PACGA

Test 1: uops

Code:

  pacga x0, x0, x1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100470295700000025158242510001000100017833014939490702970296623368181000100020007029870111001100000000731851167891000100070307030703070307030
10047029570000006158242510001000100017833014939490702970296623368181000100020007029870111001100000000731851167891000100070307030703070307030
10047029560000006158242510001000100017833014939490702970296623768181000104220867068870111001100000000731851167891000100070307030703070307030
10047029560000006158242510001000100017833014939490702970296623368181000100020007029870111001100000000731851167891000100070307030703070307030
10047029570000006158242510001000100017833014939490702970296623368181000100020867029870111001100001000731851167891000100070307030703070307030
10047029560000006158242510001000100017833014939490702970296623368181000100020007029870111001100000000731851167891000100070307030703070307030
10047029560000120615824251000100010001783301493949070297029662336818100010002000702987011100110000143880411111091168261006100071487109710771097108
10047109580132144176133357976610091009107617852414940270710770686645768461056108322507107870311001100070017730731851167891000100070307030703070307030
10047029560000006158242510001000100017833014939490702970296623368181000100020007029870111001100000000731851167891000100070307030703070307030
10047029560000006158242510001000100017833014939490702970296623368181000100020007029870111001100000030731851167891000100070307030703070307030

Test 2: Latency 1->2

Code:

  pacga x0, x0, x1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029562010000888959828251010010106101011807949149669497002970029685497687311010110216202327002985611102011009910010100100000001301501117192280017075210000101007003070030700307003070030
102047002956200000120895982825101001010010101180783404966949700297002968549768731101011021620232700298561110201100991001010010000024000111719016206988110000101007003070030700307003070030
102047006756300000088615982425101001010010100180783004966949700297002968480368674101001020020200700298561110201100991001010010000000120000710179116979410000101007006970030700307003070030
1020470029562000000061598242510100101001010018078300496694970186700296848036867410100102002020070029856111020110099100101001000000000000710179116985910000101007003070030700307003070030
10204700295630001000895982447101051010010100180783004966949700297002968480368674101001020020200700298561110201100991001010010000030030000710179116979410000101007003070030700697003070030
1020470029562000000089598242510100101001010018078300496694970029700296848036867410100102002020070029856111020110099100101001000000090000710181116979410000101007003070030700307006970030
10204700295630000012061598242510100101001010018078300496694970029700296848036867410100102002020070029856111020110099100101001000000000000710179116979410000101007003070030700307003070030
10204700295620000000103598242510100101001010018078300496694970029700296848036867410100102002020070029856111020110099100101001000000030000710179116979410000101007003070030700307003070030
1020470029563000000061598242510100101001010018078300496694970029700296848036867410100102002020070029856111020110099100101001000000090000710079116979410000101007003070030700307003070030
10204700295630012000232598242510124101001010018078300496694970029700676848076867410130102002020070029856111020110099100101001000200060000710179116979410000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700295440000000615982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000000001640479446980110000100107003070030700307003070030
10025700295620000000615982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000000000640379446980110000100107003070030700307003070030
100247002956300000120615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000000000640479446980110000100107003070030700307003070030
10024700295430000000615982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000000000640379446980110000100107003070030700307003070030
100247002954300000007265982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000003000640479346980110000100107003070030700307003070030
10024700295430000000615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000003000640479346980110000100107003070030700307003070030
100247002954300000120615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000000000640479436980110000100107003070030700307003070030
10024700295430000000615982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000000000640479346980110000100107003070030700307003070030
100247002954300000005065982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000000000640379446980110000100107003070030700307003070030
100247002954300000001035982425100101001010010180738004966949070029700296850236869610010100202002070029870111002110910100101000000010640479436980110000100107003070030700307003070030

Test 3: Latency 1->3

Code:

  pacga x0, x1, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047002954307265982425101001010010100180783014966949070029700296848036867410100102002020070029856111020110099100101001000120710179116979410000101007003070030700307003070030
1020470029543061598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100000710179146979410000101007003070030700307003070030
1020470029543061598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100000710179116979410000101007003070030700307003070030
102047002954312751598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100100710179116979410000101007003070030700307003070030
1020470029543361598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100000710179116979410000101007003070030700307003070030
1020470029542061598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100000710179146979410023101007003070109700707003070030
10204700295434589598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100000710179146979410000101007003070030700307003070030
1020470029542061598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100001710179116979410000101007003070030700307003070030
1020470029543061598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100000710179116979410000101007003070030700307003070030
10204700295431261598242510100101001010018078301496694907002970029684803686741010010200202007002985611102011009910010100100030710179116979410000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002956200000000615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000079000640279226980110000100107003070030700307003070030
1002470029563010000005895981546100101001410010180747714966988070029700686850236869610010100202002070029870211002110910100101020083194520656279226980110000100107003070030700307006970030
1002470029563000000004265982425100101001010040180748304966949070029700296851436869610010100202010670029870211002110910100101010068000640279226980110000100107003070030700307003070030
1002470029562000000006159824251001010010100101807380149669490700297002968502368696100101002020020700298701110021109101001010000141000640279226980110000100107003070030700307003070030
1002470029562000000006159824251001010010100381807380149669490700297002968502156880010010100202002070029870111002110910100101000080600640279226980110000100107003070030700307003070030
100247002956200000000615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000312185800654279226980110000100107003070030700307003070030
100247002956200000000615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000087000640279226980110000100107003070030700307003070030
1002470029563000000001315982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000052000640279226980110000100107003070030700307003070069
100247002956300052000615982425100101001010010180738014966949070029700296850236869610010100202002070029870111002110910100101000073000640279226980110000100107003070030700307003070030
1002470029563000001200615982425100101001010010180738014966949070029700696850236869610010100202002070068870211002110910100101000034000640279226980110000100107003070030700307003070030

Test 4: throughput

Count: 8

Code:

  pacga x0, x8, x9
  pacga x1, x8, x9
  pacga x2, x8, x9
  pacga x3, x8, x9
  pacga x4, x8, x9
  pacga x5, x8, x9
  pacga x6, x8, x9
  pacga x7, x8, x9
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480052621000352580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100003051100044111800248000000801008003680036800368003680036
8020480035620000912580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100000051100012511800248000000801008003680036800368003680036
80204800356200120842580100801008010040050010497695580035800356996636998480100802001602008003516411802011009910080100100003051100012511800248000000801008003680036800368003680036
80204800356200001002580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100000051100012521800248000000801008003680036800368003680036
80204800356200088352580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100100051100012511800248000000801008003680036800368003680036
8020480035621000632580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100000051100012511800248000000801008003680036800368003680036
8020480035621000352580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100000051100012511800248000000801008003680036800368003680036
80204800356210607003480100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100000051100012511800248000000801008003680036800368003680036
802048003562001207842580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100000051100012511800248000000801008003680036800368003680036
80204800356200007002580100801008010040050000497695580035800356996636998480100802001602008003516411802011009910080100100003051100012511800248000000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480050620000133258001080010800104000500497695508003580035699883700068001080020160020800351641180021109108001010030305020005250055800268000000800108003680036800368003680036
8002580035620003756258001080010800104000500497695508003580035699883700068001080020160020800351641180021109108001010021905020006250063800268000000800108003680036800368003680036
80024800356200007552580010800108001240005004976955080035800356998837000680010800201601288003516411800211091080010100168050200052500512800268000000800108003680036800368003680036
8002480035620000130258001080010800104004900497695508003580035699883700068001080020160020800801641180021109108001010011405020005250055800268000000800108003680036800368003680036
800248003562100091258001080010800104000500497695508003580035699883700068001080020160020800351641180021109108001010033005020003250153800268000000800108003680036800368003680036
8002480035620000105258001080010800104000500497695508003580035699883700068001080020160020800351641180021109108001010024905020003250036800268000000800108003680036800368003680036
8002480035620006663258001080010800104000500497695508003580035699883700068001080020160020800351641180021109108001010125805020705250055800268000000800108003680036800368003680036
80024800356200042168258001080010800104000500497695538003580035699883700068001080020160020800351641180021109108001010143505020003250035800268000000800108003680036800368003680036
8002480035620000352580010800108001040049604976955080035800356998837000680010800201600208008016411800211091080010100126250200052500358002680000014800108003680036800368003680036
8002480035620011235258001080010800104000500497695508003580035699883700068001080020160020800351641180021109108001010023705020003250036800268000000800108003680036800368003680036