Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PACDB

Test 1: uops

Code:

  pacdb x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10047029670000000061582425100010001000178330149394970297029662336818100010002000702987011100110000016000731851167891000100070307030703070307030
10047029650000000012658242510001000100017833014939497029702966233681810001000200070298701110011000002000731851167891000100070307030703070307030
100470296800000000103582425100010001000178330149394970297029662336818100010002000702987011100110000024030731851167891000100070307030703070307030
1004702967000000006158242510001000100017833014939497029702966233681810001000200070298701110011000000000731851167891000100070307030703070307030
1004702967000000006158242510001000100017833014939497029702966233681810001000200070298701110011000000000731851167891000100070307030703070307030
10047029680000000010358242510001000100017833014939497029702966233681810001000200070298701110011000000030752852167891000100070307030703070307030
1004702968000000006158242510001000100017833014939497029702966233681810001000200070298701110011000002000731851267891000100070307030703070307030
1004702974000000006158242510001000100017833014939497029702966233681810001000200070298701110011000000060731851167891000100070307030703070307030
10047029680000000010458242510001000100017833014939497029702966233681810001000200070298701110011000000000731851167891000100070307030703070307030
1004702965000000006158242510001000100017833014939497029702966233681810001000200070298701110011000001000731851167891000100070307030703070307030

Test 2: Latency 1->1

Code:

  pacdb x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204700296170044159824251020010200102001808330149669497002970029684913686741020010200202007002991211102011009910100007101791169796101000101007003070030700307003070030
1020470029622006159824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101000101007003070030700307003070030
10204700296220010359824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101000101007003070030700307003070030
1020470029651006159824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101000101007003070030700307003070030
10204700296220023259824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100137101791169796101000101007003070030700307003070030
10204700296230072659824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007321791169796101000101007003070030700307003070030
1020470029622006159824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101000101007003070030700307003070030
1020470029622009959824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101000101007010870030700307003070070
10204700296221012459824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101040101007003070030700307003070030
1020470029623066159824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100007101791169796101000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002470029626000081041159824025100201002010020180743010496694907002970029685023686961002010020200207002987011100211091001000000006400279326980510010100107003070030700307003070030
1002470029622000000103598240251002010020100201807430154966949070029701876850236869610020100202002070029174011100211091001000000006405279226980510010100107003070030700307003070030
10024700296190000007159824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000010006400279226980510010100107003070030700307003070030
100247002962200000076859824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000000006400279226980510010100107003070030700307003070030
10024700296220000006159824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000000306400279226980510010100107003070030700307003070030
10024700296220000008959824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000000006400279226980510010100107003070030700307003070030
10024700296220000006159824025100201002010020180743000496694907002970184685143686961002010020200207002987011100211091001000000306400279226980510010100107003070030700307003070030
10024700296220000006159824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000000006400279226980510010100107003070030700307003070030
10024700296220000006159824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000010006400279226980510010100107003070030700307003070030
100247002962200001206159824025100201002010020180743000496694907002970029685023686961002010020200207002987011100211091001000010006400279226980510010100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  pacdb x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204800296970000114061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000075019102721179809201000301008003080030800308003080030
3020480029750000000156697992520200202002020049433041497694980029800297596137618120200202004020080029144113020110099301000012019101721179794201000301008003080030800308007680030
30204800297090000007266979925202002020020200494260114976949800298002975952376181202002020040200800291441130201100993010001192019101721179794201000301008003080030800308003080030
3020480029699000090726697991092020020200202004942634149769498002980029759613761812020020200402008007514411302011009930100000019101721179794201000301008003080030800308003080030
302048002969900000072669799252020020200202004942601149769498002980029759613761812020020200402008002914411302011009930100009019101721179794201000301008003080030800308003080030
3020480029699000012072669799252020020200202004942601149769498002980029759613761812020020200402008002914411302011009930100000019101721179794201000301008003080030800308003080030
3020480029699000000616979925202042020020200494260114976949800298002975961376181202002020040360800291441130201100993010000102019101721179794201000301008003080030800308003080030
302048002969900100010369799252020020200202004942601149769498002980029759613761812020020200402008002914411302011009930100000019101721179794201000301008003080075800308003080030
302048002969700001206169799252020020200202004942601149769498002980029759613761812020020200402008002914411302011009930100009019101721179794201000301008003080030800308003080030
3020480029695000000616979925202002020020200494260114977370800298007475961376181202002020040362800291441130201100993010001829834263513921180820201900301008003080030800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024800296950000000006169799252002020020200204952048149769498002980029759833762032002020020400208007914411300211093001000000219018905724379803200100300108003080030800308003080030
3002480029743000000120061697992520020200202002049520481497694980029800297598337620320020200204002080029144113002110930010000300018903724479803200100300108003080030800308003080030
3002480029702000000000103697992520020200202002049520481497694980060800297598337620320020200204002080029144113002110930010000000018904724379803200100300108003080030800308003080030
3002480029702000000000616979925200202002020020495204819876949800298002975983376203200202002040020800291441130021109300100001000018903723479803200100300108003080030800308003080030
30024800296990000000001298697992520020200382002049520481497694980029800297598337620320088200204002080029144213002110930010000700018904724379803200100300108003080030800308003080030
3002480029742000000000768697992520020200202002049520481497694980029800297598337620320020200204002080029144113002110930010000000018903723479803200100300108003080030800308003080030
30024800296980000001200103697992520020200202002049520481497694980029800297598337620320020200204002080029144113002110930010000000018904724379803200100300108003080030800308003080030
30024800296980000001200616979925200202002020020495204814976994800748012075983376208200822002040020800291441130021109300100002803018904724479803200100300108003080030800308003080030
3002480029704000000120061697992520020200202002049520481497694980029800297598337620320020200204002080060144113002110930010000403018904724379803200100300108003080030800308003080030
300248002970200000000061697992520020200402002049520481497694980029800297598337620820020200204002080029144113002110930010000500018904724379803200100300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  pacdb x0, x8
  pacdb x1, x8
  pacdb x2, x8
  pacdb x3, x8
  pacdb x4, x8
  pacdb x5, x8
  pacdb x6, x8
  pacdb x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048008579100000012028298020580205802074010301497696080040800416998076999080207802241602488004116411802011009980100000500011151191161180037801050801008004180041800428004280222
80204800407440000000033298020580205802074010301497696180040800416998076999080207802241602488004016411802011009980100000200011151191161180038801050801008008180041800418004180042
802048004069900000000282880205802058020740103014976960800418004069980869990802078022416024880040164118020110099801000008200011151191161180038801050801008004180042800428004280042
802048004170300000000277217480352803738034040189614977322803588044770121447028880317802241602488004016411802011009980100000700011151201161180038801050801008004280042800418004180041
802048004069700000001312258020080200802004010001497695580035800356996636998480200802001602008003516411802011009980100000000000051103253380025801002801008003680036800368003680036
802048003574600000000700258020080200802004010001497695580035800356996636998480200802001602008003516411802011009980100000300000051103253380025801000801008003680036800368003680036
80204800357020000000035258020080200802004010001497695580035800356996636998480200802001602008003516411802011009980100000420000051103253380025801000801008003680036800368003680036
80204800356970000000035258020080200802004010001497695580035800356996636998480200802001602008003516411802011009980100000200000051103254480025801000801008003680036800368003680036
802048003570210000000632580200802008020040100014976955800358003569966369984802008020016020080035164118020110099801000000015000051103253380025801000801008003680036800368003680036
80204800357020000000035258020080200802004010001497695580035800356996636998480200802001602008003516411802011009980100000203000051103253380025801000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480052694000012035258002080020800204001001497695508003580035700183700068002080020160020800351642180021109800100000000000050200425033800248001000800108003680036800368003680036
8002480035748000000772580020800208002040010014976955080035800357000237000680020800201600208003516411800211098001000000340000050200325033800248001000800108003680036800368003680036
800248003570000000035258002080020800204001001497695508003580035699883700068002080020160020800831641180021109800100000020000050200325032800248001000800108003680036800368003680036
8002480035699000000632580020800208002040010009876955080035800356998837000680020800201600208003516411800211098001000000520000050200225023800248001000800108003680036800368003680036
80024800357000000003525800208002080020400100149769550800358003569988370006800208002016002080035164118002110980010000005050000050200325032800248001000800108003680036800368008180081
80024800357490000120772580020800208002040010014976955080035800356998837000680020800201600208003516411800211098001000002340300050200425023800248001000800108003680036800368003680036
8002480035752000000772580020800208002040010014976955080035800356998837000680020800201600208003516411800211098001000000740300150200225023800248001000800108003680036800368003680036
800248003574900000077258010480020800204001000497695508003580035699883700068002080020160020800351641180021109800100000000000050200225023800248001000800108003680036800368003680070
800248003570000000035258002080020800204001001497695508003580035699883700068002080020160020800351641180021109800100000010000050200225023800248001000800108003680036800368003680036
800248003574900010077258002080020800204001001497695508003580035699883700068002080020160020800351641180021109800100000060300050200325032800248001000800108003680036800368003680036