Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PACDA

Test 1: uops

Code:

  pacda x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10047029670006158242510001000100017833014939497029702966233681810001000200070298701110011000009731851167891000100070307030703070307030
10047029650006158242510001000100017833014939497029702966233681810001000200070298701110011000000731851167891000100070307030703070307030
10047029640006158242510001000100017833004939497029702966233681810001000200070298701110011000000731851167891000100070307030703070307030
10047029670006158242510001000100017833014939497029702966233681810001000200070298701110011000000731851167891000100070307030703070307030
10047029650006158242510001000100017833014939497029702966233681810001000208270298701110011000000731851167891000100070307030703070307030
10047029660006158242510001000100017833004939497029702966233681810001000200070298701110011000040731851167891000100070307030703070307030
10047029670006158242510001000100017833004939497029702966233681810001000200070298701110011000010731851167891000100070307030703070307030
10047029670006158242510001000100017833014939497029702966233681810001000200070298701110011000000731851167891000100070307030703070307030
10047029660006158242510001000100017833004939497029702966233681810001000200070298701110011000000731851167891000100070307030703070307030
10047029650006158242510001000100017833004939497029702966233681810001000200070298701110011000110731851167891000100070307030703070307030

Test 2: Latency 1->1

Code:

  pacda x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047002960700000000615982425102001020001020018083301496694970029700296848036867410200102000202807002991211102011009910100000000000710179116979610100101007003070069700307007270030
1020470029614000000007265982425102001020001020018087031496694970029700296848036888410200102000202007002991211102011009910100000000200710179116979610100101007003070030700307003070030
102047002960800000000615982425102001020001020018083300496694970029700296848036867410200102000202007002991211102011009910100002100000710179116979610100101007003070030700307003070030
1020470029608000000001325982425102001020001020018083300496694970029700296848036867410200102000202007002991211102011009910100000003000710279116979610100101007003070030700307003070030
102047002960900000000615982425102001020001020018083300496694970029700296848036867410200102000202007002991211102011009910100000000000710179116979610100101007003070030700307003070030
1020470029615000001200615982425102001020001020018083300496694970029700296848036867410200102000202007002991211102011009910100000000000710179116979610103101007003070030700307003070030
102047002960800000000615981925102001020001020018083300496694970029700296848036867410200102000202007002991211102011009910100000000000710179116979610100101007003070030700307003070030
1020470029609000000001045982425102001020001020018083301496694970029700296848036867410200102000202007002991211102011009910100000100000710179116979610100101007003070030700307003070030
1020470029606000006001035982425102001020001020018083300496694970029700696848036867410200102000202007002991211102011009910100000003000710179216979610100101007003070030700307003070030
1020470069607001000001035982425102001020001020018083300496694970029700296850236867410200102000202007002991211102011009910100000003000710179116979610100101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002962101035982425100201002010020180743014966949700297006868526368722100201002020020700298701110021109100100001090006400279226980510010100107003070030700307003070030
10024700296220615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100001000006400279226980510010100107003070030700307003070030
100247002963391075982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006400279226980510010100107003070030700307003070030
1002470029651212495982425100231002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006400279226980510010100107003070030700307003070030
10024700296510615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006400279226980510010100107003070030700307003070030
10024700296220615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100001000006400279226980510010100107003070030700307003070030
10024700296500615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006404279226980510010100107003070030700307003070030
10024700296220965982432100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006400279226980510010100107003070030700307003070030
10024700296220615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006400279226980510010100107003070030700307003070030
100247002962201035982425100221002010020180743014966949700297002968502368696100201002020020700298701110021109100100000000006400279226980510010100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  pacda x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204800296930000000006169799252020020200202114943049497694980029800297602067622820211202274025480029144113020110099301000000000001111920116117987020100301008003080030800308003080030
302048002979310000000072669799252020020200202114943049497694980029800297596137618120200202004020080029144113020110099301000000000000001910172117979420100301008003080030800308003080030
30204800296940000000006169799252020020200202004942601497694980029800297596137618120200202004020080029144113020110099301000000000000001910172117979420100301008003080030800308003080030
302048002969300000000089697992520200202002020049426014976949800298002975961376181202002020040200800291441130201100993010000000002053000001910172117979420100301008003080030800308003080030
302048002969300000000072669799252020020200202004942601497694980029800297596137618120200202004020080029144113020110099301000000000000001910172117979420100301008003080030800308003080030
30204800296930000000006169799252020020200202004942601497694980029800297596137618120200202004020080029144113020110099301000000000000001910172117979420100301008003080030800308003080030
30204800296930000000006169799252020020200202004942601497694980029800297596137618120200202004020080029144113020110099301000000010000001910172117979420100301008003080030800308003080030
302048002969500000000072669799252020020200202004942601497694980029800737596137618120200202004020080029144113020110099301000000000000001910172117979420100301008003080030800308003080030
302048002969300000001201036979925202002020020200494299449769498002980029759613761812020020200402008002914411302011009930100000000034000000023052274218050220208301008093080411800308003080030
30204800296940000000006169799252020020200202004942601497694980029800297596137618120200202004020080029144113020110099301000000000000001910172117979420100301008003080030800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002480029698000000006169799252002020020200204952048049769490800298002975983376203200202002040020800291441130021109300100033030001890272227980320010300108003080030800308003080030
3002480029705000001200616979925200202002020020495204804976949080029800297598337620320020200204023880029144113002110930010000030001890272227980320010300108003080030800308003080030
300248002969700000000616979925200202002020020495204804976949080029800297598337620320020200204002080029144113002110930010002000001890272327980320010300108003080030800308003080165
30024800297020000012001036979925200202002020020495204804976949080029800297598337620320020200204002080029144113002210930010002090001890272227980320010300108003080030800308003080030
3002480029702000001200616979925200202002020020495204804976949080029800297598337620320020200204002080029144113002110930010001000001890272227980320010300108003080030800308003080030
300248002970400000000616979925200202002020020495204804976949080029800297598337620320020200204002080029144113002110930010003000001890272227980320010300108003080030800308003080030
300248002969600000000616979925200202002020020495204814976949080029800297598337620320020200204002080029144113002110930010006000001890231227980320010300108003080030800308003080030
3002480029701000000001176979925200202002020020495204804976949080029800297598337620320020200204002080029144113002110930010004000001890272227980320010300108003080030800308003080207
3002480029736000000006169799252002020020200204952048049769490800298002975983277623320020200204002080029144113002110930010001000001890272227980320010300108003080030800308003080030
3002480029702000000008166979925200202002020020495204804976949080029800297598337620320020200204002080029144113002110930010004000001890272237980320010300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  pacda x0, x8
  pacda x1, x8
  pacda x2, x8
  pacda x3, x8
  pacda x4, x8
  pacda x5, x8
  pacda x6, x8
  pacda x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5e | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | 9d | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800406420001206672880205802058020740103001497696080041800406998086999080207802241602488004016411802011009980100000000011151190016008003880105801008004280042800428004280041
802048004064500000702980205802058020740103001497696180041800416998076999080207802241602488004116411802011009980100000003011151190016008003880105801008004280042800418004180041
8020480040644000120772580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000100000051100225228002580100801008003680036800368003680036
802048003570100000772580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000003000051100225228002580100801008003680036800368003680036
802048003570200000352580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000000000051100225228002580100801008003680036800368003680036
802048003569700000632580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000000000051100225228002580100801008003680036800368003680036
8020480035702000120772580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000000000051100225228002580100801008003680036800368003680036
802048003570200000352580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000000000051100225228002580100801008003680036800368003680036
802048003569900000352580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000000000051100225228002580100801008003680036800368003680036
802048003570200000772580200802008020040100001497695580035800356996636998480200802001602008003516411802011009980100000003000051100225228002580100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800406990000000811258004180020800204001001497695580035800356998837004080020800201600208003516411800211098001000010305020000525003480024800100800108003680036800368003680036
80024800357010000120035258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000010005020000425004380024800100800108003680036800368003680036
800248003570000000240035258002080020800204001000497695580035800356998837000680042800201600208003516411800211098001000000005033000425007680024800100800108003680082800368003680036
800248003569800000002592580020800208002040010004976955800358003569988370006800208002016002080035164118002110980010000100050200004251183800248001015800108003680036800368003680036
80024800356980000000352580020800208002040010014976955800358003569988370006800208002016002080035164118002110980010000021205032000625004380024800100800108003680036800368003680036
80024800357500000000700258002080020800204001000497695580035800356998837000680020800201600208003516411800211098001000020605020000425004580024800100800108003680036800368003680036
8002480035698000000035258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000005020000425004480024800100800108007380036800368003680036
8002480081698000000035258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000010005020000425007480024800100800108003680036800368003680036
800248003569800001320035258002080020800204001000497695580035800356998837000680020800201600208003516411800211098001000000005020000725004480024800100800108003680036800368003680036
800248003569900000007002580020800208002040010004976955800358003569988370006800208002016002080035164111800211098001004012121784533500162170010580772804930800108003680036800368003680036