Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PACIA

Test 1: uops

Code:

  pacia x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004702970010458242510001000100017833014939497029702966233681810001000200071078701110011000000307328511678910000100070307030703070307030
1004702963010458242510001000100017833004939497029702966233681810001000200070298701110011000000007318511678910000100070307030703070307030
1004702970128258242510001000100017833004939497029702966233681810001000200070298701110011000000307318511678910000100070307030703070307030
100470297006158242510001000100017833004939497029702966233681810001000200070298701110011000000007318511678910000100070307030703070307030
100470297006158242510001000100017833004939497029702966233681810001000200070298701110011000000307318511678910000100070307030703070307030
100470297006158242510001000100017833004939497029702966233681810001000200070298701110011000001307318511678910000100070307030703070307030
100470297006158242510001000100017833004939497029702966233681810001000200070298701110011000001007318511678910000100070307030703070307030
100470297006158242510001000100017833014939497029702966233681810001000200070298701110011000001007318511678910030100070307030703070307030
100470297006158242510001000100017833014939497029702966233681810001000200070298701110011000001007318511678910000100070307030703070307070
100470297009658242510001000100017833014939497029702966233681810001000200070298701110011000001307318511678910000100070307030703070307030

Test 2: Latency 1->1

Code:

  pacia x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0f | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029570000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030
10204700296230000000103598242510200102001020018083300496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007067770030700307003070030
1020470029619000000061598242510200102001020018083300496694907002970029684803686741020010200202007002991211102011009910100000007100179116983310100101007003070030700307003070030
10204700296190000000726598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030
1020470029622000000061598242510200102001020018083300496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030
1020470029622000000061598242510200102001020018083300496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030
1020470029622000009061598242510200102001020018083300496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030
10204700296220000000103598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100010007100179116979610100101007003070030700307003070030
1020470029622000000061598242510200102001020018083300496694907002970029684803686741020010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030
1020470029622000000061598242510200102001020018083300496694907002970029684803686741023010200202007002991211102011009910100000007100179116979610100101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002961600000210615982425100201002010020180743014966949713317129268911154695021092011287226847121087033110021109100102016276821012435605570600101130100107129371367713627081370030
10024700296750000112094359824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000036064027902269905100100100107003070030700307003070030
100247002965500000002135982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000064027902269805100100100107003070030700557003070030
1002470029657000001201035982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000064027902269805100100100107003070030700307003070030
100247006865500000002855982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000064027902269805100100100107003070030700307003070030
10024700296150000000665982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100003064027902269805100100100107003070030700307003070030
100247002962100000120615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000064027902269805100100100107003070030700307003070030
10024700296200000000615982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000064037902369805100100100107003070030700307003070030
100247002961900000002515982425100201002010020180743014966949700297002968502368696100201002020138700298701110021109100100000064027902269805100100100107003070030700307003070030
100247002965500000007265982425100201002010020180743014966949700297002968502368696100201002020020700298701110021109100100000064027902269805100100100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  pacia x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020480029697000001200616979925202002020020200494260114976994800298002975961376181202002020040200800291441130201100993010000000000001910272117979420100301008003080030800308003080030
302048002975000000000616979925202002020020200494260114976949800298002975961376181202002020040362800291441130201100993010000000000001910172117979420100301008003080030800308003080030
302048002970000000000616979925202002020020200494260114976949800298002975961376181202002020040200800291441130201100993010000000000001910172117979420100301008003080030800308003080030
3020480029696000000001036979925202002020020200494260114976949800298002975961376181202002020040200800681441130201100993010000000200001931172137979420100301008003080076800308003080030
3020480029749000000007266979925202002020020200494260114976949800298002975961376181202002020040200800291441130201100993010000000000001910172217979420100301008003080030800308003080030
302048002974900000000616979925202002020020264494260114976949800298002975961376181202002020040200800291441130201100993010000000000001910172117979420100301008003080030800308003080030
3020480029700000100007266979945202002020020200494260114976949800298002975961376181202002020040200800291441130201100993010000003030001910172117979420100301008003080030800308003080030
3020480074749000001200616979925202002020020200494260114976949800298002975961376188202002020040200800291441130201100993010000000000001910172117979420100301008003080030800308003080030
3020480029695000001200616979925202002020020200494260114976949800298002975961376181202002020040200800741441130201100993010000000030001910172117979420100301008003080030800308003080030
302048002969900000000616979025202002020020200494260114976949800298002975961376181202002028040200800291441130201100993010000000000001910172117979420100301008007180030800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5e | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024800296960000006169799252002020020200204952048014976949800298002975983376203200202002040020800291441130021109300100000000018902720227980320010300108003080030800308003080030
30024800297020000008969799252002020020200204952048014976949800298002975983376203200202002040020800291441130021109300100003030018902720427980320010300108003080030800308003080030
30024800297020000004386979925200202002020020495204800497694980029800297598313376854220722260345402814601443313002110930010200106797000222432640528036520010300108129081053811108109881105
30024808537541120000133169799252002020020200204952048004976949800298002975983376203200202002040020800291441130021109300100001000018902720227980320010300108003080030800308003080030
300248002970200000048769799252002020020200204952048004976949800298002975983376199200202002040020800791441130021109300100000000018902720227980320010300108003080030800308003080030
30024800297020000006169799252002020020200204952048004976949800298002975983376203200202002040020800781441130021109300100000000018902720227980320015300108003080030800308003080030
30024800297020001350010369799252002020020200204952048004976949800298002975983376203200202002040020800291441130021109300100000000018902721227980320010300108003080030800308003080030
300248002970200000014569799252002020020200204952048004976949800298002975983376203200202002040020800291441130021109300100000000018902720227980320010300108003080030800308003080030
300248002971400000010369799252002020020200204952048004976949800298002975983376203200202002040020800291441130021109300100000030018902720227980320010300108003080030800308003080030
30024800297020000006169799252002020020200204952048004976949800298002975983376203200202002040020800291441130021109300100000000018902720227980320010300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  pacia x0, x8
  pacia x1, x8
  pacia x2, x8
  pacia x3, x8
  pacia x4, x8
  pacia x5, x8
  pacia x6, x8
  pacia x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048004169800001170074225802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000006303005110225228002580100801008003680036800368003680036
802048003574400000007725802218020080200401000049769558003580035699663699848020080200160264800351641180201100998010000003603005110233228002580100801008003680036800368003680036
8020480035702000012009125802008020080222401000049769558003580035699663699848020080200160200800351641180201100998010000003100005110225228006780100801008008280036800368003680036
8020480035699000030003525802008020080200401000049769558003580035699843699848020080200160200800351641180201100998010000005900005110225228002580100801008003680036800368003680036
80204800357000000000352580200802008020040100004976955800358008169966369984802008020016020080035164118020110099801000000306005110225328002580100801008003680036800368003680036
80204800806990000000352580200802008020040100004976955800358003569966369984802008020016020080035164118020110099801000000503005110225228002580100801008003680036800368003680036
80204800357000000000782580200802008020040100004976955800358003569966369984802228020016031280035164118020110099801000000203005110225228002580100801008003680036800368003680036
802048003570000001200352580200802008020040100004976955800358003569966369984802008020016020080035164118020110099801004000000005110225228002580100801008003680036800368003680036
802048003570000000007002580200802008020040100004976955800358003569966369984802008020016020080035164118020110099801000000203005110225228002580100801008003680036800368003680036
80204800356991000000352580200802008020040100004976955800358003569966369984802008020016020080035164118020110099801000020100005110225228002580100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800406990035258002080020800204001000497695580035800356998837000680020800201600208003516411800211098001000000050200525328002480010800108008180173800368003680036
80024800357020083258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000050200325328002480010800108003680036800368003680036
80024800357030091258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000050200225238002480010800108003680036800368003680036
80024800357020035258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000050200325328002480010800108003680036800368003680036
80024800356940035438002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000050200325328002480010800108003680036800368003680036
80024800357020035258002080020800204001000497695580035800356998837000680020800201600208003516411800211098001000000050510225248002480010800108003680036800368003680036
800248003570200700258002080020800204001001497695580035800356998837000680064800201600208003516411800211098001000036050200425328002480010800108003680036800368003680036
80024800357010035258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000050200225238002480010800108003680036800368003680036
800248003569700700258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000050200325328002480010800108003680036800368003680036
800248003570200700258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001004020050200225238002480010800108003680036800368003680036