Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

XPACI

Test 1: uops

Code:

  xpaci x0
  mov x0, 1

(requires an arm64e binary and the arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004702970181045824251000100010001783300493949702970296623368181000100010007029870111001100010732851167891000100070307030703070307030
10047029630615824251000100010001783301493949702970296623368181000100010007029870111001100010731851167891000100070307030703070307030
10047029700615824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030
10047029700615824251000100010001783300493949702970296623368181000100010007069870111001100003731851167891000100070307030703070307030
100470297106158242510001000100017833004939497029702966233681810001000100070298701110011000001320260059721001100060306030603060306030
100460297505549502210011001100052725010060296029567035751100000602922031110011000001320260059721001100060306030603060306030
1004602976052549502210011001100052725010060296029567035751100000602924131110011000001320260059721001100060306030603060306030
100460297605549502210011001100052725011060296029567035751100000602922031110011000001321260059721001100060306030603060306030
10046029750554950221001100110005272501106029602957003575110000060292203111001100003731851167891000100070307030703070307030
100470297012615824251000100010001783300493949702970296623368181000100010007029870111001100010731851167891000100070307030703070307030

Test 2: latency 1->1

Code:

  xpaci x0
  mov x0, 1

(requires an arm64e binary and the arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029700000149598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099010100070000710179116979610100101007003070030700307003070187
102047002965210061598242510200102001020018083300496694970029700296848016686741020010200102007002991221102011009946110100050000710179116979610100101007003070030700307003070030
10204700296220012615982425102001020010200180833014966949700297002968480368674102001020010200700299121110201100990101000580000710179216979610100101007003070030700307003070030
102047002962300061598242510200102001020018083301496694970029700296848036867410200102001020070029912111020110099010100060000710179116979610100101007003070030700307003070030
1020470029613000117598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099010100020000710179116979610100101007003070030700307003070030
1020470029623000103598242510200102001020018083300496694970029700296848036867410200102001020070029912111020210099010100000000710179116979610100101007003070030700307003070030
102047002962200061598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099010100020000710179116979610100101007003070030700307003070030
1020470029618000345598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099010100040000710179116979610100101007003070030700307003070030
102047002962200061598192510200102001020018083300496694970029700296848036867410200102001020070029912111020110099010100000010710179116979610100101007003070030700307003070030
102047002962200061598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099010100020000710179116979610100101007003070030700307006070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700296207889598242510020100201002018076240496694970068701086850236869610020100201002070029870111002110910010019206402793269805100100100107003070030700307003070030
10024700296224561598132510020100201002018074301496694970029700296850236869610020100201002070029870111002110910010000006402792269853100100100107003070030700307003070030
100247002962257103598242510020100201002018074301496694970029700296850236869610020100201002070029870111002110910010000006401872269805100100100107003070030700307003070030
1002470029619061598242510020100201002018074301496694970029700296850236869610020100201002070029870111002110910010000006402792269805100100100107026870186700307003070030
1002470029656061598242510020100201002018074300496694970029700296850236869610020100201002070029870111002110910010000006402792269805100100100107003070030700307003070030
1002470029650061598242510020100201002018074300496694970029700296850236869610020100201002070029870111002110910010000006402792269805100100100107003070030700307003070030
1002470029622361485598242510020100201002018074301496694970029700296850236869610020100201002070058870111002110910010000006402792269805100100100107003070030700307003070030
1002470029622061598242510020100201002018074300496694970029700296850236869610020100201002070029870111002110910010000006407792269805100100100107003070030700307003070030
1002470057611061598242510020100201002018074300496694970029700296850236869610020100201002070029870111002110910010000006402792269805100100100107003070030700307003070030
1002470029622061598242510020100201002018074300496694970029700296850236869610020100201002070029870111002110910010010006402792269805100100100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  xpaci x0
  xpaci x1
  xpaci x2
  xpaci x3
  xpaci x4
  xpaci x5
  xpaci x6
  xpaci x7

(requires an arm64e binary and the arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800406960000000120053325802008020080200401000149769550800358003569966369984802008020080200800351641180201100998010000000030005110425338002580100801008003680036800368003680036
802048003570000000000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000000030005110325338002580100801008003680036800368003680036
802048003575100000000009425802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000000030005110325338002580100801008003680036800368003680036
802048003569900000000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000000030005110325338002580100801008003680036800368003680036
802048003570000000000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000000030005110325338002580100801008003680036800368003680036
802048003570100000000003525802008020080222401000049769550800358003569966369984802008020080200800351641180201100998010000002000005110325338002580100801008008180036800368003680036
802048003575000000000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000000030005110325338002580100801008003680036800368003680036
80204800357500000000132007725802008020080200401000149769550800358003569966369984802008020080200800351641180201100998010000000030005122325338002580100801008003680036800368003680036
802048003570000000000007725802008020080200401000149769553800358003569966369984802008020080200800351641180201100998010000000000005110325338005780100801008003680036800368008180036
802048003569500000000003525802008020080222401000049769550800358003569966369984802008023280200800351641180201100998010000000000005110325338002580100801008003680036800828003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 19 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dc | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480041699101000007062580020800208002040010004976955800358003569988370006800208002080072800351641180021109800100300502100725064800288001000800108021880036800368003680036
800248003575111100000412580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000502100625064800288001000800108003680036800368003680036
8002480035702101000004125800208002080020400100049743038003580035699883700068002080020800208003516411800211098001062010502100625064800288001000800108003680036800368003680036
800248003570200100000412580020800208002040010004973910800358003569988370006800208002080020800351641180021109800100000502100625084800288001000800108003680036800368003680036
800248003570210100011703612580020800208002040010004976955800358003570021370006800208002080020800351641180021109800100000502100425046800278001000800108003680036800368003680036
8002480035695101000006425800208002080020400100049769558003580035699883700068002080020800208003516411800211098001000005021005250610800278001000800108003680036800368003680036
800248003569910100000412580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000502100425046800278001000800108003680036800368003680036
800248003575110100000412580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100600502100525059800288001000800108003680036800368003680036
8002480035702101000001552580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000502100425046800278001000800108003680036800368003680036
800248003574310100000832580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000502100625064800278001000800108003680036800368003680036