Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PACDZA

Test 1: uops

Code:

  pacdza x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10047029700615824251000100010001783301493949702970296623368181000100010007029870111001100000732853367891000100070307030703070307030
10047029700825824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
10047029700615824251000100010001783301493949702970296623368181000100010827029870111001100000733853367891000100070307030703070307030
10047029660615824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
10047029700615824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
10047029710615824251000100010001783301493949702970296623368181000100010007029870111001100010733853367891000100070307030703070307030
10047029700615824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
100470297012615824251000100010001783301493949702970296623368181000100010007029870111001100013733853367891000100070307030703070307030
10047029700615824251000100010001783301493949702970296623368181000100010007029870111001100010733853367891000100070307030703070307030
10047029700615824251000100010001783301493949702970296623368181000100010007029870111001100010733853367891000100070307030703070307030

Test 2: Latency 1->1

Code:

  pacdza x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029616000000726598242510200102001020018083301496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
1020470029622000120089598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
10204700296220001200595598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
1020470029620000000726598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
102047002962400000061598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099101000000071017901169906101000101007003070030700307003070030
10204700296230001200217598242510200102001020018083301496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
102047002962000000061598242510200102001020018083300496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
102047002962200000061598242510200102001020018083301496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
102047002962200000061598242510200102001020018083301496694970029700296848036867410200102001020070029912111020110099101000000071017901169796101000101007003070030700307003070030
102047002962200000061598242510200102001020018083301496694970029700296848036867410200102001020070029912111020110099101000001071017901169796101000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002470029616024006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000000640379002269805100100100107003070030700307003070030
100247002962100006159824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279002269805100100100107003070030700307003070030
1002470029615000072659824251002010020100201807430149669497002970029685023686971002010020100207002987011100211091001000000640279002269805100100100107003070030700307003070030
100247002965700006159824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000030640379002269805100100100107003070030700307003070030
1002470029621000010359824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279002269805100100100107003070030700307003070030
100247002962100006159824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279002269805100100100107003070030700307003070030
10024700296210120057859824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279002269805100100100107003070030700307003070030
100247002962000008959824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001001000640279002269805100100100107003070030700307003070030
1002470029679000010459824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000000640279002269805100100100107003070030700307003070030
1002470029671000061598242510020100201002018074301496694970029700296850236869610020100201002070029870111002110910010012000640279002269805100100100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  pacdza x0
  pacdza x1
  pacdza x2
  pacdza x3
  pacdza x4
  pacdza x5
  pacdza x6
  pacdza x7

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048004070000000005625802008020080200401000154976955080035800356996636998480200802008020080035164118020110099801000000005110513255380025801000801008003680036800368003680036
802048003574400000009125802008020080200401000104976955080035800356996636998480200802008020080035164118020110099801000000005110003253580025801000801008003680036800368003680036
802048003574600000003525802008020080200401000104976955080035800356996636998480229802008020080035164118020110099801000001265110003255580025801000801008003680036800368003680036
802048003569900000003525802008020080200401000104976955080035800356996636998480200802008020080035164118020110099801000000005110005255580025801000801008003680036800368003680036
802048003574601000007725802008020080200401000104976955080035800356996636998480200802008020080035164118020110099801000000005110005255380025801000801008003680036800368003680036
802048003570200000003525802008020080200401000004976955080035800356996636998480200802008020080035164118020110099801000000005110003255580025801000801008003680036800368003680036
8020480035702000001203525802008020080200401000104976955080035800356996636998480200802008020080035164118020110099801000000005110005255580025801000801008003680036800368003680036
802048003574400000003525802008020080200401000004976955080035800356996636998480200802008020080035164118020110099801000005005110005255580025801000801008003680036800368003680036
802048003570000000003525802218020080200401000004976955080035800356996636998480200802008020080035164118020110099801000003005110005255380025801000801008003680036800368003680036
802048003570200000003525802008020080200401000104976955080035800356996636998480200802008020080035164118020110099801000000005110003255580025801000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | d5 | map dispatch bubble (d6) | d9 | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800466970000003525800208002080020400100149769558003580035699883700068002080020800208003516411800211098001000000300050200122500628002480010800108003680036800368003680036
8002480035702000000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000000005020032500268002480010800108007280036800368003680036
8002480035703000000562580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000000005020062500628002480010800108003680036800368003680036
8002480035701000000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000000005020062500638002480010800108003680036800368003680070
8002480035702000000352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000000005020032500268002480010800108003680036800368003680036
8002480071698000000352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000000005020062500628002480010800108003680036800368003680036
8002480035704000000352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100001000005020052500638002480010800108003680036800368003680036
8002480072693000000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100001000005020032500358002480010800108003680036800368003680036
8002580035702000000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100001000005020022500368002480010800108003680036800368003680036
8002480035710000000352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000000005020062500648009180010800108003680036800368003680036