Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTDZB

Test 1: uops

Code:

  autdzb x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100470297006158242510001000100017833014939497029702966233681810001000100070298701110011000000732852167891000100070307030703070307030
1004702970126158242510001000100017833014939497029702966233681810001000100070298701110011000000731851168521000100070307030703070307030
100470297006158242510001000100017833014939497029702966233681810001000100070298701110011000010731851167891000100070307030703070307030
100470297106158242510001000100017833004939497029702966233681810001000100070298701110011000000731851167891000100070307030703070307030
100470297106158242510001000100017833014939497029702966233681810001000100070298701110011000010731851167891000100070307030703070307030
1004702970027258242510001000100017833014939497029702966233681810001000100070298701110011000003731851167891000100070307030703070307030
100470297006158242510001000100017833004939497029702966233681810001000100070298701110011000000731851167891000100070307030703070307030
1004702970010358242510001000100017833004939497029702966233681810001000100070298701110011000000731851167891000100070307030703070307030
1004702970010558242510001000100017833004939497029702966233681810001000100070298701110011000013731851167891000100070307030703070307030
100470297006158242510001000100017833014939497029702966233681810001000100070298701110011000000731851167891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autdzb x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029675000061598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000710179116979610100720101007003070030700307003070030
1020470029680000063159824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700307003070030
1020470029691000010359824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700307003070030
10204700296790000103598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100650071017911697961010000101007003070030700307003070030
1020470029679000072659824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071027911697961010000101007003070030700307003070030
102047002968600006159824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700557003070030
1020470029679000034659824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700307003070030
102047002968600006159824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700307003070030
1020470029674000010559824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700307003070030
102047002967900006159824251020010200102001808330049669490700297002968480368674102001020010200700299121110201100991010000071017911697961010000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700296140010012006159824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279226980510010100107003070030700307003070030
10024700296190000000040459824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000100640779226980510010100107003070030700307003070030
10024700296190000012006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000100640279226980510010100107003070030700307003070030
1002470029619000000006159824251002010020100201807430049669497006970029685023686961002010080100207002987011100211091001000006640279226980510010100107003070030700307003070030
10024700296200000000025359824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000000640279226980510010100107003070030700307003070030
1002470029619000000006159824251002010020100201807430049669497002970029685023686961002010020100207006887021100211091001000000640279226980510010100107003070030700307003070030
10024700296240000000011759824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279226980510010100107003070030700307003070030
1002470029622000000006159824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279226980510010100107003070030700307003070030
1002470029618000000008959824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000000640279226980510010100107003070030700307003070030
1002470029622000000006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000000640279326980510010100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  autdzb x0
  autdzb x1
  autdzb x2
  autdzb x3
  autdzb x4
  autdzb x5
  autdzb x6
  autdzb x7

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800406430000120702880205802058020740103014976960080040800416998076999080207802248022480041164118020110099801000010300051102252280025801000801008003680036800368003680036
80204800356450000120772580200802008020040100014976955080035800356996636998480200802008020080035164118020110099801000010300051102252280025801000801008003680036800368003680036
802048003564500001206282580200802008020040100014976955080035800356996636998480200802008020080035164118020110099801000070300051102252280025801000801008003680036800368003680036
80204800356460000120352580200802008020040100004976955080035800356996636998480200802008020080035164118020110099801000000000051102252280025801000801008003680036800368003680036
8020480035701000000772580200802008020040100004976955080035800356996636998480200802008020080035164118020110099801000000000051102252280025801000801008003680036800368003680036
802048003569900000011302580200802008020040100014976955080035800356996636998480200802008020080035164118020110099801000000000051102252280025801000801008003680036800368003680036
80204800357020000120772580200802008020040100004976955380035800356996636998480200802008020080035164118020110099801000000000051102254280025801000801008003680036800368003680036
8020480035702000027010002580200802008020040100014976955080035800356996636998480200802008020080035164118020110099801000010000051102252280025801000801008003680036800368003680036
8020480035703000000352580200802008020040100014976955080035800356996636998480200802008020080035164118020110099801000000000051102252280025801000801008003680036800368003680036
80204800357020000003525802008020080200401000049769550800358003569966196998480200802008020080035164118020110099801000000000051102252280025801000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | 9e | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248003569500000000070525800208002080020400100004976955800358003569988370006800208002080020800351641180021109800100000000000502023252280024800100800108003680036800368003680036
80024800357020000000003525800208002080020400100124976955800358003569988370006800208002080020800351641180021109800100000000300502023252280027800100800108003680036800368003680036
800248003570200000001209125800208002080020400100004976955800358003569988370006800208002080020800351641180021109800100000000000502003252380024800100800108003680036800368003680036
800248003570200000000070025800208002080020400100124976955800358003569988370006800208002080020800351641180021109800100000000000502002252280024800100800108003680036800368003680036
80024800357010000000003525800208002080020400100024976955800358003569988370006800208002080020800351641180021109800100000000000502003252280024800100800108003680036800368003680036
8002480035702000000012010525800208002080020400100024976955800358003569988370006800208002080020800351641180021109800100000000000502023253380024800100800108003680036800368003680036
80024800357010000000003525800208002080020400100024976955800358003569988370006800208002080020800351641180021109800100000000300502022252280024800100800108003680036800368003680036
80024800357020000000003525800208002080020400100024976955800358003569988370006800208002080020800351641180021109800100000000000502022253380024800100800108003680036800368003680036
80024800357020000000003525800208002080020400100124976955800358003569988370006800208002080020800351641180021109800100000000000502023253280024800100800108003680036800368003680036
80024800357020000000003525800208002080020400100004976955800358003569988370006800208002080020800351641180021109800100000000000502022253380024800370800108003680036800368003680036