Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTIZB

Test 1: uops

Code:

  autizb x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100470297193465824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030
1004702970147615824251000100010001783300493949702970296623368181000100010007029870111001100000731851267891000100070307030703070307030
1004702970132615824251000100010001783300493949702970296623368181000100010007029870111001100003731851167891000100070307030703070307030
10047029700615824251000100010001783300493949702970296623368181000100010007029870111001100000731851168051000100070307030703070307030
1004702971180615824251000100010001783300493949702970296623368181028100010007029870111001100003731851167891000100070307030703070307030
10047029700615824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030
1004702970121035824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030
10047029706615824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030
1004702970498615824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030
10047029700615824251000100010001783300493949702970296623368181000100010007029870111001100000731851167891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autizb x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029617000000061598242510200102001020018084270496694907002970029684803686741020010200102007002991211102011009910100000000007262791169817101000101007003070030700307003070030
1020470029620000000061598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100200000007101791169796101000101007003070030700307003070030
10204700296210000000276598242510203102001020018083300496694907002970029684803686741045810200102007002991211102011009910100000000007101791169796101000101007003070030700307003070030
10204700296200000001375859770151102231022610346180842704967811070029708516871396692581062210200102007002991211102011009910100000003007101791169796101000101007006970307701877007170030
10204700296200000000547598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000000007101791169796101000101007003070030700307003070030
1020470029621000000161598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000100007101791169796101000101007003070030700307003070030
10204700296210000120089598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000003007101791169796101000101007003070030700307003070030
102047002965600000005792598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000006007101791169796101000101007003070030700307003070030
1020470029617000000061598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000000007101791169796101000101007003070030700307003070030
10204700296120000120061598244610200102001020018083300496694907002970029684807686741020010200102007006991221102011009910100000003007101791169796101000101007003070070700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700296170000061598242510020100201002018074300496694970029700296850236869610020100201002070029174411100211091001000006403792269805100100100107003070030700307003070030
10024700296520000080459824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000096402792269805100100100107003070030700307003070030
10024700296210002708259824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001001006402792269805100100100107003070030700307003070030
1002470029626000006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000073586402952269805100100100107003070030700307003070030
1002470029622100120184459824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000006402792269805100100100107003070030700307003070030
10024700296220000061598242510020100201002018074300496694970029700296850236869610020100201002070029870111002110910010000156402792269805100100100107003070030700307003070030
1002470029622000006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000096402792269805100100100107003070030700307003070030
1002470029621000006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000006402792269805100100100107003070030700307003070030
10024700296230001808959824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001010074906402793269805100100100107003070030700307003070030
1002470029620000006159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000006402792269805100100100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  autizb x0
  autizb x1
  autizb x2
  autizb x3
  autizb x4
  autizb x5
  autizb x6
  autizb x7

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480041643000015056110802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010001005110325228002580100801008003680036800368003680036
80204800357440100007725802008020080200401000149769550800358003569966369984802008020080200800351641180201100998010001005110225228002580100801008003680036800368003680036
8020480035702000000156225802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000005110225228002580100801008003680036800368003680036
80204800357020000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000005110225228002580100801008003680036800368003680036
80204800357020000003525802008020080200401000149769550800358003569966369984802008020080200800351641180201100998010000005110225228002580100801008003680036800368003680036
80204800357020000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010001005110217228002580100801008003680036800368003680036
802048003570200000060525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000005110225228002580100801008003680036800368003680036
80204800357020000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010000005110225208002580100801008003680036800368003680036
80204800357020000003525802008020080200401000049769550800358003569966369984802008020080200800351641180201100998010001005110225228002580100801008003680036800368003680036
80204800357020000004025802008020080200401000149769550800358003569966369984802008020080200800351641180201100998010000005110225228002580100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 18 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ab | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800406960000026425800208002080020400100149769558003580035699883700068002080020800208003516411800211098001000000000502013251158002480010800108003680036800368003680036
800248003570000000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000000050205256118002480010800108003680036800368003680036
800248003574600000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000000050204255128002480010800108003680036800368003680036
800248003575000000352580020800208002040010014976955800728003569988370006800208002080020800351641180021109800100000000050205254108002480010800108003680036800368003680036
8002480035701000001434258002080020800204001001497695580035800356998837000680020800208002080035164118002110980010000000005020925958002480010800108003680036800368003680036
800248003570000000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100001000050203254128002480010800108008180036800368003680036
800248003569900000352580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100001000050206251048002480010800108003680036800368003680036
8002480035699000003525800208002080020400100149769558003580035699883700068002080020800208003516411800211098001000000000502012251258002480010800108003680036800368003680036
8002480035700000007002580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000000050205331048002480010800108003680036800368003680036
800248003570000000572580020800208002040010014976955800358003569988370006800208002080020800351641180021109800100000002050205256118002480010800108003680036800368003680036