Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTIA

Test 1: uops

Code:

  autia x0, x1
  mov x0, 1

(requires an arm64e binary and the -arm64e_preview_abi boot argument)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100470297000615824251000100010001783301493949702970296623368181000100020007029870111001100000731851167891000100070307030703070307030
1004702970112615824251000100010001783301493949702970296623368181000100020007029870111001100000731851167891000100070307030703070307030
1004702970006158242510001000100017833014939497029702966233681810001000200070298701110011000001320260059721001100060306030603060306030
100460297600554950221001100110005272500060296029567035751100000602922031110011000001320260059721001100060306030603060306030
100460297600554950221001100110005272500060296029567035751100000602922031110011000201320260059721001100060306030603060306030
100460297500554950221001100110005272500060296029567035751100000602922031110011000001320260059721001100060306030603060306030
10046029760099495022100110011000527250006029602956843575110000060292203111001100000731851167891000100070307030703070307030
1004702961001035824251000100010001783300493949702970296623368181000100020007029870111001100000731851167891000100070307030703070307030
10047029640121035824251000100010001783300493949702970296623368181000100020007029870111001100000731851167891000100070307030703070307030
10047029660126158242510001000100017833004939497029702966233681810001000200070298701110011000190731851167891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autia x0, x1
  mov x0, 1

(requires an arm64e binary and the -arm64e_preview_abi boot argument)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204700296180000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179126979610100101007003070030700307003070030
10204700296200000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000030710179116979610100101007003070030700307003070030
10204700296220000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179116979610100101007003070030700307003070030
102047002962200000000615982425102001020010200180833014966949070029700296848036867410200102002020070029912111020110099101000000240710179116979610100101007003070030700307003070030
1020470029622000000001248598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179116979610100101007003070030700307003070030
10204700296230000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000030710179116979610100101007003070030700307003070030
10204700296220000000061598242510205102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179116979610100101007003070030700307003070030
10204700296230000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179116979610100101007003070030700307003070030
102047002962200000000726598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179116979610100101007003070030700307003070030
10204700296210000000061598242510200102001020018083301496694907002970029684803686741020010200202007002991211102011009910100000000710179116979610100101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002470029616000000000209598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
1002470029622000000012061598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000001000640279236980510010100107003070030700307003070030
100247002962200000000082598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
100247002962200000000061598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
10024700546140000000002110598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
1002470029623000000030156598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
1002470029622000000000103598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
100247002962000000000061598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
100247002962100000000061598242510020100201002018074301496694907002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
1002470029622000000000103598242510020100201002018074301496694907002970029685023686961002010020200207005487011100211091001000000000640279226980510010100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  autia x0, x1
  mov x0, 1

(requires an arm64e binary and the -arm64e_preview_abi boot argument)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302048002969700000089697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000000191058033797942010000301008003080030800308003080030
302048002975000003061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000000191037233797942010000301008003080030800308003080030
302048002970000000061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000000191037233797942010000301008003080030800308003080030
302048002970000000061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000000191037233797942010000301008003080030800308003080030
3020480029700000012061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000010191037233797942010000301008003080030800308003080030
302048002974800000076697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000000192937233797942010000301008003080030800308003080030
302048002975000000061697994520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000200000000191037233797942010000301008003080030800308003080030
3020480029699100000726697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000000000191037233797942010000301008003080030800308003080030
302048002970100000061697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000000000191037233797942010000301008003080030800308003080030
302048002970000000061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000006000191037233797942010000301008003080030800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002480060745000000616979902520020200202002049520481497694908002980029759833762032002020020400208002914411300211093001000000018903720227980320010300108003080030800308003080030
3002480029743000000616979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001000000018902720227980320010300108003080030800308003080030
3002480029783000000896979902520020200202008649520480497694908002980029759613762032002020020400208002914411300211093001000000018902720227980320015300108003080030800308003080030
30024800296970000002516979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001000030018902720227980320010300108003080030800308003080030
30024800297010000120616979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001001000018902720227980320010300108003080030800308003080030
30024800297020100120616979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001000030018902720227980320010300108003080030800308003080030
3002480029700000000616979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001000000018902720227985620010300108003080030800308003080030
30024800297020000007266979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001001000018902720227980320010300108003080030800308003080030
3002480029702000000616979902520020200202002049520480497694908002980029759833762032002020020401828002914421300211093001001030018902720327980320010300108003080030800308003080030
3002480029701010000616979902520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001000000018902720227980320010300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  autia x0, x8
  autia x1, x8
  autia x2, x8
  autia x3, x8
  autia x4, x8
  autia x5, x8
  autia x6, x8
  autia x7, x8

(requires an arm64e binary and the -arm64e_preview_abi boot argument)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048004169900000000772580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110325228002580100801008003680036800368003680036
8020480035747000000003292580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110225228002580100801008003680036800368003680036
802048003569800000000352580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110225228002580100801008003680036800368003680036
8020480035714000000006782580200802008020040100014976955080035800356996636998480288802001602008003516411802011009980100000000005110225228002580100801008003680036800368003680036
8020480035699000000002252580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110225228002580100801008003680036801268008180217
802048003570200000300352580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000100005110225228002580100801008003680036800368003680036
802048003570100000000632580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110225228002580100801008003680036800368003680036
80204800357020000012001332580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110225228002580100801008003680036800368003680036
8020480035702000000001752580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000100005110225228002580100801008003680036800368003680036
8020480035702000001200352580200802008020040100014976955080035800356996636998480200802001602008003516411802011009980100000000005110233228002580100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800526970000000352580020800208002040010004976955800358003569988370006800208002016002080080164218002110980010000000015020525448002480010000800108003680036800368003680036
80024800357000000030182725800208007680020400100049769558003580035699883700068002080020160020800351641180021109800100001015005020425428002480010000800108003680036800368003680036
800248003570000100120352580020800208002040010014976955800358003569988370006800208002016002080035164118002110980010000003005033225248002480010000800108003680036800368003680036
80024800356990000030016602580020800208002040010014976955800358003569988370006800208002016002080035164118002110980010002000005020225248002480010000800108003680036800368003680036
80024800357500000000352580020800208002040010014976955800358003569988370006800498002016002080035164118002110980010000000005020225248002480010000800108003680036800368003680036
800248003569900000001752580020800208002040010014976955800358003569988370006800208002016002080035164118002110980010000000005020225248002480010000800108003680036800368003680036
80024800356990000000352580020800208002040010014977000800358003569988370006800208002016002080035164118002110980010000000005020425428002480010000800108003680036800368003680036
80024800357000000000352580020800208002040010014976955800358003569988370006800208002016002080035164218002110980010000100005020425428002480010000800108003680036800368003680036
800248003570100000600772580020800208002040010014976955800358003569988370006800208002016002080035164118002110980010000000005020425428002480010000800108003680036800368003680036
80024800357020000000772580041800208002040010014976955800358003569988370006800428002016002080035164118002110980010000000005020425428002480010000800108003680036800368003680036