Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTDA

Test 1: uops

Code:

  autda x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03mmu table walk instruction (07)l2 tlb miss instruction (0a)1e3a3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst int alu (97)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)ec? int retires (ef)f5f6f7f8fd
1004702971111526858242510001000100017833004939497029702966233681810001000200070298701110011000007658544678910000100070307030703070307030
100470296611026858242510001000100017833004939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
1004702966110211058242510001000100017833014939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
100470296511026858242510001000100017833014939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
100470296611026858242510001000100017833004939497029702966237681810001000208670298701110011000007648544678910000100070307030703070307030
100470296611026858242510001000100017833004939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
100470296511026858242510001000100017833004939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
100470296511026858242510001000100017833004939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
100470296611026858242510001000100017833004939497029702966233681810001000200070298701110011000007648544678910000100070307030703070307030
100470296611026858242510001000100017833004939497029710966233681810001000208270298701110011000007648544678910000100070307030703070307030

Test 2: Latency 1->1

Code:

  autda x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e18191e1f3a3f4d5051schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst int alu (97)l1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)a9acbranch call indir mispred nonspec (ca)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
10204700296190000000006159824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000000000710179116979610100101007003070069700307003070069
10204700296210100000006159824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000000000710179116979610103101007003070030700307003070069
10204700296511000000006159824025102001020010200180833019866949700297002968480368674102001020020200700299121110201100991010020100000710179116979610100101007003070030700307003070030
10204700296220000000006159824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000000000710179116979610100101007003070030700307003070030
102047002962000000000049759824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000000000710179116979610100101007003070030700307003070030
10204700296200000000006159824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000100000710179116979610100101007003070030700307003070030
10204700296200000000006159824025102001020010200180833004966949700297002968480368674102001020020200700549121110201100991010000000000710179116979610100101007003070030700307003070030
10204700296190000000006159824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000100000710179116979610100101007003070030700307003070030
10204700296200000000006159824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000000000710179116979610100101007003070030700307003070030
102047002962000000000010359824025102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000000000710179116979610100101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01)cycle (02)03mmu table walk data (08)0918191e1f3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst int alu (97)l1d tlb access (a0)l1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
10024700296170000210141059824251002310023100201807430049669497002970029685023686961002010020200207005987011100211091001000000000640279226980510010100107003070030700307003070030
10024700296570000006159824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
10024700296200000006159824251002010020100201807430049669497002970029685833686961002010020200207002987011100211091001000000000640379226980510010100107003070030700307003070030
10024700296290000006159824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
10024700296190000009459824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000010000640279326980510010100107003070030700307003070030
100247002962000000012659824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
100247002962000000012459824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000000001640279226980510010100107003070030700307003070030
10024700296210000006159824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030
100247002962100000023259824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000030000640279226980510010100107003070030700307003070030
10024700296200000006159824251002010020100201807430049669497002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  autda x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)18191e1f3a3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)5e60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst int alu (97)l1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eb? int retires (ef)f5f6f7f8fd
302048002969900000000616979925202002020020200494260101497694980029800297596137618120200202004020080029144113020110099301000010000019102721179794201000301008003080030800308003080030
30204800297450000000010369799252020020200202004942601004976949800298002975961376181202002020040200800291441130201100993010000410000019101721179794201000301008003080030800308003080030
302048002970600000000616979925202002020020200494260100497694980029800297596137618120200202004020080029144213020110099301000060000019101721179794201000301008003080030800308003080030
302048002970200021088010369799252020020200202004942601004976989800298002975961376181202002020040200800291441130201100993010000630000019101721179794201000301008003080030800308003080030
30204800297020000012001036979925202002020020200494260100497694980029800297596137618120200202004020080029144113020110099301000420300019101721179794201000301008005680030800308003080030
3020480029701000000017496979925202002020020200494260100497694980029800297596137618120200202004020080029144113020110099301000050000019101721179794201220301008003080030800308003080030
302048002974400000000616979925202002020020200494260101497694980029800297596137618120200202004020080029144113020110099301000000000019101721179794201000301008003080030800308003080030
302048002970200000000896979925202002020020200494260101497694980029800297596137618120200202004020080029144113020110099301000000000019101721179794201000301008003080030800308003080030
30204800297020000012001566979925202002020020200494260101497571780029800747596137618120200202004020080029144113020110099301000000000019101721179794201000301008003080030800308003080030
302058002970200000000616979925202002020020200494260101497694980029800297596137618120200202004020080029144113020110099301000010600019101721179794201000301008003080030800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e18191e1f3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst int alu (97)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acbranch indir mispred nonspec (c6)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)ec? int retires (ef)f5f6f7f8fd
300248002969600000000061697992520020200202002049520481497694908002980029759833762092002020020400208002914411300211093001000109018903724379803200100300108003080030800308003080030
3002480029700000000012061697992520020200202002049520481497694908002980029760443762082002020020400208002914411300211093001000003018903723379803200100300108003080030800308003080030
30024800297000000011120105569799252002020020200204952048149769490800298002975983376203200202002040020800291441130021109300100020001890372337980320010349300108003080030800308003080030
300248006770000000000089697992520020200202002049520481497694908002980029759833762112002020020400208002914411300211093001000109018903723379803200100300108003080030800308003080030
300248002970000000000061697992520020200202002049520481497694908002980029759833762032002020020400208002914411300211093001000000018903723379803200100300108003080030800308003080030
30024800297160000000120616979925200202002020020496061214976949080029807757598316763092002020517413108002914411300211093001000100018903723380055200150300108003080030800308034480030
3002480159701000000090150697992520052200202008649520481497694908002980029759833762112002020020400208002914411300211093001000000018903723379803200100300108003080030800308003080030
300248002970000000020061697992520020200202002049520481497694908002980029759833762032002020020400208002914421300211093001000003018903723379803200100300108003080030800308003080030
30024800296960000000120103697994420020200202002049520481497694908002980029759833762032002020020400208002914411300211093001000000118903723379808200100300108003080030800308003080030
300248002969900000001201203697992520020200202002049520480497694908002980029759833762032002020020400208002914411300211093001000003018903724479823200100300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  autda x0, x8
  autda x1, x8
  autda x2, x8
  autda x3, x8
  autda x4, x8
  autda x5, x8
  autda x6, x8
  autda x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3f465051schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst int alu (97)l1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eb? int retires (ef)f5f6f7f8fd
80204800416980000120770025802008020080200401000049769558016880035699663699848020080200160200800351641180201100998010000020051103252280025801000801008003680036800368003680036
8020480035698000000350025802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000010351102252280025801000801008003680036800368003680036
80204800356450000120350025802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000051102252280025801000801008003680036800368003680036
8020480035745000000350025802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000000051102252280025801000801008003680036800368003680036
8020480035744000000630025802008020080200401000098769668003580035699663699848020080200160200800351641180201100998010002000051103252380025801000801008003680036800368003680036
8020480035699000000350025802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000051102252280025801000801008003680036800368003680036
80204800356970000003200025802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000010051103252280025801000801008003680036800368003680036
8020480035702000090350025802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000010051102252280025801000801008003680036800368003680036
8020480035701000000450025802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000020051102252280025801000801008003680036800368003680036
80204800357020000120770025802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000030051102252280025801000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3f51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst int alu (97)l1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
800248004064300000000077258002080020800204001001497695508003580035699883700068002080020160020800351641180021109800100001030005020225004358002480010800108003680036800368003680036
800248003569800000000077258002080020800204001001497695508003580035699883700068002080020160020800351641180021109800100000000005033425030458002480010800108003680036800368003680036
8002480035703000000000700258002080020800204001000497695508003580035699883700068002080020160020800351641180021109800100000000005020325050468002480010800108003680036800368003680036
800248003570200000000058258002080020800204001000497695508003580035699883700068002080020160020800351641180021109800100000000005020225030468002480010800108003680036800368003680036
80024800357020000000002252580020800208002040010004976955080035800356998837003780020800201600208003516411800211098001000010003050204251000248002480010800108003680036800368003680036
8002480035701000000012035258002080020800204001000497695508003580035699883700068002080020160020800351641180021109800100000000005020433600428002480052800108003680036800368003680036
8002480035702000000000756258002080020800204001000497695508003580035699883700068002080020160020800351641180021109800100000030005020425600758002480010800108003680036800368003680036
800248003570400000000035258002080020800204001000497695508003580035699883700068002080020160020800351641180021109800100001000005020425000448002480010800108049080447805808071280626
800248109269800111141816081056432834680335803778035040159804977640080670807627025869705338035380522160290800351641180021109800100000000005020425000428002480010800108003680036800368003680036
8002480035703000000000510258002080020800204001000497695508003580035699883700068002080020160020800351641180021109800100001000005020425000428002480010800108003680036800368003680036