Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTDB

Test 1: uops

Code:

  autdb x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004702970000192582425100010001000178330049394970297029662336818100010002000702987011100110000073285001167891000100070307030703070307030
100470297003061582425100010001000178330149394970297029662336818100010002000702987011100110001373193001167891000100070307030703070307030
100470296500061582425100010001000178330049394970297029662336818100010002000702987011100110000073285002167891000100070307030703070307030
100470297000061582425100010001000178330149394970297029662336818100010002000702987011100110000373285001167891000100070307030703070307030
100470297000061582425100010001000178330149394970297029662336818100010002000702987011100110000073185001167891000100070307030703070307030
100470297000061582425100010001000178330049394970297029662336818100010002000702987011100110000073285001167891000100070307030703070307030
1004702970012061582425100010001000178330149394970297029662336818100010002000702987011100110000073185001167891000100070307030703070307030
100470297000061582425100010001000178330149394970297029662336818100010002000702987011100110000073185001167891000100070307030703070307030
1004702971000103582425100010001000178330049394970297029662336818100010002000702987011100110000073185001267891000100070307030703070307030
100470297000061582425100010001000178330149394970297029662336818100010002000702987011100110001073185001167891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autdb x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029621000001200615982425102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010001007102791169796101000101007003070030700307003070030
10204700296500000093600615982425102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000007101791169796101000101007003070030700307003070030
1020470029622000001200615982425102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000007101791169796101000101007003070030700307003070030
102047002962200000000615982425102001020010200180833004966949700297002968480368674102001020020200700299121110201100991010000007101791169796101000101007003070030700307003070030
10204700296210000000018695980625102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010001037101791169796101001101007003070030700307003070030
102047002965000000954001315982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000007101791169796101000101007003070030700307003070030
102047002962300000000615982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000007101791169796101000101007003070030700307003070030
1020470029623000001200845982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010002007101791169796101000101007003070030700307003070030
1020470029622000001041002645982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000037101791169796101000101007003070030700307003070030
102047002962200000938806159824251020010200102001808330149669497002970029684803686741020010200202007002991211102011009910100000187101791169796101000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | a9 | ab | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002961900103598242510020100201002018074300496694970029700296850236869610020100202002070029870111002110910010000300640279226980510010100107003070030700307003070030
10024700296220061598242510020100201002018074300496694970029700296850236869610020100202002070029870111002110910010800000640279236980510010100107003070030700307003070030
10024700296220089598242510020100201002018074300496694970029700296850236869610020100202002070029870111002110910010002300640379226980510010100107003070030700307003070030
100247002962301261598242510020100201002018074300496694970029700296850236869610020100202002070029870111002110910010000000640279226980510010100107003070030700307003070030
10024700296220061598242510020100201002018074301496694970029700296850236869610020100202002070029870111002110910010100000640279226980510010100107003070030700307003070030
10024700296220061598242510020100201002018074301496694970059700296850236869610020100202015470029870111002110910010000000640279326980510010100107003070030700307003070030
10024700296220061598242510020100201002018074300496694970029700296850236869610020100202002070029870111002110910010000000640279226980510010100107003070030700307003070030
10024700296180222615982425100201002010020180743004966949700297002968502368696100201002020020700298701110021109100101004800640279226980510010100107003070030700307003070030
10024700296180061598242510020100201002018074301496694970029700296850236869610020100202002070029870111002110910010000000640279226980510010100107003070030700307003070030
10024700296220061598242510020100201002018074300496694970029700296850236869610020100202002070029870111002110910010100000640279226980510010100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  autdb x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204800296960000000061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000002111920116117987020100301008003080030800308003080030
30204800297500000001201596979925202002020020211494304914976949800298002976028776228202112022740254800291441130201100993010020570001111941116117987020100301008003080056800308003080030
302048002974800000000616979925202002020020211494304914976949800298002976020776228202112022740254800291441130201100993010000750301111920116117987020100301008003080030800308003080030
302048002975100000012061697992520200202002021149430491497694980029800297602077622920211202274025480029144113020110099301000000001111920116117987020100301008003080030800308003080030
302048002970000000000349697992520200202002021149430491497694980029800297602077622820211202274025480029144113020110099301000000001111920116117987020100301008003080030800308003080030
302048002974800000000726697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000001910172117979420100301008003080030800308003080030
30204800297490000000061697992520207202002020049426011497694980029800297596177618120200202004020080029144113020110099301000000000001910172117979420100301008003080030800308003080030
302048006974900004012352104697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000010300001910172117979420100301008003080030800308003080030
302048002969800000000103697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000001910172117979420100301008003080030800308003080030
30204800297500000010061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000000001910172117979420100301008003080069800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024800296430000012019669799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000030018900272227980320010300108003080030800308003080030
3002480029703000000077069799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000001000018900272227980320010300108003080030800308003080030
300248002970200000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001004000000018900272227980320010300108003080030800308003080030
300248002969900000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000000018900272227980320010300108003080030800308003080030
30024800297020000000113869799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000030018900272227980320010300108003080030800308003080030
300248002969700000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000000018900272227980320010300108003080030800308003080030
300248002970200000306169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000000018900272227980320010300108003080030800308003080030
300248002970300000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000000018900272217980320010300108003080030800308003080030
3002480029702000001206169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000000018900272227980320010300108003080030800308003080030
3002480029702000000072669799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000000018900272227980320010300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  autdb x0, x8
  autdb x1, x8
  autdb x2, x8
  autdb x3, x8
  autdb x4, x8
  autdb x5, x8
  autdb x6, x8
  autdb x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800416960000003525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000051104253380025801000801008003680036800368003680036
80204800357000000007725802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000351103253380025801000801008003680036800368003680036
802048003575100000010525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000951103253380025801000801008003680036800368003680036
80204800357010000003525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000351103253380025801000801008003680036800368003680036
8020480035699000012070025802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000051103253380025801000801008008280036800368003680036
80204800357000000003525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000010051233253380025801000801008003680036800368003680036
8020480035700000012022525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010020000351103253380025801000801008003680036800368003680036
802048003574800000014725802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000010051483255680025801000801008003680036800368003680036
80204800356980000003525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000051103253380025801000801008003680036800368003680036
802048003575000001207725802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000051103253380025801000801008021680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800406930000000180070025800208004180020400100149769550800358003569988370006800208002016002080081164118002110980010000000000503301625017680024800100800108003680036800368003680036
80024800357510000000000352580020800208002040010014976955080035800356998877000680020800201600208003516411800211098001000001030050200625017880024800100800108003680036800368003680036
800248003569900000000007004680020800208002040010014976955080035800356998837000680020800201600208003516411800211098001000000000050200625061780024800100800108003680036800368003680036
8002480035701000000112009772580020800208002040010014976955080040800356998837000680020800201600208003516411800211098001000001000050200625061780024800100800108003680036800368003680036
800248003574700000000007002580020800208002040020714976955080081800356998837000680020800201600208003516411800211098001000001030050200625061780024800100800108003680036800368003680036
800248003569300000001200632580020800208002040010014976955080035800356998837000680020800201600208003516411800211098001000000000050330525061780024800100800108003680036800368003680036
800248003569900000000006325800208002080020400100149769550800358003569988770006800208002016002080035164118002110980010020012300502001725017680024800100800108003680036800368003680036
80025800356990000000000352580020800208002040010014976955080035800356998837000680020800201600208003516411800211098001000000030050200625061680024800100800108003680036800368003680036
800248008069700000000003525800208002080020400100149769550800358003569988370006800208002016002080035164118002110980010000010000502001725017680024800100800108003680036800368003680036
8002480035700000000012007725800208002080020400100149769553800358003569988370006800208002016002080035164118002110980010000000000502001725017680024800100800108003680036800818003680036