Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTDZA

Test 1: uops

Code:

  autdza x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10047029650615824251000100010001783301493949702970296623368181000100010007029870111001100000732853367891000100070307030703070307030
100470296512615824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
100470296527615824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
10047029670615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
10047029670615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702965306615824251003100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702965321615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
100470296657615824251000100010001783301493949702970296623368181000100010007029870111001100010733853367891000100070307030703070307030
10047029660615824251000100310001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
100470296605365824251000100010001783301493949702970296656368181000100010007029870111001100000733853367891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autdza x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047002961700000061598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000000000710379116979610100101007003070030700307003070030
1020470029618000000726598154610222102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000000000710179116979610100101007003070030700307003070030
102047002962100000061598242510200102001020018083301496694907002970029684803686741020010200102007002991211102011009910100000030000710179116979610100101007003070030700307003070030
102047002962100000061598242510200102001020018083301496694907002970029684803686741020010200102007002991211102011009910100000001000710179116979610100101007003070030700307007070030
1020470029620000000113598242510200102001020018083300496694907006870029684803686741020010200102007002991211102011009910100000017450000710187116979610104101007003070030700307003070030
1020470029620000000126598242510200102001020018083301496694907002970029684803686741020010200102007002991211102011009910100200000000710179116982010100101007003070030700307003070030
1020470029620010120061598154710200102001020018083301496694907002970029684803686741020010200102007002991211102011009910100000000000710187116979610100101007003070030700307003070030
102047002962000000089598242510200102001020018083300496694907002970029684803686741022810200102007002991211102011009910100000000000710179116979610100101007003070030700307003070030
102047002962100000061598242510200102001020018083300496694907002970029684803686741020010200102007002991211102011009910100000000000710179116979610100101007003070030700307003070030
102047002967400000061598242510200102001020018083301496694907002970029684803686741020010200102007002991211102011009910100000000000710179116979610100101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002961900012010359824251002010020100201807430149669497002970029685023686961002010020100207002987011100211091001000103640279226980510010100107003070030700307003070030
1002470029649000006159824251002010020100201807430049669497002970029685023686961002010020100207005487011100211091001000103640279226980510010100107003070030700307003070030
100247002962300012010359824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000103640279236980510010100107003070030700307003070030
10024700296190001206159824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000103640279326980510010100107003070030700557003070030
100247002962200012010359824251002010020100201807531049669497002970029685023686961002010020100207002987011100211091001000103640279326980510010100107003070030700307003070030
10024700296220000010359824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000103640279226980510010100107003070030700307003070030
100247002962300012010359824251002010020100201807430049669497002970029685023686961002010020100207002987011100211091001000103654279226980510010100107003070030700307003070030
100247002962400012010359824251002010020100201807430049669497002970029685028687211002010020100207002987011100211091001000103640379226980510010100107003070068700307003070030
100247002962300012010459815251002010020100201807430049669497002970029685023687221002010020100207002987011100211091001000103640279226980510010100107003070030700307003070030
100247002962200012010459824251002010020100201807430049669497002970068685023686961002010020100207002987011100211091001000100640279226980510010100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  autdza x0
  autdza x1
  autdza x2
  autdza x3
  autdza x4
  autdza x5
  autdza x6
  autdza x7

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480062700000000035258020080200802004010001497695508003580035699663699848020080200802008003516411802011009980100000000005110225238002580100801008003680036800368003680036
8020480035698000000035258020080200802004010000497695508003580035699663699848020080200802008003516411802011009980100000000005110225238002580100801008003680036800368003680036
8020480035702000000035258020080200802004010001497695508003580035699663699848020080200802008003516411802011009980100000000005110225228002580100801008003680036800368003680036
8020480035703000000035258020080200802004010001497695508003580035699663699848020080200802008003516411802011009980100002000005110225528002580100801008003680036800368003680036
80204800806990000720077258020080200802004010000497695508003580035699663699848020080200802008003516411802011009980100000000005110225228002580100801008003680036800368003680036
8020480035702000000035258020080200802004010000497695508003580035699663699848020080200802008003516411802011009980100000030005110225228002580100801008003680036800368003680036
8020480035699000000035258020080200802004010001497695508003580035699663699848020080200802008003516411802011009980100000000005110225328002580100801008003680036800368003680036
8020480035700000000035258020080200802004010001497695508003580035699663699848020080200802008003516411802011009980100000000005110225228002580100801008003680036800368003680036
802048003570200002700203258020080200802004010000497695508003580035699663699848020080200802008003516411802011009980100000000005110225228002580100801008003680036800368003680036
80204800357010000000510258020080200802004010001497695508003580035699663699848020080200802008003516411802011009980100000000005110225208002580100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248008569900000014400258258004180020800204001001497700180080800806998837000680020800208005480035164118002110980010000000005020052534800248001059800108003680036800368003680036
80024800357000000000013525800208002080020400207149769558003580035699883700068004280020800208003516411800211098001002000247505033032534800598001029800108008180036800368003680036
8002480081699000000000119258002080020800204002070497695580080800356998837000680020800208002080035164118002110980010000002005020032534800248001026800108003680081800368003680082
800248003570000001008803525800208006280020400100149769558003580035699883700068002080052800208003516411800211098001004000000502033253480024800100800108008280036800368003680080
80024800357480000000007322580020800208002040010004976955800358003570007370040800428002080053800911641180021109800100000105050502003254480024800100800108003680036800828008280036
800248003575100000008805725800208002080020400100049769558003580035699883700068002080020800208008116411800211098001000000000503304254380024800100800108003680036800368008180081
8002480035699000000132006746800418002080020400100049769558003580172699886700068002080053800208003516411800211098001000001002502004334380024800100800108008280036800368003680036
80024800817000110001320010047800688004180020400314049769558008080035699883700408004280020800528008116411800211098001000000090502003253580024800100800108003680081800368008180036
8002480035697000000000157725800418002080020400100049769558003580080700063700068002080020800208003516411800211098001000211000502004253480024800100800108008280081800828003680036
8002480035699000000132002312580020800208004240010004976955800358003570007870006800208002080020800351642180021109800100000005100503305254380059800100800108008080036800368012680036