Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTIZA

Test 1: uops

Code:

  autiza x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004702970061582425100010001000178330493949702970296623368181000100010007029870111001100003733853367891000100070307030703070307030
1004702967061582425100010001000178330493949702970296623368181000100010007029870111001100000733852367891000100070307030703070307030
10047029640104582425100010001000178330493949702970296623368181000100010007029870111001100000733852367891000100070307030703070307030
1004702966061582425100010001000178330493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702970061582425100010001000178330493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
100470297027146582425100010001000178330493988702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702970061582425100010001000178330493949702970296623368181000100010007029870111001100010733853467891000100070307030703070307030
1004702970061582425100010001000178330493949702970296623368181028100010007029870111001100010733853267891000100070307030703070307030
1004706970061582425100010001000178330493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702970061582425100010001000178330493949702970296623368181000100010007029870111001100000733853467891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autiza x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047002962101000006159824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030
1020470029652000057008259824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030
102047002962100000006159824251020010200102001808330149669497002970029684803686741020010200102007002991211102011009910100007101791169934101000101007003070030700307003070030
102047002962100000008259824251020010200102001808330149669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030
102047002961900004620061598242510200102001020018083301496694970029700296848036867410200102001020070054912111020110099101000071017911697961010023101007003070030700307003070030
102047002965200000006159824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030
1020470029649000012008959824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100107101791169796101000101007003070030700307003070030
102047002962200000006159824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030
102047002962200003006159824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030
10204700296270000390010359824251020010200102001808330049669497002970029684803686741020010200102007002991211102011009910100007101791169796101000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002961700012006159824251002010020100201807430149669490700297002968502368696100201002010020700298701110021109100100000106402792369805100102100107003070030700307003070030
10024700296200000006159824251002010020100201807430149669490700297002968502368696100201002010020700298701110021109100100100006402792269805100100100107003070030700307003070030
10024700296190000006159824251002010020100201807430149669490700297002968502368696100201002010020700298701110021109100100100006402793269805100100100107003070030700307003070030
100247002962000000075459824251002010020100201807430149669490700297005468502368696100201002010020700298701110021109100100000006402793369805100100100107003070030700307003070030
100247002962000012006159824251002010020100201807430149669490700297002968502368696100201002010020700298701110021109100100000006402792269805100100100107003070030700307003070030
100247002962000000013159824251002010020100201807430149669490700297002968502368696100201002010020700298701110021109100100000006402792269805100100100107003070030700307003070030
10024700296210000006159824251002010020100201807430149669490700297002968502368696100201002010020700298701110021109100100000006402792269805100100100107003070030700307003070030
100247002962000000010359788251002010020101041807818149669490700687002968502368696100201002010020700298701110021109100100000006402793269805100100100107003070030700307003070030
100247006962100000013527596624541008710079105261809395149669490700297002968502368696100201066210560705811385161100211091001000040097242665470634100950100107121171209711717121171173
100247097068600033002288014214596175131009410089106651809273049678920709727105168760165694801086211273113087030012872110021109100100000006402792269805100100100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  autiza x0
  autiza x1
  autiza x2
  autiza x3
  autiza x4
  autiza x5
  autiza x6
  autiza x7

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048004170000001000772580200802008020040100004976955380035800806996636998480200802008020080035164118020110099801000001030051100225228002580100801008008280036800368017180036
802048003570200000120057802580200802008020040110704976955080035800356996636998480200802008020080035164118020110099801000002090051100225228002580100801008003680036800368003680036
80204800357020100012007422580200802008020040100004976955080035800356996686998480200802008020080035164118020110099801000001030051100225228002580121801008003680036800368003680036
8020480035697000001200352580200802008020040110704976955080035800356996636998480200802008020080035164118020110099801000001030051100225228002580100801008003680036800368003680036
8020480035702000004500772580200802008020040110704976955080035800356996636998480200802008020080035164218020110099801000001030051100225228002580121801008003680036800368003680036
802048003570200000120077258020080200802004010000497695508003580035699663699848020080200802008003516411802011009980100000221070800532102233238080780583801008085381212811708117080846
80204808507090100012003848521807258068780732403461049781270810868098670415107706428072881001809758112016424180201100998010000010300511028016008003780105801008004280041800418004280041
8020480040702000001200772580200802008020040100004976955080035800356996636998480200802008020080035164118020110099801000001030051100225228002580100801008003680036800368003680036
8020480035702000001200352580221802008020040100004976955080035800356996636998480200802008020080035164118020110099801000001030051100225228002580100801008003680036800368003680036
80204800356990000012007642580200802008020040100004976955080035800356996636998480200802348023480035164118020110099801000001000051100225228005780100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 18 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248004064311021018325800208002080020400100149769550800358003569988370006800208002080020800351641180021109800100001305022062574800248001005800108003680036800368003680036
8002480035645110120141258002080020800204001001497695508003580035699883700068002080020800208003516411800211098001000004505022042547800248001000800108003680036800368003680036
80024800356441100018325800208002080020400100149769550800358003569988370006800208002080020800351641180021109800100000005022072577800248001000800108003680073800368003680036
800248003564611053701115825800208002080020400100049769550800358003569988370006800208002080020800351641180021109800100000005022042547800248001000800108003680036800368003680036
80024800357021100015825800208002080020400100149769550800358003569988370006800208002080020800351642180021109800100000305022072574800248001000800108003680036800368003680036
800248003570211000110625800208002080020400100098769550800358003569988370006800208002080020800351642180021109800100000005022072574800248001000800108003680036800368003680036
80024800357021100014125800208002080020400100149769550800358003569988370006800208002080020800351641180021109800100000305022072574800248001000800108003680036800368003680036
8002480035700110120113625800208002080020400100049769550800358003569988370006800208002080020800351641180021109800100000005022242577800248001000800108003680036800368003680036
800248003570211000111125800208002080020400100049769550800358003569988370006800208002080020800351641180021109800100000204305022062563800248001000800108003680036800368003680036
80024800357451100014325800208002080020400100049769550800358003569988370006800208002080020800351641180021109800100000005022032536800248001000800108003680036800368003680036