Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

AUTIB

Test 1: uops

Code:

  autib x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10047029666661582425100010001000178330049394970297029662336818100010002000702987011100110000000732853367891000100070307030703070307030
1004702967661582425100010001000178330049394970297029662336818100010002000702987011100110000100733853367891000100070307030703070307030
1004702967661582425100010001000178330049394970297029662336818100010002000702987011100110000000733853367891000100070307030703070307030
100470296739661582425100010001000178330049394970297029662336818100010002000702987011100110000000734853367891000100070307030703070307030
1004702966061582425100010001000178330149394970297029662336818100010002000702987011100110000000733853367891000100070307030703070307030
1004702966961582425100010001000178330149394970297029662336818100010002000702987011100110000000733853367891000100070307030703070307030
1004702966661582425100010001000178330149394970297029662336818100010002000702987011100110000000733853367891000100070307030703070307030
1004702967061582425100010001000178330049394970297029662336818100010002000702987011100110000000733853367891000100070307030703071107030
1004702965082582425100010001000178330049398870297029662336818100010002000702987011100110000000733853367891000100070307030703070307030
1004702967661582425100010001000178330149394970297029662336818100010002000702987011100110000000733853367891000100070307030703070307030

Test 2: Latency 1->1

Code:

  autib x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e8 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029617000018004415982425102001020010200180833014966949700297002968480368674102001020020232700299121110201100991010000000710179116979601010000101007003070030700307003070030
102047002965500000007265982425102001020010200180833014966949700297002968480368775102841029020200700299121110201100991010000030710179116979601010000101007003070030700307003070030
1020470029655000018001035982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000300710179116979601010000101007003070030700307003070030
10204700296550000000615982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000000710179116979601010000101007003070030700307003070030
10204700296200000000615982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000031710179116979601010000101007003070030700307003070030
10204700296180000000615982425102001020410200180833014966949700297002968480368674102001020020200700299121110201100991010000130710179116979601010000101007003070030700307003070030
102047002962000000006159824251020010200102001808330149669497002970029684803686741020010200202007002918711110201100991010000000710179116979601010000101007003070030700307003070030
102047002962000000007265982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000000710179116979601010000101007003070030700307003070030
102047002965600000007265982425102001020010200180833014966949700297002968480368674102001020020200700299121110201100991010000200710179116987201010000101007003070030700307003070030
102047002961900001200615982425102001020410200180833014966949700297002968480368674102001020020200700299121110201100991010000114000710179116979601010000101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | 09 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100247002968300000325159824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000001640479446980510010100107003070030700307003070030
10024700296580000043959824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000100640479436980510010100107003070030700307003070030
10024700296200000019859815251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000100640479436980510010100107003070030700307003070030
10024700296210000020859824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000030640479446980510010100107003070030700307003070030
10024700296180000053559824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000000640479436980510010100107003070030700307003070030
10024700296200000058459824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000000640479446980510010100107003070030700307003070030
10024700296200000046159824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000030649479436980510010100107003070030700307003070030
100247002963900120036859824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000100640479446980510010100107003070030700307003070030
10024700296200000027659824251002010020100481807430149669497002970029685023686961002010020200207002987011100211091001000130640379346980510010100107003070030700307003070030
10024700296210000013159824251002010020100201807430149669497002970029685023686961002010020200207002987011100211091001000000640387346980510010100107003070030700307003070030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  autib x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | ac | bb | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302048002969900000082697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910272117979420100301008003080030800308003080030
302048002969700000061697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008003080030800308003080030
302048002969900000061697952520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301001000001910172117979420100301008003080030800308003080030
302048002970200012001036979925202002020020200494260104976949800298002975961376181202632020040200800291442130201100993010011200001910172117979420100301008003080030800308003080030
302048002974600000061697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008003080030800308003080030
302048002970200000061697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008005580030800308003080030
302048002971700000061697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008003080030800308003080030
3020480029744000000726697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008003080030800308003080030
302048002970100000061697992520200202002020049426010497698880029800297596137618120200202824020080029144113020110099301000000001910172117979420100301008003080030800308003080030
3020480029702000000616979925202002020020200494260104976949800298002975961376181202002020040200800291441130201100993010001800001910172117979420100301008003080154800308003080918

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5e | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300248011369600000008816169799252002520025200204952048004976949800298002975969376203200202010240020800291441130021109300100000000018902720279803200100300108003080030800308003080030
300248002970000000000072769799252002020029200204952048014976995800298002975983376203200872002040020800291441130021109300100000000018902722279803200100300108003080030800308003080070
30024800297170001000006169799462002520020200204951303014976949800298002975983776209200862002040020800293051130021109300100000000018902184279803200140300108006980030800308003080030
3002480029750000000017606169799252002020020200864952048014976949800698002975969376203200852002040020800291441130021109300100020000019092722279803200100300108003080075800758003080030
300248002970000000013200162269799252002020020200844952048004976949800298002975983376203200872010040020800291441130021109300100001018250019092722279803200100300108003080069800758015280030
300248002970000100082866006169799252002020020200884952048004976949800298002975983876203200202002040020800751441130021109300100000000018902722379803200100300108007580075800308003080030
300248002970000010000025169799252002020020200874952048004976994800698007575983776218200202002040020800291441130021109300100000000018902962279803200100300108007580030800308003080075
30024800297000000001320011769799252002020020200204952048004976949800298006975983376203200202018240020800291441130021109300100000000018902722279803200100300108003080030800308003080030
30024800607500000010006169799252002020024200204952048004976949800298002975983376218200202002040020800291441130021109300100000032118902722279803200140300108003080030800308003080030
300248002970001001100059169799252002020020200204950575014976949800298002975975376203200972002040182800291441130021109300103001019152018902722279803200100300108003080030800308020680030

Test 4: throughput

Count: 8

Code:

  autib x0, x8
  autib x1, x8
  autib x2, x8
  autib x3, x8
  autib x4, x8
  autib x5, x8
  autib x6, x8
  autib x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800626970000000510258020080200802004010001497695580035800356996636998480200802001602008003516411802011009980100003003005110225328083880604801008112581073808958103781164
80204811257070002024304816129579458020080200802004010001497768180897807607032586707418062080767161892809841641818020110099801002201211563205524233228029680542801008008080042800418004280036
8020480035787000000035258020080200802004010000497695580035800356996636998480200802001602008003516411802011009980100000003005110225228002580100801008003680036800368003680036
802048003575000000132077258020080200802004010000497695580080800356996636998480200802001602008003516411802011009980100000003005110225228005880100801008003680036800368003680036
8020480035696000000035258020080200802004010000497695580035800356996636998480200802001602008003516411802011009980100000000005124225228002580100801008008280036800368003680036
8020480035794000000035258020080200802004010000497695580035800356996636998480200802001602008003516411802011009980100000000005110225228002580100801008003680036800368003680036
8020480035702000000035258020080200802004010000497695580035800356996636998480200802001602008003516421802011009980100000000015110225228002580100801008003680036800368008280036
80204800356990000000352580200802008020040100004976955800818003569966369984802008020016020080035164118020110099801000000015005110225228002580100801008003680036800368003680036
8020480035749000000035258020080200802004011070497695580035800356996636998480200802001602688003516411802011009980100000100005110225228002580100801008003680036800368003680036
80204800357000000000700258020080200802004010000497695580035800356996636998480200802001602008003516411802011009980100000000005110225228006080100801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248004069900000070025800208002080020400100104976955800358003569988370006800208002016002080035164118002110980010000000005020007253480024800100800108003680036800368003680036
80024800356960000003525800208002080020400100004976955800358003569988370006800208002016002080035164118002110980010000000005020005254380024800100800108003680036800368003680036
8002480035702000012075025800408002080020400100004976955800358003569988370006800208002016002080035164118002110980010000103005073004254380024800100800108003680036800368003680036
8002480035699000000700258002080020800204001000049769558003580035699881170006800208002016002080035164118002110980010000003005020004253580024800100800108003680036800368003680036
80024800357020000003525800208002080020400100104976955800358003569988370006800208002016002080035164118002110980010000000005020004254480024800100800108003680036800368003680036
800248003570200000012025800208002080020400100004976955800358003569988370006800208002016002080035164118002110980010000000005020005255380024800100800108003680036800368003680036
800248003570200000035258002080020800204001000049769558003580035699883700068002080020160020800351641180021109800100000048005020005254480024800100800108003680036800368003680036
80024800357010000003525800208002080020400100104976955800358003569988370006800208002016002080035164118002110980010000000005020002253480024800100800108003680036800368003680036
80024800357020000003525800208002080020400100004976955800358009969988370006800208002016002080035164118002110980010000000005020002255380024800100800108003680036800368003680036
800248003570200000022525800208002080020400100104976955800358003569988370006800208002016002080035164118002110980010000000005020004255380024800100800108003680036800368003680036