Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

XPACD

Test 1: uops

Code:

  xpacd x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004702966000845824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702965110615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702966000615824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
10047029650001045824251000100010001783301493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702966000615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702966000615824251000100010001783301493949702970296623368181000100010007144870111001100000733853367891000100070307030703070307030
10047029650001055824251000100010001783301493949702970296623368181000100010007029870111001100000734853367891000100070307030703070307030
100470296500126158242510001000100017833004939497029702966233681810001000100070298701110011000018733853367891000100070307030703070307030
1004702967000615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030
1004702965000615824251000100010001783300493949702970296623368181000100010007029870111001100000733853367891000100070307030703070307030

Test 2: Latency 1->1

Code:

  xpacd x0
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020470029616100000000615982425102001020010200180833014496694907002970029684803686741020010200102007002991211102011009910100000000071040279106979610100101007007270030700307003070030
10204700296200000010001035982425102001020010200180833014986694907002970029684803686741020010200102007002991211102011009910100000010071040179116990810100101007003070030700307003070030
1020570029620000000000615982425102001020010200180833014496694907002970029684803686741023010200102007002991211102011009910100000000071040179116979610104101007003070030700307003070030
10204700296211000000007735982425102001020010229180833014496694907002970029684803686741020010200102007006991211102011009910100000000071040179116979610100101007003070030700697078771248
102047002965700000052800615982425102041020010200180833014496698807002970029684803686741020010200102007002991211102011009910100000000071000179116979610100101007003070030700307003070030
10204700296200000000011285982425102001020010200180873910496694907006870029684803686741020010200102007002991211102011009910100000000071000179116979610100101007003070030700307003070030
102047002962000000412001765982425102001020010200180833010496694907002970029684803686741020010200102007002991211102011009910100000000071000179116979610100101007003070030700707003070030
102047002961700005014400615982425102001020010200180833010496694907002970029684803686741020010200102007002991211102011009910100002000071000279116979610100101007003070069700307003070030
102047002961900000000057159824251020010200103161808330104966949070029700296848019686741020010200102007002991211102011009910100000000071000179116979610100101007003070030700307003070030
102047002962000000000072659824110102001020010200180833010496694907002970029685203686741020010200102007002991211102011009910100000000071000179116979610100101007003070030700697003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002470029617000000332200061598242510020100201002018074300496694907002970029685023686961002010020100207002987011100211091001000000000640279246980510010100107003070030700307003070030
1002470029655000000120061598242510020100201002018074301496694907006870029685023686961002010020100207002987011100211091001000010000640279226980510010100107003070030700307003070030
100247002962000000000071598152510020100201002018074301496694907002970029685023686961002010020100207002987011100211091001000000000640279246980510010100107003070030700307003070030
100247002962000000000061598242510020100201002018074300496390807002970029685023686961002010020100207002987011100211091001000000000640279246980510010100107003070030700307003070030
100247002962100000000061598242510020100201002018074300496694907002970029685023686961002010020100207002987011100211091001000000000640279246991510010100107003070030700307003070030
100247002961900000000061598242510020100201002018074300496694907002970029685023686961002010020100207002987011100211091001000010300640279246980510010100107003070030700307003070030
100247002961900010000061598242510020100201002018074300496694907002970029685023686961002010020100207002987011100211091001000000000640279226980510010100107003070030700307006970030
10024700296200000000005802598242510020100201002018074300496694907002970029685023686961002010020100207002987011100211091001000000166500654279226980510010100107003070059700307006970030
1002470029617000000120061598242510020100201002018074300496694907002970029685023686961002010020100207005887011100211091001000000000640279246980510010100107003070030700307003070030
100247002962000000000061598242510020100201002018074300496694907002970029685023686961002010020100207002987011100211091001000000000640279226980510010100107003070030700307003070030

Test 3: throughput

Count: 8

Code:

  xpacd x0
  xpacd x1
  xpacd x2
  xpacd x3
  xpacd x4
  xpacd x5
  xpacd x6
  xpacd x7

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | 0f | 1e | 1f | 3f | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800406990003502580200802008020040100014976955800808003569966369984802008020080200800351641180201100998010000010005120225228002580100801008003680036800368003680036
802048003570101203502580200802008020040100014973920800358003569966369984802008020080200800351641180201100998010000000005110225228002580100801008003680036800368003680036
80204800357020003502580200802008020040100014976955800358003569966369984802008020080200800351641180201100998010000000005110225228002580100801008003680036800368003680036
8020480035702000119202580200802008020040100014976955800358003569966369984802008020080200800351641180201100998010000000005110217228002580100801008003680036800368003680171
80204800357010003502580200802008020040100014976955800358003569966369984802008020080200800351641180201100998010000000015110225228002580120801008003680036800368003680036
802048003570201203502580200802008020040100014977136800798012669966870149802448020080200800351641180201100998010000000005110225228002580100801008003680036800818003680036
80204800357020003502580200802008020040100014976955800358003569966369984802008020080200800351641180201100998010000003005110225328002580100801008003680036800368003680036
802058003574300013402580200802008020040100014976955800358003569966369984802008025680200800351641180201100998010000003005110225228002580100801008007280036800368003680036
802048003569900035011080200802218020040100014976955800358003569966369984802008020080200800351641180201100998010000000005110225228002580100801008003680036800368003680036
80204800357020003502580200802008020040100014976955800358003569966369984802008020080200800351641180201100998010000000005110225228002580100801008003680073800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248005169800000024900352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000305020525428002480010800108003680036800368003680036
800248003571300000094800352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800102000005020425428002480010800108003680036800368003680036
800248003569400000096900352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000005020225248002480010800108003680036800368003680036
800248003570000000091500352580020800208002040010004976955800808003569988370006800208002080020800351641180021109800100000305020225248002480010800108003680036800368003680036
8002480035699000000531004152580020800208002040020704976955800358003569988370006800208002080020800721641180021109800100010005020425428002480010800108003680036800368003680036
800248003574900001084300632580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000005020425428002480010800108003680036800368003680036
80024800357490000001221007002580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000005020433428002480010800108003680036800368003680036
8002480035697000000126900352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000005020425248002480031800108003680036800368003680036
8002480035700000000456880352580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000005020425448002480010800108003680080800368003680036
8002480035699000000951007282580020800208002040010004976955800358003569988370006800208002080020800351641180021109800100000005020425548002480010800108003680036800368003680036