Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

PACIB

Test 1: uops

Code:

  pacib x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10047029660399895824251000100010001783300493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
1004702966001035824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
1004702966001035824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
100470296600615824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
100470296600615824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
100470296700615824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
100470296700615824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
1004702967001035824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
100470297000895824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030
100470296700615824251000100010001783301493949702970296623368181000100020007029870111001100000733853367891000100070307030703070307030

Test 2: Latency 1->1

Code:

  pacib x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102047002956800000120061598242510200102001020018083301496694970029700296848036867410200102002020070029912111020110099101000000000710279116979610100101007003070030700307003070030
10204700296500000000061598242510200102001020018083301496694970029700296848036867410200102002020070029912111020210099101000000000710179116979610100101007003070030700307003070030
102047002962300000000726598242510200102001020018083300496706770029700296848036867410200102002020070029912111020110099101000000000710179116979610100101007003070030700307003070030
102047002962000000000441598242510200102001020018083300496694970029700296848036867410200102002020070029912111020110099101000000000710179116979610100101007003070030700307003070030
10204700296220000000061598242510200102001020018083300496694970029700296848036867410200102002020070029912111020110099101000000000710179116979610100101007003070030700307003070030
10204700296220000000061598242510200102041020018083300496694970029700296848036867410200102002020070029912111020110099101000000000710179116979610100101007003070030700307006070030
102047002962200000000895982425102001020010258180833004966949700297002968480368674102901020020200700299121110201100991010000210307101280117042210173101007101270855710117096670108
1020470029618010120211200173598248710227102631020018100810496694970029700296848036867410200102432020070068912111020110099101000400000710179116979610100101007003070030700307003070030
10204700296560000000061598242510200102001020018083300496694970068700296848036867410200102002020070029912111020110099101002021242248210471295127049610179101007023170230700307003070030
102047006962200030240061598242510200102001020018083300496694970029701086848036872510200102002020070029912211020110099101000000000710179116979610100101007003070030700307003070030

1000 unrolls and 10 iterations

Result (median cycles for code): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024700296190000001440051059824251002010020100481807527149669497002970029685023686961002010020200207006887011100211091001000000000640379226980510010100107003070108700307003070030
100247002965600000000144959824251002310020100201807430149669887002970029685023686961002010020200207002987011100211091001000200195500640279226980510010100107003070030700307003070030
100247006862000000113200113859824251002010020100201807430149669497002970029685143686961002010020200207002987011100211091001000000000640279226980510010100107003070030700307003070068
100247002962000000198806159815251002010020100201807430049669497002970069685023686961002010020200207002987011100211091001000002000640379226980510013100107007070030700307005570030
100247002966300001000032159824251002010020100201807430149669887002970029685023686961002010185200207006987011100211091001000000020646287226980510010100107003070055700307003070030
10024700696550000000880503597881111003210032101151807818049670287018870189685023687491002010020201047018687041100211091001000210000640279236980510010100107003070030700307003070030
10024700296230000000006159824251002010020100201807430049669497002970029685023686961004810020200207002987011100211091001000010000640279226980510013100107003070030700307006970030
100247002962001001100072659824251002010023100481807527049669497002970029685023686961002010060200207002987021100211091001002004000640279226980510010100107003070030700307006970069
1002470029616010000120015959824251002310020100201807430049669877002970029685023686961002010020200207002987011100211091001000000000640279226980510010100107007070030700307003070030
1002470029620000000988014559824461002310020100201807430049669497002970029685023687231002010020200207006987011100211091001000000000640279226980510010100107006970030700307010970030

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  pacib x0, x1
  mov x0, 1

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204800296970000021302787697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000301910372117979420100301008003080030800308003080030
302048002969900000537061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000321910172117979420100301008003080030800308003080030
302048002969900000270061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000010001910172117979420100301008003080030800308003080030
3020480029699000013090103697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008003080030800308003080030
302048007070000000468061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000200001910172117979420100301008003080030800308003080030
3020480029700000003061697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000200001910172127979420100301008003080030800308003080030
3020480029699000005370944697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000301910196117979420100301008003080030800308003080030
3020480029700000221737592112145106965242720333203282178549525210497802480777810357608488766542160820200402008002914411302011009930100202238033022681288128053820250301008120381286809758129281218
3020481098708000211204088061697992520200202002020049426010497694980029800297596137618120200202004020080029144113020110099301000000001910172117979420100301008003080030800308003080030
302048002969900000630726697992520200202002020049426011497694980029800297596137618120200202004020080029144113020110099301000000001940172117979420100301008003080030800308003080030

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 7.0029

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024800297000000000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000018904723779803200100300108005580030800308003080030
30024800297450000000008969799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000018903723379803200100300108003080030800308003080030
3002480029702000000013206169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000100018904723779803200100300108003080030800308003080030
3002480029703000000120061697992520020200202002049520481497694980029800297598337620320020200204002080029144113002110930010000012018903723379803200100300108003080030800308003080030
30024800297000000000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000018903723379803200100300108003080030800308003080030
300248002970300000000038869799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000018906723379803200100300108003080030800308003080030
30024800297050000000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000003018907727379803200100300108003080030800308003080030
30024800297030000000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000018904723779803200100300108003080030800308003080030
300248002970200000000061697992520020200202002049520481497694980029800297598311762092002020020400208002914431300211093001000100018904727479803200100300108003080030800308003080030
30024800296980000000006169799252002020020200204952048149769498002980029759833762032002020020400208002914411300211093001000000018907723379803200100300108003080030800308003080030

Test 4: throughput

Count: 8

Code:

  pacib x0, x8
  pacib x1, x8
  pacib x2, x8
  pacib x3, x8
  pacib x4, x8
  pacib x5, x8
  pacib x6, x8
  pacib x7, x8

(requires arm64e binary, with arm64e_preview_abi boot arg)

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480062643000000007725802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000003000051102252280025801000801008003680036800368003680036
80204800356450000001207725802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000000000051102252280025801000801008003680036800368003680036
8020480035702000000003525802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000000000051102252280025801000801008003680036800368003680036
80204800357020000001203525802008020080200401000049769558003580035699663699848022980200160200800351641180201100998010000000000051102252280058801000801008008280129800368003680036
8020480035701000000003525802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000003000051102252280025801000801008003680036800368003680036
8020480035702000000003525802008020080200401000149771368021980035699663699848020080200160312800351641180201100998010000000000051102252280025801000801008003680036800368003680036
8020480035699000000003525802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000003000051102252280025801000801008003680036800368003680036
80204800357020000000010525802008020080200401000049769558003580035699663699848022980200160200800351641180201100998010000000000051102252280025801000801008003680036800368003680036
8020480035702000000009125802008020080200401000049769558003580035699663699848020080200160200800351641180201100998010000010000051102252280025801000801008008080036800368003680036
8020480035702000000007725802008020080200401000149769558003580035699663699848020080200160200800351641180201100998010000000000051102252280025801000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248004069700000021007002580020800208002040010014976955800358003569988370006800208002016002080035164218002110980010000100050200725001328002480010800108003680036800368003680036
8002480081700000000000822258002080020800204001000497695580035800356998837000680020800201600888003516411800211098001000000005020322500428002480010800108003680036800368003680036
8002480035750000000001834258002080020800204001001497695580035800356998877000680020800201600208003516411800211098001000010005020042500438002480010800108003680036800368003680036
80024800356970000001200510258002080020800204001001497695580035800806998837000680020800201600208003516411800211098001000000005020042500448002480010800108003680072800478003680036
8002480035696010000000318258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000005020022500558002480010800108003680036800368003680036
800248003570000000000077258002080020800204001001497700080035800356998837000680020800201600208003516411800211098001000000005020042500428002480010800108003680036800368003680036
80024800356990000000880106258002080020800204001001497695580035800356998837000680020800201600208003516421800211098001000002605020022500458002480010800108003680036800368003680036
800248003570000000000063258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000000005020052500458002480010800108003680036800368003680036
800248003570000000000035258002080020800204001001497695580035800356998837000680020800201600208003516411800211098001000002305020042500428002480010800108003680080800368003680036
80024800357020000000003525800208002080020400100149769558003580080699883700068002080020160020800351641180021109800102211095380532008225007138067080493800108107581211809438130281126