Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (two register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
200440373100015636872520002000200055168004018403740373447337702000200060004037403711100110000073316223787200040384038403840384038
20044037310006136872520002000200055168004018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037300006136872520002000200055168004018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
200440373100025136872520002000200055168004018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037300006136872520002000200055168004018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037300006136872520002000200055168014018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037300006136872520002000200055168004018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037300006136872520002000200055168004018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037310006136872520002000200055168014018403740373447337702000200060004037403711100110000073216223787200040384038403840384038
20044037300006136872520002000200055168014018403740373447337702000200060004037403711100110000073216223787200040384038403840384038

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020440037300840613968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710021622397870200001004003840038400384003840038
202044003730000613968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710021622398250200001004003840038400384003840038
202044003730000613968725201001002000010020000500571768014001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710121622397870200001004003840038400384003840038
202044003730000613968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710121622397870200001004003840038400384003840038
2020440037300007263968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000712121622397870200001004003840038400384003840038
202044003729900613968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710121622397870200001004003840038400384003840038
202044003729900613968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710121622397870200001004003840038400384003840038
202044003730000893968725201001002000010020000500571768014001804003740037371720337495201002002000020260000400374003711102011009910010010000100000710121622397870200001004003840038400384003840038
202044003729960823968725201001002000010020000500571768004001804003740037371720337495201002002000020060504400374003711102011009910010010000100000710121622397870200001004003840038400384003840038
202044003730000613968725201001002000010020000500571768004001804003740037371720337495201002002000020060000400374003711102011009910010010000100000710121622397870200001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002440037300006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037300006139687252001010200001020000505717680400180400374003737205337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037300006139687462001010200001020000505717680400180400374003737194737517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037300006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100001640216223978720000104003840038400384003840038
2002440037300006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037300006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037299006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037300006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038
2002440037299006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640316223978720000104003840038400384003840038
2002440037300006139687252001010200001020000505717680400180400374003737194337517200102020000206000040037400371110021109101010000100000640216223978720000104003840038400384003840038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40204600904501000141176010565968772301251253000012530000625858762716009106014060038546680354995301252003016220080000600386003811302011009910010030000100400000000001939225225980125400001006003960039600396009160141
4020460038449002000012459687263012512530000125300006258588904160019060448600905464803549953012520230331200817786045460503101302011009910010030000100000010000001910216225980125400001006003960039600396003960039
402046003844900000004415968726301251253000012530000625858762706001906003860038546710354995310472003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
402046003845000000006159687263012512530000125300006258587627160019060038600385467102055025301252003000020080000600386003811302011009910010030000100002010418500001910216225980125400001006003960039600396003960039
40204600384500000000615968726301251253000012530000625858762716001906003860038546710354995301252003000020080000600386009011302011009910010030000100000000300001916216225983225400001006003960039600396003960091
402046003845001005071040945968726301251253000012530000625858762716001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396009560039
40204600384500000000615968726301251253000012530000625858762716001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
40204600384500000000825968726301251253000012530000625858762716001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000300001910216225980125400001006003960039600396003960039
40204600384490000000615968726301251253000012530000625858762716001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
40204600384490000000615968726301251253000012530000625858762716001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002460038450000000000615968748300131330012133060865858762700601646003860038546903550173001320300002080000602426014111300211091010300001044004000189007173359801340000106003960191600906003960090
40024600384500000000002515968726300131330000133000065858762700600196003860038546933550173001320300002080000600386003811300211091010300001000000000189003173359801340000106003960039600396003960141
4002460398452000011000615968726300131330000133000065858890400600916003860038546933550173001320301642080000600386003811300211091010300001022000000189003173359801340000106003960039600396003960039
40024600384500000000006159687263001313300001330000658587627006009360089600385469111550173016520300002080000600386003811300211091010300001020010000189007173459801340000106003960039600396003960039
40024600384490000000001035968726300131330000133013365858762701600196003860038546933550323001320300002080000600386003811300211091010300001000000000189003173359801340000106003960039600396003960039
4002460038450000000000615968726300131330000133000065858762700600196003860038546883550173001320300002080000600386003811300211091010300001000032000189003173359801340000106003960039600396003960039
4002460038450000000600615968726300131330000133000065858762700600196003860038546933550173001320300002080000600386003811300211091010300001000000000189003173459801340000106003960039600396003960039
4002460038450000000000615968726300131330000133000065858762701600196003860038546933550173001320300002080000600386003811300211091010300001000000000189003173359801340000106003960039600396003960039
4002460038450000000000615968726300131330000133000065858762700600196003860038546933550173001320300002080000600386003811300211091010300001000000000189003177359801340000106003960039600396003960039
4002460038450000000000615968726300131330000133000065858762700600196003860038547003550173001320300002080000600386003811300211091010300001000000000189003173759801340000106003960039600396003960039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0043

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020440040299012061396822630135125300021253000062557009330400244004340040346893349973012520030000200800004004040043113020110099100100300001000000000001910116113984425400001004004440041400444004140044
402044007031003172939706263012512530001125300006255704752040021400434004034692634996301252003000620080016400404004021302011009910010030000100000002161111917016003985425400001004004140041400414004140041
40204400463000006139722263012512530000125300006255701226040021400404004334695734995301252003000620080016400404004011302011009910010030000100000001291111918016003985225400001004004740041400414004140041
402044004330000261397072630125125300001253000062557041380400514004040040347387349963012520030006200800164008540085113020110099100100300001000000039950001910116113984725400001004004140044400414004140041
4020440040300002613973226301261253000112530000625570783904005140043400433468533499730125200300002008000040070400401130201100991001003000010000000210001910116113987425400001004004140044400444004140047
40204400433000011613973226301611253003612530000625570093304002440043400403469133499730125200300002008000040043400401130201100991001003000010000010210001910116113989725400001004004140041400714007140086
4020440043315000613970726301611253000112530000625570783904006640040400403468933500330125200300002008000040040400401130201100991001003000010000000360001910116113984725400001004007140041400444004440041
40204400403000026139732263012512530002125300006255701151040021400404004034691335042301252003000020080000400434004311302011009910010030000100000001290001910116113984425400001004004740071400414004440041
40204400432990036613972226301251253000212530000625570475204006640040400403471633504230125200300002008000040043400431130201100991001003000010000010240001910116113984725400001004004440044400414004140044
4020440043300003672639732263013512530000125300006255700933040021400434004034685334997301252003000020080000400404008511302011009910010030000100000001260001910116113984725400001004004140044400414004140047

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0040

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | bc | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002440043299000016139707263004913300001330000655700933140021400404004034708335019300132030000208000040046400431130021109101030000100000018903173339843340000104004140044400864004140041
4002440040300000026139707263001413300361330000655704752040021400404010134695335019300132030000208000040040400401130021109101030000100000018905175339843340000104004440041400474004440041
4002440040300000006139707263004913300001330000655701300040021400434004034708335052300132030000208000040040400851130021109101030000100000018902175339888340000104004440041400864004740047
4002440040300000006139707263001313300011330000655704752040066400404004034708335019300132030000208000040040400401130021109101030000100000018904174439888340000104004440044400864004140044
4002440085300000006139707263001413300011330000655701548040066400434004334708335022300132030000208000040040400401130021109101030000100000018904175439843340000104004140041400414004140041
4002440040300000006139732263001313300361330000655700933040021400404004334708335019300132030243208000040043400401130021109101030000100000018903173339846340000104004140086400444008640041
400244004031100210030039732263004913300011330000655700305040027400854004034708335019300132030000208000040040400401130021109101030000100000018904173339888340000104008640044400414008640041
40024400402990000172639707263001313300361330000655705152040021400404004034753335022300132030000208000040040400401130021109101030000100010018906174439843340000104004140044400414004140041
40024400433001000366139710263001313300001330000655700305140021400404004034708335019300132030000208000040085400461130021109101030000100000018904174539843340000104004140044400414004140086
4002440040300000016139707263001313300021330000655700760040021400734004034738335019300132030000208000040046400431130021109101030000100000018902174339846340000104008640041400444004140041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b }, v3.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40204602454500000021935968749301251273000012530000625858762760019600386003854671365499530125200300002008000060038600381130201100991001003000010000001910116115980125400001006003960039600396003960039
40204600384500000061596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010000001910116115980125400001006003960039600396003960039
40204600384500000098596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010002001910116115980125400001006003960039600396003960039
4020460038450000121103596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010000331910116115980125400001006003960039600396003960039
402046003844900054061596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010000301910116115980125400001006003960039600396003960039
402046003845000000615968726301661253000012530000625858762760019600386003854671354995301252003000020080000600386003811302011009910010030000100003001910151115980125400001006003960095600396003960039
40204600384501000061596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010000301910116115980125400001006003960039600396003960039
402046003845000000103596872630125125300001253000062585876276001960038600385467135499530125200301662008000060038600381130201100991001003000010000531910116115980125400001006003960039600396003960039
40204600384500000061596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010000191910116115980125400001006003960039600396003960039
4020460038450000003057596872630125125300001253000062585876276001960038600385467135499530125200300002008000060038600381130201100991001003000010000301910116115994925400001006003960039600396003960039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002460038450000012010359687263001313300001330000658587627160019060038600385469335501730013203000020800006003860038113002110910103000010000001080018903262259877340000106014360039601456003960039
4002460086450010012010359687263001313300001330000658587627160019060038600385469335501730013203000020800006003860038113002110910103000010000001290018902172259801340000106003960039600396003960039
40024600384500000001035968726300131330000133000065858762716001906003860038546933550173001320300002080000600386003811300211091010300001000008130018903172259801340000106003960039600396003960039
4002460038450000000615968726300131330000133000065858762716001906003860038546933550173001320300002080000600386003811300211091010300001000005000018902172259801340000106003960039600396003960039
400246003844900000061596872630013133000013300006585876270600190600386003854693355033300132030000208000060038600381130021109101030000100000000018902172259801340000106003960039600396003960039
400246003844900000061596872630013133000013300006585876271600190600386003854693355017300132030000208000060038600381130021109101030000100000000018902172259801340000106003960039600396003960039
4002460038450000000615968726300131330000133000065858762716001936003860038546933550173001320300002080000600386003811300211091010300001000009290018902172259801340000106003960039600396003960039
400246003845000000061596872630013133000013300006585876271600190600386003854693355017300132030000208000060038600381130021109101030000100000000018902172259801340000106003960039600396003960039
400246003845000000061596872630013133000013300006585876270600190600386003854693355017300132030000208000060038600381130021109101030000100000000018902172259801340000106003960039600396003960039
4002460038449000000726596872630013133000013300006585876270600190600386003854693355017300132030000208000060038600381130021109101030000100000060018902172259801340000106003960039600396003960039

Test 6: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b }, v10.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b }, v10.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b }, v10.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b }, v10.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b }, v10.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b }, v10.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b }, v10.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b }, v10.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | st unit uop (a7) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402044006530000001145803700026160125125160000125160000650159996014002404004340043997303100001601252001600002004800004004340043111602011009910010016000010000300101101161140040252400001004004440044400448639140044
24020440043300000011572361026160125125275723125160000650159996014002404004340043997303100001601252001600002004800004004340276111602011009910010016000010000010101101161140040252400001004004440044400444004440044
240204400433000000044026160125125160000125160000650159996014002404004386390997303100001601252001600002004800008643387877111602011009910010016000010000000101101161140040252400001004004440044400444004440044
2402048787730000000677809026160125125160000125160000650159996014002404004386390997303100001601252001600002004800004004340043111602011009910010016000010000000101101161240144162400001004016140158401604016840102
24020440158301100002820671602121251600001251600006501186578718785604020840106997303100001601252001602262004800004004340043111602011009910010016000010001300101101161140040252400001004004440044400444004440044
240204878773000000044026275848125160000125160000650159996014002408787740043997303100001601252001600002004800004004340043111602011009910010016000010000000101101161186268252400001004004440044400444004440044
240204400433000000044026160125125160000125160000650159996014002404004340043997703100001601252001600002004800004004340043111602011009910010016000010001000101101161140040252400001004004440044863914004486391
240204400433000000065026160125125160000125160000650159996014002404004340043997303100001601252001600002004800004004340043111602011009910010016000010000001101101161140040252400001004004440044400444004460230
240204858513000000044026160125125160000125160000650159996004002404004340043997303103291601252001600002004800004004340043111602011009910010016000010000000101101161185501252400001004004440111856874004440044
240204727403000000115763440261601251251600001251600006501599960140024087147400434821403100001601252001600002004800004004340043111602011009910010016000010000000101101161140040252400001004004440044400448613940044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2400244004364200000000497558626160013131600001316000068159996011540024040043400439996031002216001320160000204813204004340043111600211091010160000100000010022812818172111117400403152240000104004440044400444004440044
2400244004364600000000490261600131316000013160000681599960115400240400434004399960310022160013201600002048000040043400431116002110910101600001000000100218322161721115174004031514240000104004440044862494004440044
24002440043300000000007140261600131316000013160000681599960115400240400434004399960310022160013201600002048000040043400431116002110910101600001000000100221141816174221515400403304240000104004440044400444004440044
2400244004330000000000550261600131316000013160000681599960115400240400434004399960310022160013201600002048000040043400431116002110910101600001000000100248518917412816400403302240000104004440044400444004440044
240024400433000000000049026160013131600001316000068159996011586216040043400439996031002216001320160000204800004004340043111600211091010160000100009010021852213174221414400403152240000104004485545400444004440044
240024400433000000000049026160013131600001316000068117076970154002404004340043999603100221600132016000020480000885604004311160021109101016000010000001002411516151721116174004031514240000104004440044400444004440044
2400244004330000000001152034902616001313160000131600006815999601154002434004340043485587656310022160013201600002048000040043862481116002110910101600001000000100241152314172111715400403152240000104004440044400444004440044
2400244004330000000001140734902616001313160000131600006715999601154002408658040043999603100221600132016000020480000400438510811160021109101016000010000001002184189172111315400403152240000104004486249400448623940044
2400244004329900000001152034826026275216131600001316000066159996011540024040043400439996031002216001320160000204800004004340043111600211091010160000100000010022841915172221516400403304240000104004440044400448624940044
2400244004329900000000490261600131327614813160000681180174211540024040043400439996035622716001320160000204800008510840043111600211091010160000100000010024841715172111714400403152240000104004440044400444004440044