Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (two register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
20044086310018082368725200020002000551680040184087408534473377020002000600040374037111001100001073116113787200040384038403840384038
20044037310000103368744201220002000551680040184037403734473377020002000600040374037111001100000073116113853200040384038403840384038
2004403730000061368725200020002000551680040184037403734473377020002000600040374037111001100001384373116113787200040384038403840384038
2004403730000061368725200020002000551680040184084408634473377020002000600040374037111001100000073116113787200040384038403840384038
2004403730000061368725200020002000551680040184037403734473377021522000600040374037111001100000073116113787200040384038403840384038
200440373000249061368725200020002000551680040184037403734473377021522000600040374037111001100000073116113787200040384038403840384038
2004403731000061368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038
2004403730000061368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038
20044037300033084368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038
2004403730001920251368725200020002000551680040184037403734473377020002000600040374037111001100000073116113787200040384038403840384038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 18 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20204400372992200165396872520100100200001002000050057176804001840037400373717963749120100200200082006002440037400371110201100991001001000010002000111718031643398010200001004003840038400384003840038
20204400373002200165396872520100100200001002000050057176804001840037400373717963749120100200200082006002440037400371110201100991001001000010000000111718051643398570200001004003840232400384003840038
202044003730021001653968725201001002004810020000500571768040018402774003737172337495201002002000020060000400374003711102011009910010010000100000900007120516510397870200001004003840184400384008640038
20204400373001100165396872520100100200001002000050057176804001840037400373717233749520406202200002006000040037402321110201100991001001000010003000000712061666397870200001004003840038400384003840038
2020440037300110121107396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010000000000712071666397870200001004003840038400384003840038
20204400373001102191730396872520100104200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010000000000712061655397870200001004003840038400384003840038
20204400373001100165396872520100100200001002000050057176804005440037400373717233749520100200200002006000040037400371110201100991001001000010020000000712075366397870200001004003840038400384003840038
20204400373001100165396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010000081660000712061664397872200001004008640038400384003840038
20204400373001100165396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010001090000712051676397870200001004003840038400384003840038
202044003730011078165396872520100100200001002000050057176804001840037400373717233749520100200200002006000040037400371110201100991001001000010001000000712041666397870200001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200244003730000000006139687252001010200001020000505717680040018400374003737194337517200102020000206000040037400371110021109101010000100000000064007163339787020000104003840038400384003840038
200244003730000000006139687252001010200001020000505717680040018400374003737194337517200102020000206000040037400371110021109101010000100000000064003163339787020000104003840038400384003840038
200244003729900000006139687252001010200001020000505717680040018400374003737194337517200102020660206000040037400371110021109101010000100000000064003163339787020000104003840038400384003840038
200244003730000000005466395662412007010201321021672505731480140414405584061237211493773221682202098222654424065640572121100211091010100001040201022560078107704340195020000104062240623403274023240086
2002440085299000000059493963224320142122014412216727257317931404504060640617372225637745216822221968226557440617405771311002110910101000010222012244832857081337740009220000104033040622403294042740330
2002440572302010110006139687252001010200001020000505717680140414404644023237214737686213802220980206490840230404743110021109101010000100000100078206974339972020000104028140474404264033340475
200244026730400110754088061396872520010102000010200005057176800404504046540621372154437572219902221640206688840525403161211002110910101000010000000239800847061325940027020000104003840038400384003840038
20024400373110009617167926139687252014410201921021520835731793040414404654052537224337517200102020000206000040037400371110021109101010000102402000064003163439787020000104003840038400384003840038
200244003731100000306139687252001010200001020000505717680040065400374003737194337517200102020000206000040037400371110021109101010000100000003064003163339787020000104003840038400384003840038
200244003730000000006139687252001010200001020000505717680040018400374003737194337517200102020000206000040037400371110021109101010000100000000064003163339787020000104003840038400384003840038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40204600384491010005465281455968726301251253000012530000625858762706001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
402046003844900000000615968726301251253000012530000625858762706001906013960246546710354995301252003000020080000600386003811302011009910010030000100000000300001910216225980125400001006003960039600396003960039
40204600384500000004530615968726301251253000012530000625858762706001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
402046003845000000000615968726301251253000012530000625858762706001906003860038546710354995301252003000020080000600386003811302011009910010030000100000000000001910216125980125400001006003960039600396003960039
4020460038450000000006159687263012512530000125300006258587627060019060038600385467103549953012520030000200817706003860038113020110099100100300001000000502000001938216225980125400001006003960039600396003960039
4020460038451010000006159667263012512530000125300006258587627060019360038600385467101255040301252003000020080000600386003811302011009910010030000100000000000001910216225980125400001006003960039600396003960039
4020460038450000000006159687165301641253000012530000625858762706005606003860038546710354995301252003000020080000600386003811302011009910010030000100000000020001910216225980125400001006003960039600396003960039
402046003844900000030615968753301251253000012530000625859504906001906003860038546530354995301252023000020080000600386003811302021009910010030000100000000000001910216625980125400001006009560039600396003960039
4020460038450000000006159687161301251253000012530000625858762706001936003860038546710354995301252003000020080000600386003871302021009910010030000100000070000001910216225980125400001006003960039603506003960039
4020460038450000060006159687263012512530000130300006258587627060019060038600385467103549953027820630000200800006003860038113020110099100100300001000032141234300001910216235980125400001006003960039600396003960039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002460038450000000061596872630013133000013300006585876270600190600386003854693035501730013203000020800006003860038113002110910103000010000000300189002172259801340000106003960039600396003960039
40024600384490000681007265968726300131330000133000065858762716001906003860038546930355017300132030000208000060038600381130021109101030000100000701500189003172259801340000106003960039600396003960039
4002460299449000000010359687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000000023100189002172259801340000106003960039600396003960039
40024600384500000000251596872630013133000013300006585876271600190600386003854693035501730013203000020800006003860038113002110910103000010000000600189002172259801340000106003960039600396003960039
40024600384490000000615967826300131330000133000065858762716001906003860092546910355017300132030000208000060038603501130021109101030000100000001800189002172359801340000106003960039600396003960039
4002460038450000000061596872630013133000013300006585876270600190600386003854693035511230013203000020800006003860038113002110910103000010000000300189002172259801340000106014260092600396003960039
400246003844900000001475968726300131330000133000065858762716001906003860038546930355017300132030000208000060038600381130021109101030000100000009300189002172259801340000106003960039600396003960039
400246003844900000026159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000001021300189002172559801340000106003960039600936003960039
400246003845001000006159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000000021900189003172259801340000106003960039600396003960039
400246003845000000006159687263001313300001330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000000022500189003172259801340000106003960039600396003960039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0040

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020440043300011200128439659833021312530083125300006425700933040021400404007034731033504230125200300002008000040040400401130201100991001003000010000001910216223984425400001004004440041400414010140044
402044007029900000008439707263012612530002125300006255701226140024400434004034685033499730125200300002008000040043400401130201100991001003000010000001910216223984429400001004004140041400414004140041
4020440040300000000225439707263012712530002125300006255700760040024400434004034716033499730125200300002008000040040400431130201100991001003000010000001910216223984425400001004004140041400414004140086
402044004029900000016139699263013512530000125300006255700933140021400404004034685033500030125200300002008000040043400431130201100991001003000010003001910216223984425400001004004140041400414004140041
40204400403000000001101239699263012712530002125300006255700933040021400404004034685033499730125200300002008000040043400401130201100991001003000010000001910216223984425400001004004440041400414004140041
4020440085300000000116539707263012612530000125300006255700760040066400404004034685033500030125200300002008126440040400401130201100991001003000010000001910216223984425400001004004140071400864007140041
402044004030000000006139699263012712530000125300006255700305040021400434004034685733500030125200300002008000040085400401130201100991001003000010000001910216223987425400001004008640044400444004140074
402044004030000000016139699263012512530000125300006255700933040021400404004034685033499730125200300002008000040040400401130201100991001003000010000001910216223984425400001004004140044400414004740041
402044004030000002220025139707263013512530010125300006255700933140021400434007034688033499730125200300002008000040070400701130201100991001003000010000001910216223984425400001004004140041400414004140041
402044004030000000008939699263012612530000125300006255700933040021400404004334685033499730125200300002008000040040400431130201100991001003000010000001910216223988925400001004004140041400444004140044

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0040

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400244004030000613970754300491330001133000065570030514002104004340040347083350193001320300002080000400854004011300211091010300001000018904173339846340000104004140041400414004140041
40024400403001320613970726300131330000133000065570093314002104004040040347083350193001320300002080000400434004011300211091010300001000018903173339846340000104004140041400414004140044
400244004330001613970726300131330000133000065570093314002104004040043347083350253001320300002080000400404004011300211091010300001010018903173339843340000104004440044400414004440047
4002440040300001283971026300131330001133000065570130014002104004040040347083350193001320300002080000400404004311300211091010300001000018903174339843340000104004440041400414004140041
400244004329900613970726300491330000133000065570093314002104004040040347083350193001320300002080000400404004011300211091010300001000018903173339843340000104004140044400414004140041
4002440046300010613971026300131330000133000065570122604002104004340043347083350493001320300002080000400404004011300211091010300001000018903173339846340000104004740041400444004440044
4002440046300002143970726300131330000133000065570475214002104004040040347113350223001320300002080000400404004011300211091010300001000018903174339843340000104004140041400714004440044
4002440040299001053970726300131330000133000065570093304002104004640040347113350193001320300002080000400404008511300211091010300001000018903173439843340000104004140041400414004140041
4002440040300001743970726300131330010133000065570122604002104004040040347113350253001320300002080000400404004311300211091010300001000018904173339843340000104004440044400414004140044
400244004330002613970726300131330001133000065570093304002104004040040347083350193001320300002080000400404004011300211091010300001000018903173339843340000104004140041400414004140044

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b }, v3.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40204600384491102645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001912516555980125400001006003960039600396003960039
402046003845011012545968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001912516565980125400001006003960039600396003960039
4020460038449110115259687263012512530000125300006258587627600196003860038546710354995301252003000020080000600386003811302011009910010030000100000002101912516355980125400001006003960039600396009060039
40204600384501101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000901912616555994925400001006003960039600396003960039
40204600914511101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001913416445980125400001006003960039600396003960039
402046003845011121645968726301251253000012530000625858762760019600386003854671035499530125200300002048000060038600911130201100991001003000010000000001913650535980125400001006003960039600396003960039
402046003845011121645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010024004405601912416785980125400001006003960039600396003960039
40204600384501101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001912516355980125400001006003960039600396003960039
4020460038449110110035968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000010001912516555980125400001006003960039600396003960039
40204600384511101645968726301251253000012530000625858762760019600386003854671035499530125200300002008000060038600381130201100991001003000010000000001913616535980125400001006003960039600396003960039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0038

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002460038450006159687263001313300001330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000018907177759801340000106003960039600396003960039
4002460038450006159687263001313300001330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000018904174759801340000106003960039600396003960039
40024600384500011859687263001313300121330000658587627160019060038600385469303550173001320300002080000600386003811300211091010300001000018904174759801340000106003960039600396003960039
4002460038450006159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000018904174759801340000106003960039600396003960039
4002460038450006159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001006018907177759801340000106003960039600396003960039
40024600384500025159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001000018907177559801340000106003960039600396003960039
4002460038449006159687263001313300001330000658587627160019060038600385469303550173001320300002080880600896003811300211091010300001000018904177459801340000106003960039600396003960039
4002460038449006159687263001313300001330000658587627160019060038600385469003550173001320301682080000601406003811300211091010300001000018907177459801340000106019660039600396003960039
40024601394500132126659687263001313300001330000818588904060093060038600385468703550173001320300002080000600906003811300211091010300001027527018907177459801340000106008960039600396003960039
4002460038450018022159687263001313300001330000658587627060019060038600385469303550173001320300002080000600386003811300211091010300001001018907177459801340000106003960039600396003960039

Test 6: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b }, v10.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b }, v10.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b }, v10.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b }, v10.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b }, v10.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b }, v10.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b }, v10.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b }, v10.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402044006564400032602616012512516000012516000065015999600040070400434004399730355557160125200160000200480000400434004311160201100991001001600001000000001011012970114004062400001004041340165868844004440044
240204400433000006702616012512516000012516000065015999600040024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000015302616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000010902616012512516000012516000065015999600140024400434004399738807310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044401114004487213
240204400436510008602616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
240204400433000009002616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
240204400432990006702616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000081702616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
24020440043300000137902616012512516000012516000065015999600185650400434004399730310000160125200160000200480000400434004311160201100991001001600001000000001011011601140040252400001004004440044400444004440044
2402044004330000010702616012512516000012516000065015999600140024400434004399730310000160125200160000200480000400438567311160201100991001001600001000000001011011601140040252400001004004440044400444004440044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2400244004330000000001750261600131316000013160000681599960114002440043400439996031002216001320160000204800004004340043111600211091010160000100000100223111917211515400403152240000104004440044400444004440044
2400244004330000000002860261600131327757913160000681599960114002440043400439996031002216001320160000204800008636740043111600211091010160000100000100223111417211157400403152240000104004440044400444004440044
24002440043300000001161670514026160013131600001316000068159996011400244004340043493630310022160013201600002048000040043400431116002110910101600001020031002432241722115104004031515240000104004440163400444004440044
2400244011130011301920013302616001313160000131600006815999601140024400434004399968985310022160013201600002048000084678400431116002110910101600001000001002231151722169400403304240000104004440044400444004440044
24002440043300000000015402616001313160000131600006812043078014002488092400434852603100221600132016000020480000400438707111160021109101016000010000010024622917421512400403152240000108722840044400444004440044
240024400436590000000346026160013131600001316000068159996001400244004340043999603100221600132016000020480000860934004311160021109101016000010000310022321817411149400403152240000104004440044400444004440044
2400244004330000000004902616001313160000131600006815999600140024400434004399960371004316001320160000204800004004340043111600211091010160000100000100243111017221109400403152240000104004440044400444004440044
24002486367647000120001370471601151116000013160000681600968114002440043401094784289723100221600132016000020480000400434004311160021109101016000010002310022311917211116400403152240000104004440044400444004440044
240024400433000000000490261600131316000013160000661599960114002486093400439996035634616001320160000204800004004340043111600211091010160000100003100223111417211109400403152240000104004440044400444004440044
2400244004329900012000490261600131316000013160000681168551411400248809240043999603100221600132016000020480000400434004311160021109101016000010003010022311517211810400403152240000104004440044400444004440044