Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBX (three register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
30046037450825687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
30046037460615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
30046073450845687253000300030008386806018603760375322356453000300090006037603711100110005073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
300460374506156872530003000300083868060186037603753223564530003000900060376037111001100016073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216425787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110003073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110002073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302046003745012615968725301001003000010030000500858768016001860037600375593775624030100200300082009002460037600371110201100991001001000010000011171716597950300001006003860038600386003860038
30204600374491220559687253010010030000100300005008587680060018600376003755929756241301002003068020090024600376003711102011009910010010000100200111774165979614300001006003860038600386003860038
302046003745001035968725301001003000010030000500858768016001860037600375592965624130100200306742009002460037600372110201100991001001000010000211171716597950300001006003860038600386003860038
30204600374500615968725301001003000010030456500858768006001860037600375592975624030100200300082009002460037600371110201100991001001000010023011171616597950300001006003860038600386003860038
30204600374491953885968725301001003000010030000505858768016001860037600375592975624030100200300082009002460037600371110201100991001001000010029011171616597960300001006003860038600386003860038
30204600374500615968725301001003000010030000500858768016001860037600375592965624030100200300082009002460037600371110201100991001001000010000011171616597950300001006003860038600386003860038
302046003744905365968725301001003000010030000500858768016001860037600375592965624130100200300082009002460037600371110201100991001001000010000011171716597950300001006003860038600386003860038
30204600374490615968725301001003000010030000500858768016001860037600375592965624030100200300082009002460037600371110201100991001001000010000011171716597950300001006003860038600386003860038
30204600374504867685968725301001003000010030000500858768016001860037600375592965624030100200300082009002460037600371110201100991001001000010003011171616597960300001006003860038600386003860038
30204600374650615968725301001003000010030000532858768016001860037600375592975624130100200300082009002460183600371110201100991001001000010019011171616597950300001006003860038600386003860038

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002460037450000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
3002460037449000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
30024600374500000000061596872530010103000010300005085876801600186003760037559441056304306182030333209148860085600861110021109101010000100001606404164459785030000106003860038600386018560038
300246003744900000000210596872530010103000010300005085889631600186003760037559441256285304662030162209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
3002460037450000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
30024600374500000000074759687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
3002460037450000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164559785030000106003860038600386003860038
300246003745000000012061596872530010103000010300005085876801600186003760037559443562673001020300002090000600376003711100211091010100001000013064010357559825030000106013460087600386003860184
30024600374500101000016659687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100001006405165459849030000106003860038600386003860038
30024600374500000000076859687253001010300001030000508587680060018600376003755944356267300102030000209000060037600371110021109101010000100001006404165459785030000106003860038600386003860038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 79 | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | da | dd | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502048003859900000617968626401251254000012540000625114576401580019800388003873428673740401250200400072001100208003880038113020110099100100300001000000000111191651017007980925500001008003980039800398024780039
502048003859900000617968626401251254000012540000625114576401580019800388003873428773740401250200400072001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039
502048003860000000617968626401251254000012540000625114576401580019800388003873423773740401250200400072001100208003880038113020110099100100300001000000000111191651016017980925500001008003980039800398003980039
50204800386000000061796862640125125400001254000062511457640158001980038800387342867374040125020040007200110020800388003811302011009910010030000100000122631801112267511170008050924500001008073381006810098105481002
502048104860712020251717601477947246540363133402281254304064411481772158078080993810587339811074063431710228424852201186778111180796211302011009910010030000100021105071801112265510173008072921500001008131381310810008131781212
50204812636091312300117517944744640421133402281264471262611485725158067180997815137339222674167437730226429772261200258140381209281302011009910010030000100220125455801111937510145008058425500001008003980039800398003980039
5020480089621000001947968626401391254000012540000625114576401580019800388003873427673741401258200401742001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039
502048003860000000617968626401251254000012540000625114576401580019800388003873428673740401250200400072001100208003880038113020110099100100300001000000000111191751016007980925500001008003980039800398003980091
5020480038599000002517968626401251254000012540000625114576401580019800388003873428773740401250200400072001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039
5020480038600000007267968626401251254000012540000625114576401580019800388003873428673741401250200401742001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002480038600000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018905174479796350000108003980039800398003980039
500248003859900000654061796862640013134000013400006511457640180019080038800387344303737674001320400002011000080038800381130021109101030000100000018909173479796350000108003980039800398003980039
5002480291600000000061796862640013134000013400006511457640180019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173579796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173479796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173479796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173579796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018905174479796350000108003980039800398003980039
50024800385990000000726796862640025134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173479796350000108003980039800398003980039
50024800385990000000726796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904174479796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511458037080019080038800387344303737674001320400002011000080038800381130021109101030000100000018905173479796350000108003980039800398003980039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020460043449000961596962640125125400001254000062585771186002106004060040534397537424012520040007200110020600406004311302011009910010030000100000000111191601605989525500001006004160044600446004160041
5020460040450000061596962640125125400001254000062585766876002106009560040534396537434012520040007200110020600406004011302011009910010030000100000000111191701605983825500001006004160041600416004160044
50204600404490120961596962640125125400001254000062585766876002106004060040534397537434012520040007200110020600406004311302011009910010030000100000000111191701605983425500001006004160041600416004160041
5020460040450000061597172640134125400091254000062585767256002106004060040534396537434012520040007200110020600406004311302011009910010030000100000000111191701605983425500001006004160041600416004160044
5020460040450000061596962640125125400011254000062585766876002106004360040534397537434012520040007200110020600406004311302011009910010030000100220000111191701605983425500001006004160041600446004160041
5020460040450000961596962640125125400001254000062585767256002106004060040534397537424012520040007200110020600406004011302021009910010030000100000200111191701605983525500001006004160041600416004160041
5020460040449000061596962640126125400001254000062585766876002106004060043534396537464012520040007200110020600406004011302011009910010030000100000000111191601605983825500001006004160041600416004160041
5020460040450000061596962640125125400001254000062585766876002106004060040534397537424012520040007200110020600406004011302011009910010030000100000000111191701605983425500001006004160041600446004160041
5020460040449000161596962640125125400001254000062585766876002106004060040534426537424012520040007200110020600436004011302011009910010030000100000000111191701605983525500001006004460041600446004460044
5020460040449000061596962640125125400001254000062585766876002106004060040534396537434012520040007200110020600406004011302011009910010030000100200000111191702405983425500001006004460041600416004160041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500246004045000000016159696264001313400011340000658576687160021060040600435345435376940013204000020110000600406004011300211091010300001000000000018907175659826350000106004160041600416004460041
500246004045000009006159696264001313400001340000658576687160021060040600405345435376940013204000020110000600406004011300211091010300001000020000018906174659826350000106004160041600446004160041
500246004344900000006159696264001313400001340000658576687160024060040600405345735377240013204000020110000600406004011300211091010300001000000000018906174559826350000106004460041600416004160041
500246004045000000006159696264001313400001340000658576687160024060043600435345735377240013204000020110000600406004011300211091010300001000000000018905174559826350000106004160041600416004160041
500246004045000000006159696264001313400011340000658576687160021060040600405345735376940013204000020110000600406004011300211091010300001000000000018905175559826350000106004160041600446004160041
500246004045000000006159696264001313400011340000658576687160021060040600405345435376940013204000020110000600406004011300211091010300001000000000018905174559826350000106004160044600416004160041
50024600404490000510125559696264001313400011340000658576687160021060043600435345435376940013204000020110000600406004011300211091010300001000000001018906175659826350000106004160044600416004160041
500246004044900000006159696264001313400001340000658576687160021060040600405345435376940013204000020110000600406004311300211091010300001000000300018905174559826350000106004160041600446004160041
5002460040450000000072659696264001413400091340000658576687160021060040600435345435376940013204000020110000600436004311300211091010300001000000000018906175659826350000106004160041600446004160041
500246004044900000006159696264001313400001340000658576687160021060040600405345735377240013204000020110000600406004011300211091010300001000000000018906175559826350000106004160041600416004160041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50204400423000338839685264012712540003125400006255692152140023400424004233453733743401252004000720011002040042400421130201100991001003000010001111917163988125500001004004340043400434004340043
5020440042300036139687264012812540001125400006255692281040023401094004233453733743401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423002406139684264012512540001125400006255692272140023400424004233453633744401252004000720011003640042401001130201100991001003000010001111916163988125500001004004340043400434004340043
5020440042299016139685264012712540000125400006255693538140023400424004233453733744401252004000720011002040042400421130201100991001003000010001111916163988125500001004004340043400434004340043
5020440042300026139687264012812540002125400006255692208140023400424004233453733744401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000427139685264012612540003125400006255693510140023400424004233453733743401252004000720011002040042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000210339685264012812540003125400006255692195140023400424004233453733743401252004001320011002040042400421130201100991001003000010001111916163988125500001004004340043400434004340043
5020440042300918439685264012912540001125400006255693444140023400424004233453733743401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000320839687264012912540001125400006255692066140023400424004233453733744401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000352339685264012612540004125400006255692294140023400424004233453733743401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 19 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50024400422991021014083968526400161340002134000065569215204002340042400423346933377140013204000020110000400424004211300211091010300001021030189281779398713050000104004340043400434004340043
500244004230010631053968426400161340001134000065569229604002340042400423346933377140013204000020110000400424004211300211091010300001001060189211171212398713050000104004340043400434004340043
500244004230010153107396872640013134000413400006556934360400234004240042334693337714001320400002011000040042400421130021109101030000100106018921117713398713050000104004340043400434004340043
500244004230010150157396852640015134000213400006556922900400234004240042334693337714001320400002011000040042400421130021109101030000100003018909171111398713050000104004340043400434004340043
500244004230010212161396852640015134000213400006556934361400234004240042334693338074001320400002011000040042400421130021109101030000100200018921017811398713050000104004340043400434004340043
500244004230010151107396852640014134000213400006556935100400234004240042334693337714001320400002011000040042400421130021109101030000100103018921017811398713050000104004340043400434004340043
50024400423001062743396852640015134000213400006556921930400234004240042334693337714001320400002011000040042400421130021109101030000100100018921117129398713050000104004340043400434004340043
50024400422991021210539687264001413400021340000655692728040023400424004233469333771400132040000201100004004240042113002110910103000010020301892817117398713050000104004340306400434004340043
50024400423001011746213968726400161340003134000065569342904002340042400423346933377140013204000020110000400424004211300211091010300001001400018921117118398713050000104004340043400434004340043
500244004230010153843968426400151340002134000065569343604002340042400423346933377140013204000020110000400424004211300211091010300001000060189211171111398713050000104004340043400434004340043

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50204800385990001877968626401251254000012540000625114576400800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
5020480038599090827968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800385990002537968626401251254000012540000625114576400800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010002011119161607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576401800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576401800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800938003980091
5020480038599000617968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
5020480038600000617968626401251254000012540000625114576400800198003880038734280773741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800386000003897968626401251254000012540000625114576400800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500248003859900462794853654023914402281142888871148055501805658089680902734041387407742907204311720118645810568105120130021109101030000104221003870841890003170347987230050000108003980039800398003980649
5002480038643005241795532264011914401681444408661149675100808558144781669733713739294320324447872212128481414812022713002110910103000010000000301890003170337979630050000108003980039800398003980039
500248003859900617968626400131340036134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
500248003860000617968626400131340000134000065114576401180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
5002480038600045827968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170437979630050000108003980039800398003980039
500248003860000617968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
5002480038600001907968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890004170347979630050000108003980039800398003980039
5002480038599007267968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
50024800386000072679686264001313400001340000651145764001800198003880038734433273767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
50024800385990331937968626400131340000134000065114576400080019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039

Test 7: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7506

retire uop (01)cycle (02)03191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acc2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
320204814846130000238026240125125240000125240000636329957160028060047600471997303200042402572002400002007200006004781588111602011009910010016000010022000900101102161160044253200001006004860048600486004860048
320204600474494552001320262401251252400001252400006253299571600283816436004719973032000424012520024000020072000060047600471116020110099100100160000100000011200101101161160044253200001008123960048600486004860048
32020460047450000048026240125125240000125240000625329957160028060047600474109903411952401252002400002007200006004760047111602011009910010016000010000001600101101161160044253200001006004860048600486004881190
320204600474500120061026240125125289154125240000625329957160028060047600471997303200042401252002400002007200006004760047111602011009910010016000010000001000101102162160044253200001006004860048600488123960048
320204600474500000713026240125125240000125240000625329957160028060047600471997303200042401252002400002007200008123860047111602011009910010016000010000000000101101161160044253200001006004860048812396004881190
320204600474500300734026240125125240000125240000625329957160028060047600472001103200042401252002400002007200006004760047111602011009910010016000010000000000101102161160044253200001006004860048600486028460048
320204600474500000726026240125125240000125240000625329957160028060108600471997303200042401252002400002007200006004760047111602011009910010016000010000000000101101171160044253200001006004860048600486004881239
32020460047450000048026240125125240000125240000625329957160028081825600471997303200042401252002400002007204026016460047111602011009910010016000010000000000101101161160044253200001006004860048600486004860048
320204600474500147880480262401251252891541252400006251142811460028060047600471997303200042401252002400002007200006004760047111602011009910010016000010000002000101102161160044253200001006004860048600488148460048
320204600474490000485835426240125125240000125240000625329957160028060047600471997303200042401252002400002007200006004760047111602011009910010016000010000000000101101161160044253200001006004860048600486004860048

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7506

retire uop (01)cycle (02)03l1i tlb fill (04)09l2 tlb miss data (0b)18191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)a9acbranch cond mispred nonspec (c5)cfd0d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
320024600484501000000053026240013132400001324000065329957111060028060047600471999603200262400132024000020720000600476004711160021109101016000010010301002331113917111252760044316140320000108145081700812946004860048
3200246004745011000120053054240229132400001324000067329957111560028060047600471999603200262400132024000020720000600476004711160021109101016000010000001002384112517111202560044316138320000108170681338600486004860048
32002460047450111000002680262400131324000013240000653299571115600280600476004720004014201092410532024000020720000600476035141160021109101016000010000001002384112517111222560044316135320000106004860048600486034781504
3200246004744900000000535829526240013132400001424000065330209311560028060047810961999603200262400132024000020720000600476004711160021109101016000010000001002385112617111162560044316127320000106004860048600486004860048
3200246004745011000000112026240013132400001324000065329957111560028360047600471999693200262400132024000020720000600476004721160021109101016000010000201110022841034171112626600443167320000106004860048600486004860048
32002460047449000000002240262400131324000013240000653299571100600280600476004719996032002624001320240000207200006004760047111600211091010160000100000110023311126171122818600443167320000106004860048600486004860048
32002460047450110000009802624001313240000132400006532995711006002808116160047199960352002624001320240000207200006004760047111600211091010160000100000010023311125171112126600443167320000106004860048600486004860048
32002460047450110000002030262400131324000013240000653299571100600280600476004719996032002624001320240000207200006004760047111600211091010160000100000010023311126171112526600443167320000106004860048600486004860048
320024600474501100000079502629014813240000132400006532995710006002806004760047199960320026240013202400002072000060047600471116002110910101600001000000100233121161711121256004431614320000106004860048600486004860048
320024600474491100021009802624001313240000132400006532995711006002806004760047199960320026240013202400002072000060047600471116002110910101600001006200100233111261711117276004431613320000106004860048600486004860048