Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBX (three register table, 16B)

Test 1: uops

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire (01) | cycle (02) | 03 | 08 | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
30046037450825687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
30046037460615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
30046073450845687253000300030008386806018603760375322356453000300090006037603711100110005073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
300460374506156872530003000300083868060186037603753223564530003000900060376037111001100016073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216425787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110003073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110002073216225787300060386038603860386038
30046037450615687253000300030008386806018603760375322356453000300090006037603711100110000073216225787300060386038603860386038

Test 2: Latency 1->1

Code:

  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302046003745012615968725301001003000010030000500858768016001860037600375593775624030100200300082009002460037600371110201100991001001000010000011171716597950300001006003860038600386003860038
30204600374491220559687253010010030000100300005008587680060018600376003755929756241301002003068020090024600376003711102011009910010010000100200111774165979614300001006003860038600386003860038
302046003745001035968725301001003000010030000500858768016001860037600375592965624130100200306742009002460037600372110201100991001001000010000211171716597950300001006003860038600386003860038
30204600374500615968725301001003000010030456500858768006001860037600375592975624030100200300082009002460037600371110201100991001001000010023011171616597950300001006003860038600386003860038
30204600374491953885968725301001003000010030000505858768016001860037600375592975624030100200300082009002460037600371110201100991001001000010029011171616597960300001006003860038600386003860038
30204600374500615968725301001003000010030000500858768016001860037600375592965624030100200300082009002460037600371110201100991001001000010000011171616597950300001006003860038600386003860038
302046003744905365968725301001003000010030000500858768016001860037600375592965624130100200300082009002460037600371110201100991001001000010000011171716597950300001006003860038600386003860038
30204600374490615968725301001003000010030000500858768016001860037600375592965624030100200300082009002460037600371110201100991001001000010000011171716597950300001006003860038600386003860038
30204600374504867685968725301001003000010030000500858768016001860037600375592965624030100200300082009002460037600371110201100991001001000010003011171616597960300001006003860038600386003860038
30204600374650615968725301001003000010030000532858768016001860037600375592975624130100200300082009002460183600371110201100991001001000010019011171616597950300001006003860038600386003860038

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0037

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002460037450000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
3002460037449000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
30024600374500000000061596872530010103000010300005085876801600186003760037559441056304306182030333209148860085600861110021109101010000100001606404164459785030000106003860038600386018560038
300246003744900000000210596872530010103000010300005085889631600186003760037559441256285304662030162209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
3002460037450000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
30024600374500000000074759687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164459785030000106003860038600386003860038
3002460037450000000006159687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100000006404164559785030000106003860038600386003860038
300246003745000000012061596872530010103000010300005085876801600186003760037559443562673001020300002090000600376003711100211091010100001000013064010357559825030000106013460087600386003860184
30024600374500101000016659687253001010300001030000508587680160018600376003755944356267300102030000209000060037600371110021109101010000100001006405165459849030000106003860038600386003860038
30024600374500000000076859687253001010300001030000508587680060018600376003755944356267300102030000209000060037600371110021109101010000100001006404165459785030000106003860038600386003860038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | 79 | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | d5 | d6 | da | dd | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502048003859900000617968626401251254000012540000625114576401580019800388003873428673740401250200400072001100208003880038113020110099100100300001000000000111191651017007980925500001008003980039800398024780039
502048003859900000617968626401251254000012540000625114576401580019800388003873428773740401250200400072001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039
502048003860000000617968626401251254000012540000625114576401580019800388003873423773740401250200400072001100208003880038113020110099100100300001000000000111191651016017980925500001008003980039800398003980039
50204800386000000061796862640125125400001254000062511457640158001980038800387342867374040125020040007200110020800388003811302011009910010030000100000122631801112267511170008050924500001008073381006810098105481002
502048104860712020251717601477947246540363133402281254304064411481772158078080993810587339811074063431710228424852201186778111180796211302011009910010030000100021105071801112265510173008072921500001008131381310810008131781212
50204812636091312300117517944744640421133402281264471262611485725158067180997815137339222674167437730226429772261200258140381209281302011009910010030000100220125455801111937510145008058425500001008003980039800398003980039
5020480089621000001947968626401391254000012540000625114576401580019800388003873427673741401258200401742001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039
502048003860000000617968626401251254000012540000625114576401580019800388003873428673740401250200400072001100208003880038113020110099100100300001000000000111191751016007980925500001008003980039800398003980091
5020480038599000002517968626401251254000012540000625114576401580019800388003873428773740401250200400072001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039
5020480038600000007267968626401251254000012540000625114576401580019800388003873428673741401250200401742001100208003880038113020110099100100300001000000000111191651016007980925500001008003980039800398003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002480038600000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018905174479796350000108003980039800398003980039
500248003859900000654061796862640013134000013400006511457640180019080038800387344303737674001320400002011000080038800381130021109101030000100000018909173479796350000108003980039800398003980039
5002480291600000000061796862640013134000013400006511457640180019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173579796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173479796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173479796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173579796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018905174479796350000108003980039800398003980039
50024800385990000000726796862640025134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904173479796350000108003980039800398003980039
50024800385990000000726796862640013134000013400006511457640080019080038800387344303737674001320400002011000080038800381130021109101030000100000018904174479796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511458037080019080038800387344303737674001320400002011000080038800381130021109101030000100000018905173479796350000108003980039800398003980039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire (01) | cycle (02) | 03 | 18 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020460043449000961596962640125125400001254000062585771186002106004060040534397537424012520040007200110020600406004311302011009910010030000100000000111191601605989525500001006004160044600446004160041
5020460040450000061596962640125125400001254000062585766876002106009560040534396537434012520040007200110020600406004011302011009910010030000100000000111191701605983825500001006004160041600416004160044
50204600404490120961596962640125125400001254000062585766876002106004060040534397537434012520040007200110020600406004311302011009910010030000100000000111191701605983425500001006004160041600416004160041
5020460040450000061597172640134125400091254000062585767256002106004060040534396537434012520040007200110020600406004311302011009910010030000100000000111191701605983425500001006004160041600416004160044
5020460040450000061596962640125125400011254000062585766876002106004360040534397537434012520040007200110020600406004311302011009910010030000100220000111191701605983425500001006004160041600446004160041
5020460040450000961596962640125125400001254000062585767256002106004060040534397537424012520040007200110020600406004011302021009910010030000100000200111191701605983525500001006004160041600416004160041
5020460040449000061596962640126125400001254000062585766876002106004060043534396537464012520040007200110020600406004011302011009910010030000100000000111191601605983825500001006004160041600416004160041
5020460040450000061596962640125125400001254000062585766876002106004060040534397537424012520040007200110020600406004011302011009910010030000100000000111191701605983425500001006004160041600446004160041
5020460040449000161596962640125125400001254000062585766876002106004060040534426537424012520040007200110020600436004011302011009910010030000100000000111191701605983525500001006004460041600446004460044
5020460040449000061596962640125125400001254000062585766876002106004060040534396537434012520040007200110020600406004011302011009910010030000100200000111191702405983425500001006004460041600416004160041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire (01) | cycle (02) | 03 | 07 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500246004045000000016159696264001313400011340000658576687160021060040600435345435376940013204000020110000600406004011300211091010300001000000000018907175659826350000106004160041600416004460041
500246004045000009006159696264001313400001340000658576687160021060040600405345435376940013204000020110000600406004011300211091010300001000020000018906174659826350000106004160041600446004160041
500246004344900000006159696264001313400001340000658576687160024060040600405345735377240013204000020110000600406004011300211091010300001000000000018906174559826350000106004460041600416004160041
500246004045000000006159696264001313400001340000658576687160024060043600435345735377240013204000020110000600406004011300211091010300001000000000018905174559826350000106004160041600416004160041
500246004045000000006159696264001313400011340000658576687160021060040600405345735376940013204000020110000600406004011300211091010300001000000000018905175559826350000106004160041600446004160041
500246004045000000006159696264001313400011340000658576687160021060040600405345435376940013204000020110000600406004011300211091010300001000000000018905174559826350000106004160044600416004160041
50024600404490000510125559696264001313400011340000658576687160021060043600435345435376940013204000020110000600406004011300211091010300001000000001018906175659826350000106004160044600416004160041
500246004044900000006159696264001313400001340000658576687160021060040600405345435376940013204000020110000600406004311300211091010300001000000300018905174559826350000106004160041600446004160041
5002460040450000000072659696264001413400091340000658576687160021060040600435345435376940013204000020110000600436004311300211091010300001000000000018906175659826350000106004160041600446004160041
500246004044900000006159696264001313400001340000658576687160021060040600405345735377240013204000020110000600406004011300211091010300001000000000018906175559826350000106004160041600416004160041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50204400423000338839685264012712540003125400006255692152140023400424004233453733743401252004000720011002040042400421130201100991001003000010001111917163988125500001004004340043400434004340043
5020440042300036139687264012812540001125400006255692281040023401094004233453733743401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423002406139684264012512540001125400006255692272140023400424004233453633744401252004000720011003640042401001130201100991001003000010001111916163988125500001004004340043400434004340043
5020440042299016139685264012712540000125400006255693538140023400424004233453733744401252004000720011002040042400421130201100991001003000010001111916163988125500001004004340043400434004340043
5020440042300026139687264012812540002125400006255692208140023400424004233453733744401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000427139685264012612540003125400006255693510140023400424004233453733743401252004000720011002040042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000210339685264012812540003125400006255692195140023400424004233453733743401252004001320011002040042400421130201100991001003000010001111916163988125500001004004340043400434004340043
5020440042300918439685264012912540001125400006255693444140023400424004233453733743401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000320839687264012912540001125400006255692066140023400424004233453733744401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043
50204400423000352339685264012612540004125400006255692294140023400424004233453733743401252004000720011003640042400421130201100991001003000010001111917163988125500001004004340043400434004340043

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire (01) | cycle (02) | 03 | 07 | 19 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50024400422991021014083968526400161340002134000065569215204002340042400423346933377140013204000020110000400424004211300211091010300001021030189281779398713050000104004340043400434004340043
500244004230010631053968426400161340001134000065569229604002340042400423346933377140013204000020110000400424004211300211091010300001001060189211171212398713050000104004340043400434004340043
500244004230010153107396872640013134000413400006556934360400234004240042334693337714001320400002011000040042400421130021109101030000100106018921117713398713050000104004340043400434004340043
500244004230010150157396852640015134000213400006556922900400234004240042334693337714001320400002011000040042400421130021109101030000100003018909171111398713050000104004340043400434004340043
500244004230010212161396852640015134000213400006556934361400234004240042334693338074001320400002011000040042400421130021109101030000100200018921017811398713050000104004340043400434004340043
500244004230010151107396852640014134000213400006556935100400234004240042334693337714001320400002011000040042400421130021109101030000100103018921017811398713050000104004340043400434004340043
50024400423001062743396852640015134000213400006556921930400234004240042334693337714001320400002011000040042400421130021109101030000100100018921117129398713050000104004340043400434004340043
50024400422991021210539687264001413400021340000655692728040023400424004233469333771400132040000201100004004240042113002110910103000010020301892817117398713050000104004340306400434004340043
50024400423001011746213968726400161340003134000065569342904002340042400423346933377140013204000020110000400424004211300211091010300001001400018921117118398713050000104004340043400434004340043
500244004230010153843968426400151340002134000065569343604002340042400423346933377140013204000020110000400424004211300211091010300001000060189211171111398713050000104004340043400434004340043

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.16b, { v1.16b, v2.16b, v3.16b }, v4.16b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 18 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50204800385990001877968626401251254000012540000625114576400800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
5020480038599090827968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800385990002537968626401251254000012540000625114576400800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010002011119161607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576401800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
5020480038599000617968626401251254000012540000625114576401800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800938003980091
5020480038599000617968626401251254000012540000625114576400800198003880038734280673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
5020480038600000617968626401251254000012540000625114576400800198003880038734280773741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800386000003897968626401251254000012540000625114576400800198003880038734280773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d2 | icache miss (d3) | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500248003859900462794853654023914402281142888871148055501805658089680902734041387407742907204311720118645810568105120130021109101030000104221003870841890003170347987230050000108003980039800398003980649
5002480038643005241795532264011914401681444408661149675100808558144781669733713739294320324447872212128481414812022713002110910103000010000000301890003170337979630050000108003980039800398003980039
500248003859900617968626400131340036134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
500248003860000617968626400131340000134000065114576401180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
5002480038600045827968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170437979630050000108003980039800398003980039
500248003860000617968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
5002480038600001907968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890004170347979630050000108003980039800398003980039
5002480038599007267968626400131340000134000065114576400180019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
50024800386000072679686264001313400001340000651145764001800198003880038734433273767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039
50024800385990331937968626400131340000134000065114576400080019800388003873443373767400132040000201100008003880038113002110910103000010000000001890003170337979630050000108003980039800398003980039

Test 7: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v1.16b, 0
  tbx v1.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v2.16b, 0
  tbx v2.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v3.16b, 0
  tbx v3.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v4.16b, 0
  tbx v4.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v5.16b, 0
  tbx v5.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v6.16b, 0
  tbx v6.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v7.16b, 0
  tbx v7.16b, { v8.16b, v9.16b, v10.16b }, v11.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7506

retire (01) | cycle (02) | 03 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320204814846130000238026240125125240000125240000636329957160028060047600471997303200042402572002400002007200006004781588111602011009910010016000010022000900101102161160044253200001006004860048600486004860048
320204600474494552001320262401251252400001252400006253299571600283816436004719973032000424012520024000020072000060047600471116020110099100100160000100000011200101101161160044253200001008123960048600486004860048
32020460047450000048026240125125240000125240000625329957160028060047600474109903411952401252002400002007200006004760047111602011009910010016000010000001600101101161160044253200001006004860048600486004881190
320204600474500120061026240125125289154125240000625329957160028060047600471997303200042401252002400002007200006004760047111602011009910010016000010000001000101102162160044253200001006004860048600488123960048
320204600474500000713026240125125240000125240000625329957160028060047600471997303200042401252002400002007200008123860047111602011009910010016000010000000000101101161160044253200001006004860048812396004881190
320204600474500300734026240125125240000125240000625329957160028060047600472001103200042401252002400002007200006004760047111602011009910010016000010000000000101102161160044253200001006004860048600486028460048
320204600474500000726026240125125240000125240000625329957160028060108600471997303200042401252002400002007200006004760047111602011009910010016000010000000000101101171160044253200001006004860048600486004881239
32020460047450000048026240125125240000125240000625329957160028081825600471997303200042401252002400002007204026016460047111602011009910010016000010000000000101101161160044253200001006004860048600486004860048
320204600474500147880480262401251252891541252400006251142811460028060047600471997303200042401252002400002007200006004760047111602011009910010016000010000002000101102161160044253200001006004860048600488148460048
320204600474490000485835426240125125240000125240000625329957160028060047600471997303200042401252002400002007200006004760047111602011009910010016000010000000000101101161160044253200001006004860048600486004860048

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7506

retire (01) | cycle (02) | 03 | 04 | 09 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | c5 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320024600484501000000053026240013132400001324000065329957111060028060047600471999603200262400132024000020720000600476004711160021109101016000010010301002331113917111252760044316140320000108145081700812946004860048
3200246004745011000120053054240229132400001324000067329957111560028060047600471999603200262400132024000020720000600476004711160021109101016000010000001002384112517111202560044316138320000108170681338600486004860048
32002460047450111000002680262400131324000013240000653299571115600280600476004720004014201092410532024000020720000600476035141160021109101016000010000001002384112517111222560044316135320000106004860048600486034781504
3200246004744900000000535829526240013132400001424000065330209311560028060047810961999603200262400132024000020720000600476004711160021109101016000010000001002385112617111162560044316127320000106004860048600486004860048
3200246004745011000000112026240013132400001324000065329957111560028360047600471999693200262400132024000020720000600476004721160021109101016000010000201110022841034171112626600443167320000106004860048600486004860048
32002460047449000000002240262400131324000013240000653299571100600280600476004719996032002624001320240000207200006004760047111600211091010160000100000110023311126171122818600443167320000106004860048600486004860048
32002460047450110000009802624001313240000132400006532995711006002808116160047199960352002624001320240000207200006004760047111600211091010160000100000010023311125171112126600443167320000106004860048600486004860048
32002460047450110000002030262400131324000013240000653299571100600280600476004719996032002624001320240000207200006004760047111600211091010160000100000010023311126171112526600443167320000106004860048600486004860048
320024600474501100000079502629014813240000132400006532995710006002806004760047199960320026240013202400002072000060047600471116002110910101600001000000100233121161711121256004431614320000106004860048600486004860048
320024600474491100021009802624001313240000132400006532995711006002806004760047199960320026240013202400002072000060047600471116002110910101600001006200100233111261711117276004431613320000106004860048600486004860048