Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBX (three register table, 8B)

Test 1: uops

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386801601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009486603760371110011000000073216225824300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225853300060386038603860386038
30046037460615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253012300030008386801601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450615687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038
30046037450845687253000300030008386800601860376037532235645300030009000603760371110011000000073216225787300060386038603860386038

Test 2: Latency 1->1

Code:

  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | inst fetch restart (de) | e0 | ? int output thing (e9) | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3020460037450000000072659687253010010030000100300005008587680160018600376003755929656240301002003000820090024600376003711102011009910010010000100000011171701605979600300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038
302046003744900000008959687253010010030000100300005008587680160018600376003755929756241301002003000820090024600376003711102011009910010010000100000011171601615979500300001006003860038600386003860038
3020460037450000042006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171701605979500300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171601605979600300001006003860038600386003860083
302046003745000000006159687253010010030000100300005008587680160018600376003755929756240301002003000820090024600376003711102011009910010010000100000011171601605979600300001006003860038600386003860038
302046003744900000006159687253010010030000100300005008587680160018600376003755929756241301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755929656241301002003000820090024600376003711102011009910010010000100000011171601605979600300001006003860038600386003860038
302046003745000000006159687253010010030000100300005008587680160018600376003755932756240301002003000820090024600376003711102011009910010010000100000011171701615979500300001006003860038600386003860038

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300246003745000000003215968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038
300246003744900000001935968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000100006402162259785030000106003860038600386003860038
300246003744900000001455968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162359785030000106003860038600386003860038
300246003744900000001245968725300101030000103000050858768016001806003760037559443562673016320300002090000600376003711100211091010100001000000006403162259785030000106003860038600386003860038
30024600374500000000615968725300101030000103000050858768016001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038
3002460037450000000040959533296301461530156173212877860564216055806070060713559856456528321452432311209695160716607161511002110910101000010002043026828412803260341530000106003860038600866003860038
300246003745000101848123206752595332553018012301801431672558606925160558060750607195599062564953229120324792497404604756090316110021109101010000100401033960088141795360433130000106120461147608126110061147
30024611854741000001531859566319301421430192103000050858768016073836079760811560107156565324452032473229644760713608111311002110910101000010000001998464011372259785130000106003860038600386003860038
30024600374650001120014115967625300101030000103000050858768016001806003760085559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038
300246003745000000001455968725300101230000103000050859152916001806003760037559443562673001020300002090000600376003711100211091010100001000000006402162259785030000106003860038600386003860038

Test 3: Latency 1->2

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v1.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 09 | 0a | 0b | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d6 | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50204800386000000000617968626401251254000012540000625114576400800190800388009273428773740401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
502048003859900000001297968626401251254000012540000625114576401800190800388003873428773740401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576400800190800388003873424673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576401800193800388003873431773741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
502048003860000000007267968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039800398003980039
50204800386000000000617968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
50204800385990000000617968626401251254000012540000625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119171607980925500001008003980039802438003980039
50204800385990004120010579686119401251254000012540608625114576401800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800398003980039
502048003860000000001037968626401251254000012540000625114576400800190800388003873428673741401252004000720011002080038800381130201100991001003000010000011119161607980925500001008003980039800928003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50024800385990000726796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905173579796350000108003980039800398003980039
50024800386000090726796862640013134000013400006511457842008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175379796350000108003980039800398003980039
5002480038599000061796652640013134000013400006511457640018001908003880038734430373767400132040000201100008003880038113002110910103000010000300018903173579796350000108003980039800398003980039
5002480038600000061796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175379796350000108003980039800398003980039
5002480038600000061796862640013134000013400006011457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905173579796350000108003980039800398003980039
5002480038599000061796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175879796350000108003980039800398003980039
5002480038599000061796862640013134000013400006511457640008001908003880038734430373767400132040000201100008003880038113002110910103000010000000018905175379796350000108003980039800398003980039
5002480038599000061796862640013134000013400006511457640008001908003880038734437373767400132040000201100008009280038113002110910103000010000000018905175779796350000108003980039800398003980039
50024800915990000726796862640013134000013400006511457640118001908003880038734430373767400132040000201100008003880038113002110910103000010000000018903175379796350000108003980039800398003980039
50024800386000000536796862640013134000013400006511457640008001908003880038734437373767400132040000201100008003880038113002110910103000010000000018903175379796350000108003980039800398003980039

Test 4: Latency 1->3

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v2.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50204600404500000000615969626401251254000012540000625857668706002160040600405343975374540125200400072001100206004060040113020110099100100300001000024601111916165983425500001006004160041600416004160041
50204600404500000000615969626401251254000012540000593857668706002160040600405344275374240125200400072001100206004060040113020110099100100300001000025201111916165983525500001006004160041600416004160041
50204600404500000000615969626401251254000012540000625857668706002160040600405343975374240125200400072001100206004060040113020110099100100300001000024601111917165983525500001006004160041600416004160041
50204600404500000000615969626401261254000012540000625857668706002160040600405343975374240125200400072001100206004060040113020110099100100300001000010201111916165983425500001006004160044600416004160041
502046004045000001320061596962640125125400001254000062585766871600216004360040534396537434012520040007200110020600406004011302011009910010030000100006001111917165983825500001006004160041600446004160041
50204600404500000000615969626401251254000012540000625857668706002160040600435343965374240125200400072001100206004060040113020110099100100300001000023401111917165983825500001006004160153601536004460041
502046015045101002760079759686264012612540000125400006258576687060021600406004053442753743401252004000720011002060043600401130201100991001003000010000301111916165983525500001006004160044600416004160041
5020460040450000000072659696264012512540000125400006258577118060021600406004053439653742401252004000720011002060040600401130201100991001003000010001301111917165983425500001006004160041600416004160041
50204600434500000001615971726401251254000012540000625857668716002160040600405344275374240125200400072001100206004060040113020110099100100300001000023101111916165983725500001006004160041600416004160041
50204600434500000000615969626401251254000012540000625857668706002160040600405344275374240125200400072001100206004060040113020110099100100300001000217101111916165983425500001006004160041600416004160041

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 4.0040

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50024600434500000006159696264001313400001341005658576687160021600406026753454235376940013204000020112341600406004011300211091010300001000009018904332259826250000106004160041600416004160041
5002460040450000005615969626400131340000134000065857668716002160040600405345435382840013204000020110000600406004011300211091010300001000000018902173259826350000106004160041600416004160041
5002460040449000000615969626400131340000134000065857668706002160040600405345435376940013204000020110000600406004011300211091010300001000000018902172259826350000106004160044600416004160041
5002460040449004000615969626400131340001134080465857668716002160040600405345435376940013204000020110000600406004011300211091010300001000013018902173259826350000106004460041600416004160044
5002460040450000001615969626400131340001134000065857668706002160040600405345435376940013204000020110000600406004011300211091010300001000000018902172259826350000106004160041600416004160041
5002460040449000000615969626400131340000134000081857668716002160040600405345435376940013204000020110000600406004011300211091010300001000000019112172259826350000106004160041600416004160041
500246021244900012006159658264001413400001340000658576687160021600406004053454455377240013204000020110000600406004311300211091010300001000200018902172259829350000106004160041600416004160041
5002460040449000000615969626400131340000134000065857668706002160040600435345435376940013204000020110000600406004011300211091010300001000000018902172259826350000106004160041600416004460044
500246004044900000011145969626400131340009134000065857672506002160040600405345435376940013204000020110000600406004311300211091010300001000006018902172259826350000106026660044600966004160041
50024600404490000006365971726400131340000134000065857711816002460150602625345445537874061520400002011000060040600401130021109101030000100001108018902492259829350000106004160041600416004160041

Test 5: Latency 1->4

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v3.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020440042300000210339687264012712540003125400006255692179140023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004229900036139685264012712540004125400006255692424040023400424004233453633744401252004000720011002040042400421130201100991001003000010000001111917173988125500001004004340043400434004340043
502044004230000016139685264012712540002125400006255693510040023400424004233453733744401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004230000026139685264012812540001125400006255691797040023400424004233453733744401252004000720011003640042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004229900016139685264012812540003125400006255692269040023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004230000036139684264012712540003125400006255692152140023400424004233453733743401252004000720011003640042400421130201100991001003000010000001111916163988125500001004004340043400434004340043
50204400423000004141739684264012612540003125400006255692179140023400424004233453733743401252004000720011003640042400421130201100991001003000010000001111917163988125500001004004340043400434004340043
502044004230000006139687264012712540003125400006255691934040023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111916163988125500001004004340043400434004340043
502044004230000026139685264012712540002125400006255693436040023400424004233453733743401252004000720011002040042400421130201100991001003000010000001111916163988125500001004004340043400434004340043
502044004230000006139686264012512540003125400006255692089040023400424004233453633744401252004000720011002040042400421130201100991001003000010000001111917163988125500001004004340043400434004340043

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0042

retire (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500244004230000613968526400161340001134000065569343704002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
5002440042300027683968426400131340002134000065569268204002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230003613968726400131340003134000065569268504002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230003613968426400161340003134000065569229604002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004229902613968526400171340001134000065569343604002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230001613968726400141340000134000065569226904002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230001613968426400161340002134000065569219504002340042400423346933377140013204000022110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230004613968426400151340002134000065569228704002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004230000613968526400141340001134000065569343604002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043
500244004229902613968426400161340003134000065569219504002340042400423346933377140013204000020110000400424004211300211091010300001000018902172239871350000104004340043400434004340043

Test 6: Latency 1->5

Chain cycles: 2

Code:

  movi v0.16b, 0
  tbx v0.8b, { v1.16b, v2.16b, v3.16b }, v4.8b
  add v4.16b, v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4
  movi v4.16b, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 08 | 09 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50204800386000012103796862640125125400001254000062511457640008001908003880038734287737414012520040007200110020800388003811302011009910010030000100006111195900316537980925500001008003980039800398003980039
5020480038599001261796862640125125400001254000062511457640108001908003880038734287737404012520040007200110020800388003811302011009910010030000100000111191700216437980925500001008003980039800398003980039
5020480038599001261796862640125125400001254000062511457640108001908003880038734286737414012520040007200110020800388003811302011009910010030000100040111191700316347980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640008001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191600416447980925500001008003980039800398003980039
502048003860000061796862640125125400001254000062511457640108001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191600316377980925500001008003980039800398003980039
5020480038599010298796862640125125400001254000062511457640108001938003880038734287737404012520040007200110020800388003811302011009910010030000100000111191600316337980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640008001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191700116337980925500001008003980039800398003980039
502048003860000061796862640125125400001254000062511457640108001908003880038734287737414012520040007200110020800388003811302011009910010030000100009111191700316317980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640008001908003880038734286737414012520040007200110020800388003811302011009910010030000100023111191700416437980925500001008003980039800398003980039
502048003859900061796862640125125400001254000062511457640108001908003880038734286737414012520040007200110020800388003811302011009910010030000100000111191600316337980925500001008003980039800398003980039

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 6.0038

retire (01) | cycle (02) | 03 | 04 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002480038600100000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000002018903173379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018904173379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398019480039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039
5002480038600000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903174379796350000108003980039800398003980039
5002480038599000000061796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903174379796350000108003980039800398003980039
50024800386000000000726796862640013134000013400006511457640080019800388003873443037376740013204000020110000800388003811300211091010300001000000000018903173379796350000108003980039800398003980039

Test 7: throughput

Count: 8

Code:

  movi v0.16b, 0
  tbx v0.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v1.16b, 0
  tbx v1.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v2.16b, 0
  tbx v2.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v3.16b, 0
  tbx v3.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v4.16b, 0
  tbx v4.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v5.16b, 0
  tbx v5.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v6.16b, 0
  tbx v6.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v7.16b, 0
  tbx v7.8b, { v8.16b, v9.16b, v10.16b }, v11.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7506

retire (01) | cycle (02) | 03 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020460057450004911552002624012512524000012524000062532995711600280600476004719973032000424012520024000020072000060047814691116020110099100100160000100000101105161160044253200001006004860048600486004860048
3202046004744900028302624012512524000012524000062532995711600940600476004719973034142624012520024000020072000060047814001116020110099100100160000100000101101161160044253200001006004860048818006004860048
3202046004744900022202624012512524000012524000062532995711600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100010101101161160044253200001006004860048600486004860048
3202046004744900020102624012512524000012524000062532995711600280817366004719973034132124012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048600486004881470
32020460047450000191602624012512524000012524000062532995711814050600476004741252032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001008134960048600488142560048
3202048130245000018002624012512529063212524000062532995711600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048600486004860048
32020460047450000108802624012512524000012524000062532995710600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048815296004860048
3202048173645000022202624012512524000012524000062532995710600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001008173760048600486004860048
320204809824500049553106202624012512528989912524000062532995710600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100000101101161160044253200001006004860048600486004860048
32020460047450000157026240125125240000125240000640114346831600280600476004719973032000424012520024000020072000060047600471116020110099100100160000100003101101161160044253200001006004860048600486004860048

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7506

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3200246004860900000001510262400131324000013240000653299571105600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228112017211920600443157320000108117960048600486004860048
320024600474500000000164026240013132400001324000065329957111560028600476004719996032002624001320240000207200006004760047111600211091010160000100000010022841191721120206004431514320000106004860048600486004860048
3200246004745000000001390262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917411619600443309320000106004860048600486004860048
320024600474500000000160026240013132400001324000065329957111560028815196004719996032002624001320240000207200006004760047111600211091010160000100000010022114119172111919600443157320000106004860107600486004860048
320024600474500000000107602624001313240000132400006532995711156002860047600471999603200262400132024000020720000600476004711160021109101016000010000001002284119172111919600443157320000106004860048600486004860048
3200246004745000000006290262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917211619600443157320000106004860048600486004860048
3200246004745000000005820262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917211198600443157320000106004860048600486004860048
3200246004744900000009580262400131324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100228411917211198600443157320000106004860048600486004881663
3200246004745000000001580262400961324000013240000653299571115600286004760047199960320026240013202400002072000060047600471116002110910101600001000000100248416172111020600445157320000106004860048600486004860048
320024600474500000000674579352624001313240000132400006532995711156002860047600471999603200262400132024000020720000600476004711160021109101016000010000001002211411917211198600443157320000106004860048600488164460048