Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBL (four register table, 8B)

Test 1: uops

Code:

  tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.001

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire (01) | cycle (02) | 03 | 08 | 18 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
300440373000006136762530003000300055141004018403740373335336453000300090004037403711100110000073116113821300040384038403840384038
3004403730000410536762530003001300054964104018403740373335336453000300090004049403711100110000073116113821300040384038403840384038
3004403730000010536892530003000300054964514018403740373335336453000300090004049403711100110001073116113821300040384038403840504038
300440373000006136762530003000300054964104030403740373335336453000300090004037403711100110002073116113821300040384038403840384038
300440373000016136762530013001300054964514018403740373335336453000300090004037403711100110000073116113833300040384050403840384038
300440373000008236762530003001300054964114018404940373335336453000300090004037403711100110000073116113821300040384038403840384038
300440373000016136762530003001300054964504018403740373335336453000300090004037403711100110000073116113821300040384038403840384038
300440373000016136762530003001300054964514018403740373335336453000300090004037403711100110000073116113821300040384038403840384038
300440373200016136762530003004300054964114018403740373335336453000300090004049403711100110000073116113821300040504038403840504050
300440373000016136762530003001300054964104018403740373335336453000300090004037403711100110000073116113821300040384038403840384038

Test 2: Latency 1->2

Code:

  tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 1e | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30204400373000004061396762530101100300011003000050057084100400184003740037359426362403010020030008200900244003740037111020110099100100100001000071012162239819300001004003840038400384003840038
30204400373000000061396762530100100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001001671012163239819300001004003840038400384003840038
30204400373000000061396762530100100300011023000050057084101400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400384003840038
30204400373000004061396912530101100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000371012162239831300001004003840038400384003840038
30204400373000000061396912530100100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000071212162239819300001004003840038400384008440038
30204400373000004061396762530100100300001003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400384003840038
30204400372990001092396762530101100300001003000050057066410400184003740037359353362453010020030000200900004003740049111020110099100100100001000071012163239819300001004003840038400384003840038
30204400373000001061396762530101100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239831300001004005040038400384003840038
302044003729900040223396762530197100300001003000050057066451400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400384003840038
302044003730000010747396762530100100300011003000050057066410400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012162239819300001004003840050400504003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300244003730000000172639676253001010300011030000505706641040018400374003735957336267300102030000209000040037400371110021109101010000100006402163239818030000104003840038400384003840038
30024400372990000006139676253001010300001030000505706645040018400374003735973336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
30024400373000000046139676253001110300011030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
300244003730000000110339676253001110300011030000505706645040018400374003735957336267300102030000209000040037400371110021109101010000100006402162339830030000104003840038400384003840038
30024400403000000016139682253001010300001030000505706645140018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
300244003730000000125139676253001110300011030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100006402162339818030000104003840038400384003840041
300244003730000000182396761443035810301741030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
300244003730000000419139676253001110300011030000505706641040018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
3002440037300000001147396762530010103000110300005057066410400184003740037359572636267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
30024400373000000016139676253001010300011030000505706645040030400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038

Test 3: Latency 1->3

Code:

  tbl v1.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 04 | 08 | 0b | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3020440037300100001500040739676253010010030001100300005005706641004001840037400373593533625730100200300002009000040037400371110201100991001001000010000000000071010031622398190300001004003840038400384003840038
302044003730000000001059339676253010010030000100300005005706645004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004003840038400384003840050
30204400372990000000016139676253010110030001100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021632398190300001004003840038400384003840038
30204400372990000000106139676253010110030001100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004005040050400384003840038
3020440037300000000000104939676253010010030001100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004003840038400384003840038
302044003729900000004161396762530101100300861003000050057066410040018401414003735935213624530324200300002009000040037400491110201100991001001000010000000000071010021632398190300001004003840038400384003840038
30204400373000000000106139676253010110030001100300005005706645004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071210021622398190300001004003840038400384003840038
30204400373000000000106139676253010110030000100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000003000071010021632398190300001004003840038400384003840038
30204400373000000000106139676253010010030000100300005005706645004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004003840038400384003840038
30204400373000000000106639676253010110030000100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000020000071010021622398190300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 1f | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002440037299000010125939676253001010300011030000505706641040030400494004935957336267300102030252209000040037400491110021109101010000100000640216323981830000104003840038400384003840038
30024400373000000106639676253001110300041030000505708410140018400494003735957336267300102030000209000040037400371110021109101010000100000640216333981830000104003840038400504005040038
30024400373000000006139676253001010300001030000505708410140018400374003735957336267300102030000209000040049400371110021109101010000100000640416333981830000104003840038400384005040038
300244003729900000072639676253001110300001030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100000640216433981830000104003840038400384003840050
30024400372990000006139676253001110300011030000505706641140018400374003735957336267302312030000209000040037400371110021109101010000100000640416333981830000104003840038400504005040038
30024400373000000006139676253001110300011030000505706645040018400374003735957336267300102030000209000040037400371110021109101010000100000640416333981830000104005040038400384003840038
30024400372990000006139676253001110300001030000505706641140018400644003735957336267300102030000209000040037400371110021109101010000100000640216233981830000104003840038400384003840038
30024400373000000406139676253001010300011030000505706641140018400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038
300244004930000000072639682253001010300001030000505706641040018400374003735957336267300102030000209000040037400371110021109101010000100000640216433981830000104003840038400384003840038
30024400373000000006139676253001110300011030000505706645040018400374003735957336267300102030000209000040037400371110021109101010000100000640316233983030000104003840038400384003840038

Test 4: Latency 1->4

Code:

  tbl v2.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 09 | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30204400372990010000107239697253010110030094100300005665706624140018400374004035952336245301002003000020090000400404003711102011009910010010000100000002092403142534042223300001004089240634409284073440939
3020440849306115151980132020058573938616931115126309951353270064957142671405344063440411359041043650932837217330932229929740359406761211020110099100100100001000212278204093202163534031415300001004087640931409214095140038
302044003730000015000124396902530101100300001003000050057066231400184003740037359343362453010020030000200900004003740037111020110099100100100001000000000710021622398190300001004003840041400384004140038
302044003730000000007263968725301011003000110030000500570662314001840037400373593425362453010020030000200900004003740037111020110099100100100001002000000710121632398370300001004004140038400384006240038
3020440037300000001061396902530101100300001003000050057066241400184003740037359343362693010020030000200900004003740040111020110099100100100001000000000710121632398220300001004003840038400384003840038
3020440037300000000061396872530100100300081003000050057070571400184003740037359343362453010020030000200900004006140037111020110099100100100001000000201000710121632398190300001004003840038400384003840041
30204400372990007801061396872530101100300011003000050057066240400184003740037359343362453010020030000200900004003740061111020110099100100100001000000000710121632398190300001004003840038400624003840041
302044004030000000101661396872530100100300001003000050057066240400184003740037359343362453010020030000200900004004040061111020110099100100100001000000000710122622398190300001004006240038400384003840038
3020440037300000008061396902530100100300001003000050057104001400424003740037359343362453010020030000200900004003740037111020110099100100100001000000000710121632398220300001004004140062400384003840041
3020440037300000001061396872530101100300011003000050057066241400184004040061359343362693010020030000200900004006140037111020110099100100100001000000000710121632398192300001004006240041400384006240038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300244003730000000001125539687253001010300001030000505706624140018400494004935968336267300102030000209000040061400491110021109101010000100640316333982030000104003840038400384003840062
300244005530000000002186539703253001010300001030000505710400040030400374003735956336267300102030000209000040055400371110021109101010000100640316333983530000104003840038400624005640038
300244003730000000000187339694253001010300001030000505706624040018400374003735956336267300102030000209000040037400371110021109101010000100640316333981730000104003840038400384003840038
300244003730000000000186839687253001110300001030000505706624040042400614005535974336291300102030000209000040055400371110021109101010000100640316333981730000104003840038400504003840038
300244003730000000008129439690253001110300001030000505710400040018400374003735956336267300102030000209000040040400371110021109101010000100640316333981730000104005640050400384003840038
30024400373000000000052639687253001410300261030000505710400040042400614003735956336267300102030000209000040037400611110021109101010000100640316333984130000104006240041400384003840083
300244003730000000006185339687253001010300001030000505706624040018400374003735968336270302342030000209000040052400371110021109101010000100640316333981730000104003840041400414003840038
300244006130000000008180239687253001610300061030000505706624040018400374005535974336285300102030000209000040055400371110021109101010000100640316333981730000104005040038400384005040038
300244003730000000008186139687253001010300351030000505706624040042400374003735974336267300102030000209000040055400371110021109101010000100640316333981730000104003840038400384003840038
300244003730000000000133339687253001010300001030000505706624140021400404003735956336267300102030000209000040037400371110021109101010000100640316333981730000104007440038400384004140041

Test 5: Latency 1->5

Code:

  tbl v3.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30204400373000246061396872530100100300001003045053357076120400184003740037359343362453010020030000200900004011540090111020110099100100100001000071004162239819300001004003840038400384003840038
3020440037300001161396872530100100300001003000050057066240400214003740037359343362453010020030000200900004003740040111020110099100100100001001071012162239819300001004003840038400624003840038
3020440037300001061396872530101100300011003000050057066240400214004040061359373362453010020030000200900004003740037111020110099100100100001000071012162239822300001004003840041400384003840038
3020440037300001061396872530100100300011003000050057066230400184003740037359343362453010020030000200900004003740040111020110099100100100001000071212162239819300001004006240038400384003840038
3020440037300001061396872530101100300001003000050057066230400424006140037359373362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400414003840038
30204400373000000251396872530101100300001003000050057066240400424004040037359583362483010020030000200900004003740249111020110099100100100001000071012163239843300001004003840038400384003840038
3020440037299001061396902530100100300001003000050057066230400184006140037359343362453010020030000200900004003740040111020110099100100100001000071013162239819300001004003840038400564003840038
3020440037300001061396872530101100300011003000050057070570400214003740040359373362453010020030000200900004003740037111020110099100100100001000071012162239819300001004004140038400384003840038
3020440037299001061396872530108100300011003000050057070570400214004040061359343362453010020030000200900004003740040111020110099100100100001000071013162239819300001004006240038400384004140038
3020440037300000061396872530101100300001003000050057066240400424004040037359583362483010020030000200900004004040037111020110099100100100001002071012162239819300001004004140038400414003840083

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0a | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a7 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300244003729900000001613969025300101030000103000050570662404001840037400373595633626730010203000020900004003740037111002110910101000010000000006405163339817030000104003840038400384003840038
3002440037300000000009433968725300111030000103000050570662404001840037400373595633626730010203000020900004003740037111002110910101000010000000006403163339835030000104003840038400384003840038
300244003730000000001613968725300101030000103000050570662404001840037400373595633626730010203000020900004005540037111002110910101000010000000006403164339817030000104003840038400384003840038
300244003730000000000613968725300101030000103000050570662404001840037400373595633626730010203000020900004003740037111002110910101000010000000006403163339820030000104003840038400384003840038
300244003729900000000613968725300101030001103000050570662414001840037400373595633626730010203000020900004003740040111002110910101000010000000006403163339817030000104003840038400384003840038
3002440037300000000006313968725300101030000103000050570662404001840037400553595633626730010203000020900004003740037111002110910101000010000000006403163339817030000104003840041400384003840038
300244003730000000000613968725300111030000103000050570662314001840037400373595633626730010203000020900004003740037111002110910101000010000000006403163339817030000104003840038400384003840038
300244003730000000000613968725300101030000103000050570662314001840061400373595633626730010203000020900004003740037111002110910101000010000000006403163339817030000104003840038400384003840038
300244003730000000000613968725300101030000103000050570662404001840037400373595633626730010203000020900004009040037111002110910101000010000003006403164439817030000104003840038400384009040038
3002440247300000000016139687253001010300001030000505706623040018400894003735952123628230010203023520900004003740037111002110910101000010000000006403163339817030000104003840038400384003840038

Test 6: Latency 1->6

Code:

  tbl v4.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30204400372990000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000712021622398870300001004003840038400384003840038
30204400372990000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400372990000060061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300181003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
3020440037300000000006139779253010010030000100300005005714347140018400374003736025336245301002003000020090000400374003711102011009910010010000100001410710121622398870300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30024400373000000613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010006403162239888030000104003840038400384003840038
300244003730000006139753253001010300001030000505714347040018400374003736047336267300102030000209054040037400371110021109101010000100126402162239888030000104003840038400384003840038
300244003730000006139779253001010300001030000505714347040018400374003736047336267300102030000209000040037400371110021109101010000101216402162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006403162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
300244003730000001033977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400372990000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400372990000613977925300101030000103000050571434704001840084400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038

Test 7: throughput

Count: 8

Code:

  tbl v0.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v1.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v2.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v3.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v4.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v5.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v7.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7505

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010006111511601606003902400001006004360043600436004360043
24020460042449033262401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042449033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010080111511601606003902400001006004360043600436004360043
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
240204600424500698252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111512901606003902400001006004360043600436004360043
24020460042449033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042449033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042451033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7505

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240024600424491000000120043025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061666600390240000106004360043600436004360043
240024600424490000000000233025240010102400001024000050227997106002360042600422999603381182400102024000020720000602006004211800211091010800001000000300502071675600390240000106004360043600436004360043
24002460042450000000000085025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061676600390240000106004360043600436004360043
240024600424500000000000705025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061665600390240000106004360043600436004360043
2400246004244900000000007500252400101024000010240000502279971060023600426004229996033002224001020240000207200006004260042118002110910108000010000101200502061657600390240000106004360043600436009460043
24002460042450000000000043025240010102400001024000050228388806002360042600422999603449612400102024000020720000600426004211800211091010800001000000300502071665600390240000106004360043600436004360043
24002460042450000000000061025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061666600390240000106004360043600436004360043
24002460042450000000000043025240010102400001024000050228287306002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502051656600390240000106004360043600436004360043
24002460042450000000000043025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000300502061666600390240000106004360043600436919060043
240024600424490000000210043025240010102400001024000050227997116002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502051666600390240000106004360043600436004360043