Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

TBL (four register table, 16B)

Test 1: uops

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.001

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
300440373000000161367625300130003000549641140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403731000120161367625300130003000549641140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403730000000103367625300030013000549645140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403730000000613676253001300430005496411401840374037333522364530003000900040494037111001100010073216223821300040384038403840384038
300440373000000061367625300030003000549641040184049403733353364530003000900040374037111001100000073216223821300040384038403840504038
300440373100130161367625300430013000549641140184037403733353364530003000900040374037111001100000073216223821300040384050403840384038
3004403730000001613676253000300030005496410401840374037333533645300030009000403740371110011000008773216223821300040384038405040384038
300440373100000161367625300030013000549641140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403730000000823676253001300030005514101403040374037333533645300030009000403740371110011000002773216223821300040384038403840384038
300440373000000161367625300430003000549641040184037403733353364530003000900040374037111001100000073216223821300040384038403840384038

Test 2: Latency 1->2

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302044003730000000010613967625301001003000110030000500570664114001840049400493593533624530100200300002009000040037400371110201100991001001000010000029000710131633398190300001004003840038400384005040038
302044003730000000000613967625301041003000010030000500570664114001840037400373594733624530100200300002009000040037400371110201100991001001000010000029000710131633398190300001004003840038400384003840038
302044003730000000000613967625301001003000010030000500570664514003040037400373593533624530100200300002009000040037400371110201100991001001000010000027000710131633398310300001004003840038400384003840038
30204400372990000001061396762530100100300001003000050057066411400184003740037359353362573010020030000200900004003740037111020110099100100100001000000000710131633398190300001004003840038400384003840038
30204400373000000001061396762530104100300011003000050057066451400184003740049359353362453010020030000200900004004940049111020110099100100100001000000000710131633398190300001004003840038400384003840038
30204400373000000000061396762530101100300001003000050057066411400184003740037359353362453010020030000200900004003740049111020110099100100100001000000000710131633398190300001004003840038400504003840050
30204400373000000004061396762530100100300011003000050057066411400184003740037359353362453010020030000200900004004940037111020110099100100100001000000000710131633398190300001004003840038400384003840038
30204400372990000001061396972530104100300001003000050057066411400184003740037359353362453010020030000200900004003740037111020110099100100100001000001000710131633398190300001004003840038400384003840038
30204400372990000001061396762530104100300001003000050057066451400184003740037359353362453010020030000200900004003740037111020110099100100100001000003060710131633398190300001004003840038400384003840038
302044003730000000000613967625301041003000010030000500570664514001840049400373593533624530100200300002009000040037400371110201100991001001000010000051030710131633398190300001004003840050400384003840050

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30024400373000000000613968225300111030000103000050570664514001840037400373595733626730010203000020900004003740037111002110910101000010000200006402162239818030000104003840038400384003840038
3002440037300000000161396912530010103000110300005057066411400184003740037359573362673001020300002090000400374003711100221091010100001000010306402162239818030000104003840038400384003840050
3002440040300000000161396762530010103000010300005057066431400184003740037359573362673001020300002090000400374003711100211091010100001000020006402162239818030000104003840038400384003840038
3002440037309000000161396762530011103000010300005057066451400184003740049359573362673001020300002090000400374003711100211091010100001000020006402162239818030000104003840038400384003840038
3002440037300000000161396762530010103000110300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000000006402162239818030000104003840038400384003840038
3002440037299000000082396762530011103000110300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840262
3002440037300000000161396762530010103000410300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840038
3002440049300000000161396762530014103000110300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840041
3002440037299000000061396822530014103000010300005057066451400184003740037359573362673001020300002090000400374003711100211091010100001000000006402162239818030000104005040038400384003840038
3002440037300000000161396792530011103000010300005057066451400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840050

Test 3: Latency 1->3

Code:

  tbl v1.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30204400373000000000061396762530100100300001003000050057066411400184003740037359420636241301002003000820090024400374003711102011009910010010000100000000011171700016000398400300001004003840038400384003840038
30204400373000000000061396762530209100300011003000050057066411400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216032398190300001004003840038400504003840038
30204400373000000300061396762530101100300011003000050057066451400184003740037359350336245301002003000020090000400374003711102011009910010010000100221022794300009471041420224016917300001004079240782406274083640460
302044082830601141418481316201033967612230471133305261263179262657087531403184040240462358910583638332143218314212159499540526404119110201100991001001000010000002020007101081630544028018300001004062340896408154079640840
302044081930500101612001232120742639285236314341313120613633136670570934314068440820406163592803362453010020030000200900004003740037111020110099100100100001000001030000710102970233981917300001004005040465400384014540251
30204404662991014218601144194402239327339314401313113310030000500570664114061240285408103585908236360321412203093820493555402514030711102011009910010010000100000000000071010216022398190300001004003840038400384009740038
30204400373000000001061396762530100100300001003000050057066411400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071210216022398190300001004003840038400384009640038
302044003730000000000296396762530101100300001003000050057066411400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216022398190300001004003840038400384003840038
30204400372990000001061396764430101100300001003000050057066450400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216032398190300001004005040038400384003840038
30204400493000000001161396762530100100300001003000050057084100400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216022398190300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002440037310001613967625300111030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100200640316333981830000104005040038400384005040038
3002440037300000613967625300111030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038
30024400373001017263967625300101030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104005040050400384020940038
3002440037300001613967625300101030001103000050570664101400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038
3002440037300031613967625300101030000113000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100020640316333981830000104003840038400384003840038
3002440037300000613967625300101030001103000050570664101400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333983030000104003840038400384003840038
30024400373000004973967625300111030000103000050570664301400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400864003840038
30024400372990004413967625300111030001103000050570664501400180400374004035957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400504003840038
3002440037300001613967625300141030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100000662316333981830000104003840038400384003840038
3002440037299001613967625300111030001103000050570664100400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038

Test 4: Latency 1->4

Code:

  tbl v2.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 1e | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3020440037300000613970325301001003000110030000500570662404001840037400373593433624530100200300002009000040037400371110201100991001001000010000071002162239822300001004003840038400414006240038
3020440037300000613969025301011003000010030000500570662304001840037400373593433624530100200300002009000040037400401110201100991001001000010000071012162239819300001004003840038400414006240038
3020440040300010613968725301081003000010030000500570662304002140037400373593733624530100200300002009000040037400551110201100991001001000010000071012162239819300001004003840041400384004140038
3020440037300000613970325301001003000010030000500570662414002140037400403593433624530100200300002009000040037400371110201100991001001000010000071012162239822300001004003840038400384003840038
30204400613000001563968725301001003000010030000500570662314001840037400403595833624530100200300002009000040061400371110201100991001001000010000071002162239819300001004003840038400414004140038
3020440037299001613968725301011003000110030000500570662404001840040400373593433624530100200300002009000040037400371110201100991001001000010000071012162239819300001004003840038400414003840038
3020440037300000613968725301011003000110030000500570662314001840037400373593433624530100200300002009000040037400371110201100991001001000010000071012162239819300001004003840041400624003840038
3020440037300010613968725301011003000810030000500570705704001840037400373593433624530100200300002009000040037400401110201100991001001000010000071012162239819300001004003840038400384004140038
3020440037300010613968725301011003000110030000500570662404001840040400373593433624530100200300002009000040037400371110201100991001001000010000071012162239822300001004003840038400384003840038
3020440037300080613968725301011003000110030000500570662404001840037400613593433624530100200300002009000040061400401110201100991001001000010000071012162239819300001004003840062400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300244003730000061396972530016103000010300005057066240400184006140037359800336267300102030000209000040037400371110021109101010000100000640316333981730000104003840038400384003840038
300244003730000061397032530018103000810300005057066241400184003740037359560336267300102030000209000040037400371110021109101010000100000640316323981730000104003840038400384003840056
300244003729900661396872530010103000010300005057104000400424006140037359560336291300102030000209000040037400371110021109101010000100000640316223981730000104003840038400384003840038
300244003730000861396872530011103000010300005057066240400654003740037359560336267300102030000209000040061400371110021109101010000100000640316233981730000104003840038400384003840056
300244005530000061396872530010103000010300005057066240400184003740037359560336267300102030000209000040037400371110021109101010000100000640216233981730000104003840056400384006240038
300244003730000061396872530010103000110300005057066240400184003740037359560336267300102030000209000040037400371110021109101010000100000640316333981730000104003840038400624005640062
300244003730000861396872530010103000010300005057104000400184003740037359560336267300102030000209000040037400371110021109101010000100000640316333981730000104003840062400384006240041
300244006130000061396872530018103000010300005057104000400184006140061359800336291300102030000209000040061400371110021109101010000100301640316333981730000104003840038400384003840038
300244003730000061396872530010103000010300005057066240400184003740037359560336267300102030000209000040055400611110021109101010000100000640316333981730000104003840038400384003840038
300244004030000861396872530010103000810300005057066240400424006140061359560336291300102030000209000040037400371110021109101010000100000640316333981730000104004140038400384003840038

Test 5: Latency 1->5

Code:

  tbl v3.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 07 | 18 | 1e | 37 | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3020440037300002850061396902530101100300011003000050057066244002140040400373595833624530100200300002009000040061400401110201100991001001000010000071003163339819300001004003840041400384003840038
30204400373000001061396902530100100300001003000050057066244001840037400403593433624530100200300002009000040037400611110201100991001001000010000071003163339819300001004003840038400384003840038
30204400372990060061396872530100100300081003000050057066234001840037400373593733624530100200300002009000040037400401110201100991001001000010000071003163339819300001004003840038400384003840041
302044003730001540061396872530101100300011003000050057070574001840037400373593433624830100200300002009000040040400371110201100991001001000010000071003163339819300001004003840038400384003840038
30204400373000000061396902530108100300001003000050057070574001840061400373593433624830100200300002009000040037400401110201100991001001000010000071213163339819300001004006240038400384003840038
30204400373000001061396872530106100300011003000050057066234001840037400373593433624830100200300002009000040037400371110201100991001001000010000071013163339819300001004003840038400414003840038
302044003730000010613968725301001003000010030000500570662440018400374008935923113629130100200300002009000040040400371110201100991001001000010000071013164339819300001004004140038400384006240038
30204400403000000061396902530100100300001003000050057066244001840037400373593733624530100200300002009000040037400371110201100991001001000010000071213163339819300001004003840038400384003840038
30204400372990001082397032530101100300001003000050057066234001840037400373593433624530100200300002009000040037400401110201100991001001000010000071013163339819300001004003840038400384003840041
302044004029900511061396872530100100300011003000050057066244004240040400373593433624530100200300002009075340037400371110201100991001001000010000071013163339822300001004003840038400624004140038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 19 | 1e | 37 | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30024400373000480536396872530010103000010300005057095364001840037400373595633626730010203000020900004005540037111002110910101000010006402162239820030000104003840038400384006240038
3002440037300030861396872530011103000010300005057070574001840037400373595633626730010203000020900004003740037111002110910101000010006402162239817030000104003840038400384003840038
30024400373000354061397032530010103000010300005057066244001840037400373595633626730010203000020900004003740037111002110910101000010006402162239817030000104006240038400384003840038
3002440037300042061396872530010103000010300005057066244001840037400373595633627030010203000020900004004040037111002110910101000010006402163239817030000104003840038400384003840038
300244003730009861396872530010103000010300005057104004001840037400373598033626730010203000020900004003740037111002110910101000010006402162239817030000104004140038400384003840038
3002440037299096861396872530010103000010300005057104004004240061400373595633626730010203000020900004003740084111002110910101000010006402162239817030000104003840038400384005640038
30024400373000240232396872530010103000010300005057066244001840037400613595633626730010203000020900004003740037111002110910101000010006402162339817030000104003840038400384003840038
3002440037300024161396872530010103000010300005057066244001840037400373595633626730010203000020900004003740037111002110910101000010006402162239835030000104004140038400384005640038
3002440037300051161396902530011103000110300005057070574001840037400373595633626730010203000020900004003740037111002110910101000010006402162239817030000104003840038400384003840062
3002440037300027061396902530010103000010300005057066244001840037400373595933626730010203000020900004003740037111002110910101000010006402162239817030000104003840056400624003840038

Test 6: Latency 1->6

Code:

  tbl v4.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 93 | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3020440037300000006900613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710021622398870300001004003840038400384003840038
3020440037300000003000613977925301001003000010030000500571434740065400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003018220090000400374003711102021009901001001000010000000000710121722398870300001004003840038400384003840038
3020440037300000000007263977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
30204400372990000000030023977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840087
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374008411102011009901001001000010000000000731121622398870300001004003840038400384003840038
302044003729900000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121632398870300001004003840038400384003840038
302044003730000000001613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3002440037300000510613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000000006403162239888030000104003840038400384003840038
300244003730000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006403163239888030000104003840038400384003840038
3002440037300000270613977925300101030000103000050571434704006540037400373604733626730010203000020900004003740037111002110910101000010000000006402163339888030000104003840038400384003840038
300244003730000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006403163339888030000104003840038400384003840038
3002440037300011360613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006402162239888030000104003840038400384003840038
300244003730000030613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000000006402162239888030000104003840038400384003840038
300244003730000060613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000700006402163339888030000104003840038400384003840083
300244003730000060613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006402162239888030000104003840038400384003840038
300244003730000060613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006403162239888030000104003840038400384003840038
300244003730000000613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000000006403163339888030000104003840038400384003840038

Test 7: throughput

Count: 8

Code:

  tbl v0.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v1.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v3.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v4.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v5.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v6.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7505

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2402046004245000332524010610024000610024002050022801030600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245000332524010610024000610024002050022801030600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245000332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801030600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245100332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
24020460042450006982524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245000332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7505

retire (01) | cycle (02) | 03 | 0a | 0b | 19 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a7 | a8 | a9 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
24002460053450000951825240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050204164360039240000106025560043600436009460043
2400246004245000004325240010102400001024000050227997160144060042600422999633002224001020240000207200006004260042118002110910108000010000300050204163360039240000106004360043600436004360043
2400246004244900004325240010102400001024043450227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000003050735164360039240000106004360043600436004360043
24002460042450010124325240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000400150203163360039240000106004360043600436025060043
24002460042450000070825240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050203163360039240000106004360043600436004360043
2400246004245000004325240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050205165360039240000106004360043600436004360043
2400246004244900004325240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050203163360039240000106004360043600436004360043
24002460042449000043252400101024000010240000502279971600230600426004229996330022240010202400002072000060042600421180021109101080000100001040050203163460039240000106004360043600436004360043
2400246004245000008725240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050205163360039240000106004360043600436004360043
2400246004244900054432524001010240000102400005022799716078906091560984302289730614242549202421132072630661089611412218002110910108000010241000050205165560039240000106004360043600436004360043