Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBL (four register table, 16B)

Test 1: uops

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.001

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e1f373f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d tlb access (a0)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
300440373000000161367625300130003000549641140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403731000120161367625300130003000549641140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403730000000103367625300030013000549645140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403730000000613676253001300430005496411401840374037333522364530003000900040494037111001100010073216223821300040384038403840384038
300440373000000061367625300030003000549641040184049403733353364530003000900040374037111001100000073216223821300040384038403840504038
300440373100130161367625300430013000549641140184037403733353364530003000900040374037111001100000073216223821300040384050403840384038
3004403730000001613676253000300030005496410401840374037333533645300030009000403740371110011000008773216223821300040384038405040384038
300440373100000161367625300030013000549641140184037403733353364530003000900040374037111001100000073216223821300040384038403840384038
3004403730000000823676253001300030005514101403040374037333533645300030009000403740371110011000002773216223821300040384038403840384038
300440373000000161367625300430003000549641040184037403733353364530003000900040374037111001100000073216223821300040384038403840384038

Test 2: Latency 1->2

Code:

  tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)0918191e1f373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
302044003730000000010613967625301001003000110030000500570664114001840049400493593533624530100200300002009000040037400371110201100991001001000010000029000710131633398190300001004003840038400384005040038
302044003730000000000613967625301041003000010030000500570664114001840037400373594733624530100200300002009000040037400371110201100991001001000010000029000710131633398190300001004003840038400384003840038
302044003730000000000613967625301001003000010030000500570664514003040037400373593533624530100200300002009000040037400371110201100991001001000010000027000710131633398310300001004003840038400384003840038
30204400372990000001061396762530100100300001003000050057066411400184003740037359353362573010020030000200900004003740037111020110099100100100001000000000710131633398190300001004003840038400384003840038
30204400373000000001061396762530104100300011003000050057066451400184003740049359353362453010020030000200900004004940049111020110099100100100001000000000710131633398190300001004003840038400384003840038
30204400373000000000061396762530101100300001003000050057066411400184003740037359353362453010020030000200900004003740049111020110099100100100001000000000710131633398190300001004003840038400504003840050
30204400373000000004061396762530100100300011003000050057066411400184003740037359353362453010020030000200900004004940037111020110099100100100001000000000710131633398190300001004003840038400384003840038
30204400372990000001061396972530104100300001003000050057066411400184003740037359353362453010020030000200900004003740037111020110099100100100001000001000710131633398190300001004003840038400384003840038
30204400372990000001061396762530104100300001003000050057066451400184003740037359353362453010020030000200900004003740037111020110099100100100001000003060710131633398190300001004003840038400384003840038
302044003730000000000613967625301041003000010030000500570664514001840049400373593533624530100200300002009000040037400371110201100991001001000010000051030710131633398190300001004003840050400384003840050

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)18191e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
30024400373000000000613968225300111030000103000050570664514001840037400373595733626730010203000020900004003740037111002110910101000010000200006402162239818030000104003840038400384003840038
3002440037300000000161396912530010103000110300005057066411400184003740037359573362673001020300002090000400374003711100221091010100001000010306402162239818030000104003840038400384003840050
3002440040300000000161396762530010103000010300005057066431400184003740037359573362673001020300002090000400374003711100211091010100001000020006402162239818030000104003840038400384003840038
3002440037309000000161396762530011103000010300005057066451400184003740049359573362673001020300002090000400374003711100211091010100001000020006402162239818030000104003840038400384003840038
3002440037300000000161396762530010103000110300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000000006402162239818030000104003840038400384003840038
3002440037299000000082396762530011103000110300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840262
3002440037300000000161396762530010103000410300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840038
3002440049300000000161396762530014103000110300005057066411400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840041
3002440037299000000061396822530014103000010300005057066451400184003740037359573362673001020300002090000400374003711100211091010100001000000006402162239818030000104005040038400384003840038
3002440037300000000161396792530011103000010300005057066451400184003740037359573362673001020300002090000400374003711100211091010100001000010006402162239818030000104003840038400384003840050

Test 3: Latency 1->3

Code:

  tbl v1.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd0d2d5map dispatch bubble (d6)dbddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
30204400373000000000061396762530100100300001003000050057066411400184003740037359420636241301002003000820090024400374003711102011009910010010000100000000011171700016000398400300001004003840038400384003840038
30204400373000000000061396762530209100300011003000050057066411400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216032398190300001004003840038400504003840038
30204400373000000300061396762530101100300011003000050057066451400184003740037359350336245301002003000020090000400374003711102011009910010010000100221022794300009471041420224016917300001004079240782406274083640460
302044082830601141418481316201033967612230471133305261263179262657087531403184040240462358910583638332143218314212159499540526404119110201100991001001000010000002020007101081630544028018300001004062340896408154079640840
302044081930500101612001232120742639285236314341313120613633136670570934314068440820406163592803362453010020030000200900004003740037111020110099100100100001000001030000710102970233981917300001004005040465400384014540251
30204404662991014218601144194402239327339314401313113310030000500570664114061240285408103585908236360321412203093820493555402514030711102011009910010010000100000000000071010216022398190300001004003840038400384009740038
30204400373000000001061396762530100100300001003000050057066411400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071210216022398190300001004003840038400384009640038
302044003730000000000296396762530101100300001003000050057066411400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216022398190300001004003840038400384003840038
30204400372990000001061396764430101100300001003000050057066450400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216032398190300001004005040038400384003840038
30204400493000000001161396762530100100300001003000050057084100400184003740037359350336245301002003000020090000400374003711102011009910010010000100000000000071010216022398190300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)1e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
3002440037310001613967625300111030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100200640316333981830000104005040038400384005040038
3002440037300000613967625300111030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038
30024400373001017263967625300101030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104005040050400384020940038
3002440037300001613967625300101030001103000050570664101400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038
3002440037300031613967625300101030000113000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100020640316333981830000104003840038400384003840038
3002440037300000613967625300101030001103000050570664101400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333983030000104003840038400384003840038
30024400373000004973967625300111030000103000050570664301400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400864003840038
30024400372990004413967625300111030001103000050570664501400180400374004035957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400504003840038
3002440037300001613967625300141030000103000050570664501400180400374003735957336267300102030000209000040037400371110021109101010000100000662316333981830000104003840038400384003840038
3002440037299001613967625300111030001103000050570664100400180400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038

Test 4: Latency 1->4

Code:

  tbl v2.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch mispred nonspec (cb)cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
3020440037300000613970325301001003000110030000500570662404001840037400373593433624530100200300002009000040037400371110201100991001001000010000071002162239822300001004003840038400414006240038
3020440037300000613969025301011003000010030000500570662304001840037400373593433624530100200300002009000040037400401110201100991001001000010000071012162239819300001004003840038400414006240038
3020440040300010613968725301081003000010030000500570662304002140037400373593733624530100200300002009000040037400551110201100991001001000010000071012162239819300001004003840041400384004140038
3020440037300000613970325301001003000010030000500570662414002140037400403593433624530100200300002009000040037400371110201100991001001000010000071012162239822300001004003840038400384003840038
30204400613000001563968725301001003000010030000500570662314001840037400403595833624530100200300002009000040061400371110201100991001001000010000071002162239819300001004003840038400414004140038
3020440037299001613968725301011003000110030000500570662404001840040400373593433624530100200300002009000040037400371110201100991001001000010000071012162239819300001004003840038400414003840038
3020440037300000613968725301011003000110030000500570662314001840037400373593433624530100200300002009000040037400371110201100991001001000010000071012162239819300001004003840041400624003840038
3020440037300010613968725301011003000810030000500570705704001840037400373593433624530100200300002009000040037400401110201100991001001000010000071012162239819300001004003840038400384004140038
3020440037300010613968725301011003000110030000500570662404001840040400373593433624530100200300002009000040037400371110201100991001001000010000071012162239822300001004003840038400384003840038
3020440037300080613968725301011003000110030000500570662404001840037400613593433624530100200300002009000040061400401110201100991001001000010000071012162239819300001004003840062400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)1e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
300244003730000061396972530016103000010300005057066240400184006140037359800336267300102030000209000040037400371110021109101010000100000640316333981730000104003840038400384003840038
300244003730000061397032530018103000810300005057066241400184003740037359560336267300102030000209000040037400371110021109101010000100000640316323981730000104003840038400384003840056
300244003729900661396872530010103000010300005057104000400424006140037359560336291300102030000209000040037400371110021109101010000100000640316223981730000104003840038400384003840038
300244003730000861396872530011103000010300005057066240400654003740037359560336267300102030000209000040061400371110021109101010000100000640316233981730000104003840038400384003840056
300244005530000061396872530010103000010300005057066240400184003740037359560336267300102030000209000040037400371110021109101010000100000640216233981730000104003840056400384006240038
300244003730000061396872530010103000110300005057066240400184003740037359560336267300102030000209000040037400371110021109101010000100000640316333981730000104003840038400624005640062
300244003730000861396872530010103000010300005057104000400184003740037359560336267300102030000209000040037400371110021109101010000100000640316333981730000104003840062400384006240041
300244006130000061396872530018103000010300005057104000400184006140061359800336291300102030000209000040061400371110021109101010000100301640316333981730000104003840038400384003840038
300244003730000061396872530010103000010300005057066240400184003740037359560336267300102030000209000040055400611110021109101010000100000640316333981730000104003840038400384003840038
300244004030000861396872530010103000810300005057066240400424006140061359560336291300102030000209000040037400371110021109101010000100000640316333981730000104004140038400384003840038

Test 5: Latency 1->5

Code:

  tbl v3.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk instruction (07)181e373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
3020440037300002850061396902530101100300011003000050057066244002140040400373595833624530100200300002009000040061400401110201100991001001000010000071003163339819300001004003840041400384003840038
30204400373000001061396902530100100300001003000050057066244001840037400403593433624530100200300002009000040037400611110201100991001001000010000071003163339819300001004003840038400384003840038
30204400372990060061396872530100100300081003000050057066234001840037400373593733624530100200300002009000040037400401110201100991001001000010000071003163339819300001004003840038400384003840041
302044003730001540061396872530101100300011003000050057070574001840037400373593433624830100200300002009000040040400371110201100991001001000010000071003163339819300001004003840038400384003840038
30204400373000000061396902530108100300001003000050057070574001840061400373593433624830100200300002009000040037400401110201100991001001000010000071213163339819300001004006240038400384003840038
30204400373000001061396872530106100300011003000050057066234001840037400373593433624830100200300002009000040037400371110201100991001001000010000071013163339819300001004003840038400414003840038
302044003730000010613968725301001003000010030000500570662440018400374008935923113629130100200300002009000040040400371110201100991001001000010000071013164339819300001004004140038400384006240038
30204400403000000061396902530100100300001003000050057066244001840037400373593733624530100200300002009000040037400371110201100991001001000010000071213163339819300001004003840038400384003840038
30204400372990001082397032530101100300001003000050057066234001840037400373593433624530100200300002009000040037400401110201100991001001000010000071013163339819300001004003840038400384003840041
302044004029900511061396872530100100300011003000050057066244004240040400373593433624530100200300002009075340037400371110201100991001001000010000071013163339822300001004003840038400624004140038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03191e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
30024400373000480536396872530010103000010300005057095364001840037400373595633626730010203000020900004005540037111002110910101000010006402162239820030000104003840038400384006240038
3002440037300030861396872530011103000010300005057070574001840037400373595633626730010203000020900004003740037111002110910101000010006402162239817030000104003840038400384003840038
30024400373000354061397032530010103000010300005057066244001840037400373595633626730010203000020900004003740037111002110910101000010006402162239817030000104006240038400384003840038
3002440037300042061396872530010103000010300005057066244001840037400373595633627030010203000020900004004040037111002110910101000010006402163239817030000104003840038400384003840038
300244003730009861396872530010103000010300005057104004001840037400373598033626730010203000020900004003740037111002110910101000010006402162239817030000104004140038400384003840038
3002440037299096861396872530010103000010300005057104004004240061400373595633626730010203000020900004003740084111002110910101000010006402162239817030000104003840038400384005640038
30024400373000240232396872530010103000010300005057066244001840037400613595633626730010203000020900004003740037111002110910101000010006402162339817030000104003840038400384003840038
3002440037300024161396872530010103000010300005057066244001840037400373595633626730010203000020900004003740037111002110910101000010006402162239835030000104004140038400384005640038
3002440037300051161396902530011103000110300005057070574001840037400373595633626730010203000020900004003740037111002110910101000010006402162239817030000104003840038400384003840062
3002440037300027061396902530010103000010300005057066244001840037400373595933626730010203000020900004003740037111002110910101000010006402162239817030000104003840056400624003840038

Test 6: Latency 1->6

Code:

  tbl v4.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch indir (93)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
3020440037300000006900613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710021622398870300001004003840038400384003840038
3020440037300000003000613977925301001003000010030000500571434740065400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003018220090000400374003711102021009901001001000010000000000710121722398870300001004003840038400384003840038
3020440037300000000007263977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
30204400372990000000030023977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840087
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374008411102011009901001001000010000000000731121622398870300001004003840038400384003840038
302044003729900000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038
302044003730000000000613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121632398870300001004003840038400384003840038
302044003730000000001613977925301001003000010030000500571434740018400374003736025336245301002003000020090000400374003711102011009901001001000010000000000710121622398870300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
3002440037300000510613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000000006403162239888030000104003840038400384003840038
300244003730000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006403163239888030000104003840038400384003840038
3002440037300000270613977925300101030000103000050571434704006540037400373604733626730010203000020900004003740037111002110910101000010000000006402163339888030000104003840038400384003840038
300244003730000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006403163339888030000104003840038400384003840038
3002440037300011360613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006402162239888030000104003840038400384003840038
300244003730000030613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000000006402162239888030000104003840038400384003840038
300244003730000060613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000700006402163339888030000104003840038400384003840083
300244003730000060613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006402162239888030000104003840038400384003840038
300244003730000060613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010000000006403162239888030000104003840038400384003840038
300244003730000000613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010000000006403163339888030000104003840038400384003840038

Test 7: throughput

Count: 8

Code:

  tbl v0.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v1.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v2.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v3.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v4.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v5.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v6.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  tbl v7.16b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.16b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7505

retire uop (01)cycle (02)03l2 tlb miss data (0b)1e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfmap dispatch bubble (d6)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
2402046004245000332524010610024000610024002050022801030600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245000332524010610024000610024002050022801030600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245000332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801030600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245100332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
24020460042450006982524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004245000332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043
2402046004244900332524010610024000610024002050022801031600236004260042299776299932401202002400322007200966004260042118020110099100100800001000001115116166003902400001006004360043600436004360043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7505

retire uop (01)cycle (02)03l2 tlb miss instruction (0a)l2 tlb miss data (0b)191e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)a9acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
24002460053450000951825240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050204164360039240000106025560043600436009460043
2400246004245000004325240010102400001024000050227997160144060042600422999633002224001020240000207200006004260042118002110910108000010000300050204163360039240000106004360043600436004360043
2400246004244900004325240010102400001024043450227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000003050735164360039240000106004360043600436004360043
24002460042450010124325240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000400150203163360039240000106004360043600436025060043
24002460042450000070825240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050203163360039240000106004360043600436004360043
2400246004245000004325240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050205165360039240000106004360043600436004360043
2400246004244900004325240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050203163360039240000106004360043600436004360043
24002460042449000043252400101024000010240000502279971600230600426004229996330022240010202400002072000060042600421180021109101080000100001040050203163460039240000106004360043600436004360043
2400246004245000008725240010102400001024000050227997160023060042600422999633002224001020240000207200006004260042118002110910108000010000000050205163360039240000106004360043600436004360043
2400246004244900054432524001010240000102400005022799716078906091560984302289730614242549202421132072630661089611412218002110910108000010241000050205165560039240000106004360043600436004360043