Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

TBL (four register table, 8B)

Test 1: uops

Code:

  tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 3.000

Issues: 3.001

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 3.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
300440373000006136762530003000300055141004018403740373335336453000300090004037403711100110000073116113821300040384038403840384038
3004403730000410536762530003001300054964104018403740373335336453000300090004049403711100110000073116113821300040384038403840384038
3004403730000010536892530003000300054964514018403740373335336453000300090004049403711100110001073116113821300040384038403840504038
300440373000006136762530003000300054964104030403740373335336453000300090004037403711100110002073116113821300040384038403840384038
300440373000016136762530013001300054964514018403740373335336453000300090004037403711100110000073116113833300040384050403840384038
300440373000008236762530003001300054964114018404940373335336453000300090004037403711100110000073116113821300040384038403840384038
300440373000016136762530003001300054964504018403740373335336453000300090004037403711100110000073116113821300040384038403840384038
300440373000016136762530003001300054964514018403740373335336453000300090004037403711100110000073116113821300040384038403840384038
300440373200016136762530003004300054964114018403740373335336453000300090004049403711100110000073116113821300040504038403840504050
300440373000016136762530003001300054964104018403740373335336453000300090004037403711100110000073116113821300040384038403840384038

Test 2: Latency 1->2

Code:

  tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204400373000004061396762530101100300011003000050057084100400184003740037359426362403010020030008200900244003740037111020110099100100100001000071012162239819300001004003840038400384003840038
30204400373000000061396762530100100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001001671012163239819300001004003840038400384003840038
30204400373000000061396762530100100300011023000050057084101400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400384003840038
30204400373000004061396912530101100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000371012162239831300001004003840038400384003840038
30204400373000000061396912530100100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000071212162239819300001004003840038400384008440038
30204400373000004061396762530100100300001003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400384003840038
30204400372990001092396762530101100300001003000050057066410400184003740037359353362453010020030000200900004003740049111020110099100100100001000071012163239819300001004003840038400384003840038
30204400373000001061396762530101100300011003000050057066450400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239831300001004005040038400384003840038
302044003729900040223396762530197100300001003000050057066451400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400384003840038
302044003730000010747396762530100100300011003000050057066410400184003740037359353362453010020030000200900004003740037111020110099100100100001000071012162239819300001004003840050400504003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300244003730000000172639676253001010300011030000505706641040018400374003735957336267300102030000209000040037400371110021109101010000100006402163239818030000104003840038400384003840038
30024400372990000006139676253001010300001030000505706645040018400374003735973336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
30024400373000000046139676253001110300011030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
300244003730000000110339676253001110300011030000505706645040018400374003735957336267300102030000209000040037400371110021109101010000100006402162339830030000104003840038400384003840038
30024400403000000016139682253001010300001030000505706645140018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
300244003730000000125139676253001110300011030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100006402162339818030000104003840038400384003840041
300244003730000000182396761443035810301741030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
300244003730000000419139676253001110300011030000505706641040018400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
3002440037300000001147396762530010103000110300005057066410400184003740037359572636267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038
30024400373000000016139676253001010300011030000505706645040030400374003735957336267300102030000209000040037400371110021109101010000100006402162239818030000104003840038400384003840038

Test 3: Latency 1->3

Code:

  tbl v1.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020440037300100001500040739676253010010030001100300005005706641004001840037400373593533625730100200300002009000040037400371110201100991001001000010000000000071010031622398190300001004003840038400384003840038
302044003730000000001059339676253010010030000100300005005706645004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004003840038400384003840050
30204400372990000000016139676253010110030001100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021632398190300001004003840038400384003840038
30204400372990000000106139676253010110030001100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004005040050400384003840038
3020440037300000000000104939676253010010030001100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004003840038400384003840038
302044003729900000004161396762530101100300861003000050057066410040018401414003735935213624530324200300002009000040037400491110201100991001001000010000000000071010021632398190300001004003840038400384003840038
30204400373000000000106139676253010110030001100300005005706645004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071210021622398190300001004003840038400384003840038
30204400373000000000106139676253010110030000100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000003000071010021632398190300001004003840038400384003840038
30204400373000000000106139676253010010030000100300005005706645004001840037400373593533624530100200300002009000040037400371110201100991001001000010000000000071010021622398190300001004003840038400384003840038
30204400373000000000106639676253010110030000100300005005706641004001840037400373593533624530100200300002009000040037400371110201100991001001000010000020000071010021622398190300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002440037299000010125939676253001010300011030000505706641040030400494004935957336267300102030252209000040037400491110021109101010000100000640216323981830000104003840038400384003840038
30024400373000000106639676253001110300041030000505708410140018400494003735957336267300102030000209000040037400371110021109101010000100000640216333981830000104003840038400504005040038
30024400373000000006139676253001010300001030000505708410140018400374003735957336267300102030000209000040049400371110021109101010000100000640416333981830000104003840038400384005040038
300244003729900000072639676253001110300001030000505706643040018400374003735957336267300102030000209000040037400371110021109101010000100000640216433981830000104003840038400384003840050
30024400372990000006139676253001110300011030000505706641140018400374003735957336267302312030000209000040037400371110021109101010000100000640416333981830000104003840038400504005040038
30024400373000000006139676253001110300011030000505706645040018400374003735957336267300102030000209000040037400371110021109101010000100000640416333981830000104005040038400384003840038
30024400372990000006139676253001110300001030000505706641140018400644003735957336267300102030000209000040037400371110021109101010000100000640216233981830000104003840038400384003840038
30024400373000000406139676253001010300011030000505706641140018400374003735957336267300102030000209000040037400371110021109101010000100000640316333981830000104003840038400384003840038
300244004930000000072639682253001010300001030000505706641040018400374003735957336267300102030000209000040037400371110021109101010000100000640216433981830000104003840038400384003840038
30024400373000000006139676253001110300011030000505706645040018400374003735957336267300102030000209000040037400371110021109101010000100000640316233983030000104003840038400384003840038

Test 4: Latency 1->4

Code:

  tbl v2.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 09 | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204400372990010000107239697253010110030094100300005665706624140018400374004035952336245301002003000020090000400404003711102011009910010010000100000002092403142534042223300001004089240634409284073440939
3020440849306115151980132020058573938616931115126309951353270064957142671405344063440411359041043650932837217330932229929740359406761211020110099100100100001000212278204093202163534031415300001004087640931409214095140038
302044003730000015000124396902530101100300001003000050057066231400184003740037359343362453010020030000200900004003740037111020110099100100100001000000000710021622398190300001004003840041400384004140038
302044003730000000007263968725301011003000110030000500570662314001840037400373593425362453010020030000200900004003740037111020110099100100100001002000000710121632398370300001004004140038400384006240038
3020440037300000001061396902530101100300001003000050057066241400184003740037359343362693010020030000200900004003740040111020110099100100100001000000000710121632398220300001004003840038400384003840038
3020440037300000000061396872530100100300081003000050057070571400184003740037359343362453010020030000200900004006140037111020110099100100100001000000201000710121632398190300001004003840038400384003840041
30204400372990007801061396872530101100300011003000050057066240400184003740037359343362453010020030000200900004003740061111020110099100100100001000000000710121632398190300001004003840038400624003840041
302044004030000000101661396872530100100300001003000050057066240400184003740037359343362453010020030000200900004004040061111020110099100100100001000000000710122622398190300001004006240038400384003840038
3020440037300000008061396902530100100300001003000050057104001400424003740037359343362453010020030000200900004003740037111020110099100100100001000000000710121632398220300001004004140062400384003840041
3020440037300000001061396872530101100300011003000050057066241400184004040061359343362693010020030000200900004006140037111020110099100100100001000000000710121632398192300001004006240041400384006240038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300244003730000000001125539687253001010300001030000505706624140018400494004935968336267300102030000209000040061400491110021109101010000100640316333982030000104003840038400384003840062
300244005530000000002186539703253001010300001030000505710400040030400374003735956336267300102030000209000040055400371110021109101010000100640316333983530000104003840038400624005640038
300244003730000000000187339694253001010300001030000505706624040018400374003735956336267300102030000209000040037400371110021109101010000100640316333981730000104003840038400384003840038
300244003730000000000186839687253001110300001030000505706624040042400614005535974336291300102030000209000040055400371110021109101010000100640316333981730000104003840038400504003840038
300244003730000000008129439690253001110300001030000505710400040018400374003735956336267300102030000209000040040400371110021109101010000100640316333981730000104005640050400384003840038
30024400373000000000052639687253001410300261030000505710400040042400614003735956336267300102030000209000040037400611110021109101010000100640316333984130000104006240041400384003840083
300244003730000000006185339687253001010300001030000505706624040018400374003735968336270302342030000209000040052400371110021109101010000100640316333981730000104003840041400414003840038
300244006130000000008180239687253001610300061030000505706624040018400374005535974336285300102030000209000040055400371110021109101010000100640316333981730000104005040038400384005040038
300244003730000000008186139687253001010300351030000505706624040042400374003735974336267300102030000209000040055400371110021109101010000100640316333981730000104003840038400384003840038
300244003730000000000133339687253001010300001030000505706624140021400404003735956336267300102030000209000040037400371110021109101010000100640316333981730000104007440038400384004140041

Test 5: Latency 1->5

Code:

  tbl v3.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204400373000246061396872530100100300001003045053357076120400184003740037359343362453010020030000200900004011540090111020110099100100100001000071004162239819300001004003840038400384003840038
3020440037300001161396872530100100300001003000050057066240400214003740037359343362453010020030000200900004003740040111020110099100100100001001071012162239819300001004003840038400624003840038
3020440037300001061396872530101100300011003000050057066240400214004040061359373362453010020030000200900004003740037111020110099100100100001000071012162239822300001004003840041400384003840038
3020440037300001061396872530100100300011003000050057066230400184003740037359343362453010020030000200900004003740040111020110099100100100001000071212162239819300001004006240038400384003840038
3020440037300001061396872530101100300001003000050057066230400424006140037359373362453010020030000200900004003740037111020110099100100100001000071012163239819300001004003840038400414003840038
30204400373000000251396872530101100300001003000050057066240400424004040037359583362483010020030000200900004003740249111020110099100100100001000071012163239843300001004003840038400384003840038
3020440037299001061396902530100100300001003000050057066230400184006140037359343362453010020030000200900004003740040111020110099100100100001000071013162239819300001004003840038400564003840038
3020440037300001061396872530101100300011003000050057070570400214003740040359373362453010020030000200900004003740037111020110099100100100001000071012162239819300001004004140038400384003840038
3020440037299001061396872530108100300011003000050057070570400214004040061359343362453010020030000200900004003740040111020110099100100100001000071013162239819300001004006240038400384004140038
3020440037300000061396872530101100300001003000050057066240400424004040037359583362483010020030000200900004004040037111020110099100100100001002071012162239819300001004004140038400414003840083

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300244003729900000001613969025300101030000103000050570662404001840037400373595633626730010203000020900004003740037111002110910101000010000000006405163339817030000104003840038400384003840038
3002440037300000000009433968725300111030000103000050570662404001840037400373595633626730010203000020900004003740037111002110910101000010000000006403163339835030000104003840038400384003840038
300244003730000000001613968725300101030000103000050570662404001840037400373595633626730010203000020900004005540037111002110910101000010000000006403164339817030000104003840038400384003840038
300244003730000000000613968725300101030000103000050570662404001840037400373595633626730010203000020900004003740037111002110910101000010000000006403163339820030000104003840038400384003840038
300244003729900000000613968725300101030001103000050570662414001840037400373595633626730010203000020900004003740040111002110910101000010000000006403163339817030000104003840038400384003840038
3002440037300000000006313968725300101030000103000050570662404001840037400553595633626730010203000020900004003740037111002110910101000010000000006403163339817030000104003840041400384003840038
300244003730000000000613968725300111030000103000050570662314001840037400373595633626730010203000020900004003740037111002110910101000010000000006403163339817030000104003840038400384003840038
300244003730000000000613968725300101030000103000050570662314001840061400373595633626730010203000020900004003740037111002110910101000010000000006403163339817030000104003840038400384003840038
300244003730000000000613968725300101030000103000050570662404001840037400373595633626730010203000020900004009040037111002110910101000010000003006403164439817030000104003840038400384009040038
3002440247300000000016139687253001010300001030000505706623040018400894003735952123628230010203023520900004003740037111002110910101000010000000006403163339817030000104003840038400384003840038

Test 6: Latency 1->6

Code:

  tbl v4.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204400372990000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000712021622398870300001004003840038400384003840038
30204400372990000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400372990000060061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300181003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
30204400373000000000061397792530100100300001003000050057143470400184003740037360253362453010020030000200900004003740037111020110099100100100001000000710121622398870300001004003840038400384003840038
3020440037300000000006139779253010010030000100300005005714347140018400374003736025336245301002003000020090000400374003711102011009910010010000100001410710121622398870300001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024400373000000613977925300101030000103000050571434714001840037400373604733626730010203000020900004003740037111002110910101000010006403162239888030000104003840038400384003840038
300244003730000006139753253001010300001030000505714347040018400374003736047336267300102030000209054040037400371110021109101010000100126402162239888030000104003840038400384003840038
300244003730000006139779253001010300001030000505714347040018400374003736047336267300102030000209000040037400371110021109101010000101216402162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006403162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
300244003730000001033977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400373000000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400372990000613977925300101030000103000050571434704001840037400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038
30024400372990000613977925300101030000103000050571434704001840084400373604733626730010203000020900004003740037111002110910101000010006402162239888030000104003840038400384003840038

Test 7: throughput

Count: 8

Code:

  tbl v0.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v1.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v2.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v3.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v4.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v5.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v6.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  tbl v7.8b, { v8.16b, v9.16b, v10.16b, v11.16b }, v12.8b
  movi v8.16b, 9
  movi v9.16b, 10
  movi v10.16b, 11
  movi v11.16b, 12
  movi v12.16b, 13

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7505

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010006111511601606003902400001006004360043600436004360043
24020460042449033262401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042449033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010080111511601606003902400001006004360043600436004360043
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042450033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
240204600424500698252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111512901606003902400001006004360043600436004360043
24020460042449033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042449033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043
24020460042451033252401061002400061002400205002280103060023600426004229977062999324012020024003220072009660042600421180201100991001008000010000111511601606003902400001006004360043600436004360043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7505

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240024600424491000000120043025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061666600390240000106004360043600436004360043
240024600424490000000000233025240010102400001024000050227997106002360042600422999603381182400102024000020720000602006004211800211091010800001000000300502071675600390240000106004360043600436004360043
24002460042450000000000085025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061676600390240000106004360043600436004360043
240024600424500000000000705025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061665600390240000106004360043600436004360043
2400246004244900000000007500252400101024000010240000502279971060023600426004229996033002224001020240000207200006004260042118002110910108000010000101200502061657600390240000106004360043600436009460043
24002460042450000000000043025240010102400001024000050228388806002360042600422999603449612400102024000020720000600426004211800211091010800001000000300502071665600390240000106004360043600436004360043
24002460042450000000000061025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502061666600390240000106004360043600436004360043
24002460042450000000000043025240010102400001024000050228287306002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502051656600390240000106004360043600436004360043
24002460042450000000000043025240010102400001024000050227997106002360042600422999603300222400102024000020720000600426004211800211091010800001000000300502061666600390240000106004360043600436919060043
240024600424490000000210043025240010102400001024000050227997116002360042600422999603300222400102024000020720000600426004211800211091010800001000000000502051666600390240000106004360043600436004360043