Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UDOT (by element, 8B)

Test 1: uops

Code:

  udot v0.2s, v1.8b, v2.4b[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10043037230082254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037230061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037240061254825100010001000398313030183037303724153289510001000300030373037111001100000373116112630100030383038303830383038
10043037230061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037230061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037240061254825100010001000398313030183037303724153289510001000300030373037111001100000373116112630100030383038303830383038
10043037230061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037230061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037240061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037230061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  udot v0.2s, v1.8b, v2.4b[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003723300213295482510100100100001001000050042773133001830037300372827272874010100200100082003002430037300371110201100991001001000010000011171802161129646100001003003830038300383003830038
102043003723300217295482510100100100001001000050042773133001830037300372827272874010100200100082003002430037300371110201100991001001000010000011171701161129647100001003003830038300383003830038
102043003723200149295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000000071013163329634100001003003830038300383008630038
1020430037233240103295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010013000071013163329634100001003003830038300383003830038
10204300372330061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000000071013163329634100001003003830038300383003830038
102043003723300128295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000000071013163329634100001003003830038300383003830038
102043003723300103295482510100100100001001000050042773133001830037300372826532876210100200100002003000030037300371110201100991001001000010000000071013163329634100001003003830038300383003830038
102043003723300112295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010003000071014163329634100001003003830038300383003830038
102043003723301188295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000000071013163329634100001003003830038300383003830038
102043003723201149295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000000071013163329634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 0f | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003723300061295482510010101000001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000000640216222963010000103003830038300383003830038
100243003723300061295482510010101000001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000640216222963010000103003830038300383003830038
100243003723300075295482510010101000001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000000640216222963010000103003830038300383003830038
1002430037232012061295482510010101000001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000000640216222963010000103003830038300383003830038
1002430037241000726295482510010101000001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000000640216322963010000103003830038300383003830038
100243003723300061295482510010101000001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000300640216222963010000103003830038300383003830038
100243003723300089295482510010101000001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000102000000640216222963010000103003830038300383003830038
100243003723300061295482510010101000001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000300640216222963010000103003830038300383003830038
100243003723300061295482510010101000001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000640216222963010000103003830038300383003830038
100243003723300089295482510010101000001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000640216222963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  udot v0.2s, v0.8b, v1.4b[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003723300006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071001161129634100001003003830038300383003830038
102043003723200008529548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071001161129634100001003003830038300383003830038
102043003723200006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071001161129634100001003003830038300383003830038
1020430037233000017029548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071001161129634100001003003830038300383003830038
1020430037233000084529548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000500071001161129634100001003003830038300383003830038
1020430037232013025429548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071001161129634100001003003830038300383003830038
1020430037233000012429548251010010010000103100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071001161129634100001003003830038300383003830038
1020430037233000010329548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000003071001161129634100001003003830038300383003830038
1020430037232000018729548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000002823071001161129634100001003007830038300383003830038
102043003723300012103295486510143100100001001000055742854551301983008530179282651728817105672101000022031989302773027851102011009910010010000100020108290071003321129634100001003003830136300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 0e | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037232000088329548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006403162229630010000103003830038300383003830038
1002430037233000010329548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038
100243003723300006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038
100243003723300006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038
1002430037233000012829548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038
1002430037233000016829548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010036402162229630010000103003830038300383003830038
100243003723300008429548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109102101000010006402162229630010000103003830038300383003830038
100243003723300006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038
1002430037233000014729548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038
1002430037241000012429548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109100101000010006402162229630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  udot v0.2s, v1.8b, v0.4b[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003723300021900612954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611296340100001003003830038300383003830038
102043003723300047400612954825101001001000010010000500427731300300183003730037282653288181010020010000200300003003730037111020110099100100100001000620000710011611296340100001003003830038300383003830038
102043003723300047400612954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611296340100001003003830038300383003830038
10204300372330004500612954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611296340100001003003830038300383003830038
102043003723300046500612954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611297020100001003003830038300383003830038
102043003723200039600612954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611296340100001003003830038300383003830038
1020430037233000417007262954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611296340100001003003830038300383003830038
1020430037232000000612954825101001001000010010000500427731300300183003730037282653287451010020010013200300243003730037111020110099100100100001000000111717001600296470100001003003830038300383003830038
10204300372330009900612954825101001001000010010000500427731300300183003730037282726287401010020010008200300243003730037111020110099100100100001000000111717001600296470100001003003830038300383003830038
102043003723300012000612954825101001001000010010000500427731300300183003730037282653287451010020010000200300003003730037111020110099100100100001000000000710011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003723300000000612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000006403162229630010000103003830038300383003830038
100243003723300000000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000000306402162229630010000103003830038300383003830038
100243003723200000000612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
100243003723300000000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
100243003723200000000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
100243003723300000000612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
100243003723300000000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
100243003723300000000612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
1002430037233000000001032954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
100243003723300000000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000000006402162229630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  udot v0.2s, v8.8b, v9.4b[1]
  movi v1.16b, 0
  udot v1.2s, v8.8b, v9.4b[1]
  movi v2.16b, 0
  udot v2.2s, v8.8b, v9.4b[1]
  movi v3.16b, 0
  udot v3.2s, v8.8b, v9.4b[1]
  movi v4.16b, 0
  udot v4.2s, v8.8b, v9.4b[1]
  movi v5.16b, 0
  udot v5.2s, v8.8b, v9.4b[1]
  movi v6.16b, 0
  udot v6.2s, v8.8b, v9.4b[1]
  movi v7.16b, 0
  udot v7.2s, v8.8b, v9.4b[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 18 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420092155202021045925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010001001012614161314200611600001002006520065200652006520065
1602042006415630303046525801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012814161215200611600001002006520065200652006520065
1602042006415530300046525801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000031012815161214200611600001002006520065200652006520065
1602042006415630300046525801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012511161513200611600001002006520065200652006520065
16020420064155505000433825801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012714161414200611600001002006520065200652006520065
1602042006415630300048325801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012713161412200611600001002006520065200652006520065
1602042006415520300046525801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012714161313200611600001002006520065200652006520065
1602042006415630302403592580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000000101266161415200611600001002006520065200652006520065
16020420064155203060376025801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012411161012200611600001002006520065200652006520065
16020420064155303039045925801001008000010080104500640000020045200642006432280100200800002002400002006420064111602011009910010016000010000001012716161415200611600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c3 | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024201421550222706329800121280000128000062640000015200412006020261322800122080000202400002006120060111600211091010160000100000010040113221344121413200482202160000102006120128200722006320061
1600242005115501200512780012128000012800006264000011520032200512024932280012208000020240000200512006011160021109101016000010000001004211411625221814200582402160000102005220061200522006120052
1600242005115501225505130800121280000128000062640000115200412006020226114480012208000020240000200512005111160021109101016000010000001003785214254221315200572401160000102006120061200612005220061
1600242006015500000632980012128000012800006264000001520041200512020932280012208000020240000200602005111160021109101016000010003001004111628344221616200482403160000102005220052200612005220061
16002420060156011005127800121280000128000062640000115200322005120188322800122080000202400002005120051111600211091010160000100000010037166116342211418200582401160000102005220052200522005220052
1600242005115501151043827800121280000128000062640000111020032200512020232280012208000020240000200512005111160021109101016000010000001003813611425211916200482201160000102005220052200522005220052
1600242005115501215197278001212800001280000626400001110200322005120192322800122080000202400002005120051111600211091010160000100000010036137113342111414200482201160000102005220052200522005220052
16002420051156001150452780012128000012800006264000011102003220051201892422801172080000202400002005120051111600211091010160000100000010036137216252111212200482201160000102005220143200522005220052
160024200511550106051318001212800001280000626400001110200322005120164322800122080000202400002006120060111600211091010160000100000010038136114252211314200572201160000102005220052200522005220053
16002420051156011348085278001212800001280000626400001110200322006020163322800122080000202400002005120051111600211091010160000100000010037137113252111313200482201160000102006120061200612005220052

Test 6: throughput

Count: 16

Code:

  udot v0.2s, v16.8b, v17.4b[1]
  udot v1.2s, v16.8b, v17.4b[1]
  udot v2.2s, v16.8b, v17.4b[1]
  udot v3.2s, v16.8b, v17.4b[1]
  udot v4.2s, v16.8b, v17.4b[1]
  udot v5.2s, v16.8b, v17.4b[1]
  udot v6.2s, v16.8b, v17.4b[1]
  udot v7.2s, v16.8b, v17.4b[1]
  udot v8.2s, v16.8b, v17.4b[1]
  udot v9.2s, v16.8b, v17.4b[1]
  udot v10.2s, v16.8b, v17.4b[1]
  udot v11.2s, v16.8b, v17.4b[1]
  udot v12.2s, v16.8b, v17.4b[1]
  udot v13.2s, v16.8b, v17.4b[1]
  udot v14.2s, v16.8b, v17.4b[1]
  udot v15.2s, v16.8b, v17.4b[1]
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 19 | 1e | 37 | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204400723010017412516010010016001810016000050012800000400200400394003919973319997160100200160000200480000400484004821160201100991001001600001000010110011611400461600001004004940049400494004040040
16020440048300000412516010010016000010016000050024388650400200400394004919973319997160100200160000200480000400394004811160201100991001001600001000010110011611400451600001004004040040400404005040049
16020440048300000412516010110016000010016000050012800001400300400394003919973320006160100200160000200480000400394004811160201100991001001600001000010110011611400361600001004004040049400404004040041
16020440049299000502516011710016001710016000050023989991400290400394003919973319997160100200160000200480000400394003911160201100991001001600001000010110011611400361600001004005040040400404004040040
16020440039300000412516010010016000010016000050012800000400300400394003919973319997160100200160000200480000400394003911160201100991001001600001000010110011611400461600001004004040041400494004040040
160204400483000017462516010010016001710016000050012800001400200400394003919973319997160100200160000200480000400394003911160201100991001001600001000010110011611400361600001004005040040400404004140041
16020440039300000412516010010016000010016000050023989990400200400394003919973320006160100200160000200480000400394003911160201100991001001600001000310110011611400451600001004004040040400494004040049
16020440040300000412516010010016000010016000050012800000400200400394003919973319997160100200160000200480000400394004811160201100991001001600001000010110011611400451600001004004040040400404004940040
160204400393000004125160100100160017100160000500128000004002004004840048199733200061601002001600002004800004003940039111602011009910010016000010003610110011611400451600001004004140049400404004040049
16020440039300001412516011710016000010016000050012800000400200400394003919973320006160100200160000200480000400394004811160201100991001001600001000010110011711400451600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024400483100000000000520251600101016000010160000501280000115400204004840039199963200281600102016000020480000400394003911160021109101016000010000000001002411521016422244003604012160000104004040049400494004940040
1600244003931000000000005302516002710160000101600005012800000154002040039400391999632001916001020160000204800004003940039111600211091010160000100000000010022852216222644003602018160000104004940040400404004040049
160024400393100000000017093502516001010160001101600005023989990154003040048400391999632001916001020160000204800004004940039111600211091010160000100000000010022842216412464003602012160000104004040040400404004940049
160024400393100000000000460251600271016000010160000501280000015400294004940039199963200191600102016000020480000400394003911160021109101016000010000000001002285141621124400360406160000104004040040400404004040106
1600244003931000000000170460251600271016001810160000502398999115400204004940039199963200191600102016000020480000400394003911160021109101016000010000000001002284281621124400360206160000104004040040400404004140040
16002440039311000000000060540251600101016000010160000502398999115400204004940039199963200201600102016000020480000400394003911160021109101016000010000000001002285241622124400360206160000104004040050400404004040040
16002440039310000000000046025160010101600171016000050128000011540020400484004819996320019160010201600002048000040039400391116002110910101600001000000000100241141416211244003602012160000104004040040400404004040040
160024400393100000009000520251600101016000010160000501319997115400204004840039199963200191600102016000020480000400394003911160022109101016000010000000001002284121621142400360206160000104004140040400494004940049
160024400393100000000000460251600101016000010160000501280000115400204003940039199963200191601652016000020480000400394003911160021109101016000010000000001002284241641197400360216160000104004040040400404004040040
160024400393110000000000460251600271016000010160000501280000115402434004840048199963200191600102016000020480000400394003911160021109101016000010000000001002284141621165400360206160000104004040040400404004040040