Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA512H2

Test 1: uops

Code:

  sha512h2 q0, q1, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100420381501649372510001000100068984020192038203817883189610001000300020382038111001100000075616651970100020392039203920392039
100420381501619372510001000100068984120192038203817883189610001000300020382038111001100000075616661970100020392039203920392039
100420381501619372510001000100068984120192038203817883189610001000300020382038111001100000075616661970100020392039203920392039
100420381501619372510001000100068984120192038203817883189610001000300020382038111001100000075616551970100020392039203920392039
100420381501619372510001000100068984120192038203817883189610001000300020382038111001100000075716561970100020392039203920392039
100420381601619372510001000100068984020192038203817883189610001000300020382038111001100000075616661970100020392039203920392039
100420381501619372510001000100068984020192038203817883189610001000300020382038111001100000075516771970100020392039203920392039
100420381601619372510001000100068984120192038203817883189610001000300020382038111001100000075616661970100020392039203920392039
100420381502619372510001000100068984120192038203817883189610001000300020382038111001100001075516661970100020392039203920392039
100420381501619372510001000100068984120192038203817883189610001000300020382038111001100000075516551970100020392039203920392039

Test 2: Latency 1->1

Code:

  sha512h2 q0, q1, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020420038150000619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
10204200381501330619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
1020420038150000619937251010010010000100100005007079841200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
10204200381500390619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
1020420038150000619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
1020420038150060619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071014163319969100001002003920039200392003920039
1020420038150000619937251010010010000100100005007079841200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
1020420038150000619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
1020420038150000619937251010010010000100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039
1020420038150000619937251010010010012100100005007079840200192003820038186383187461010020010000200300002003820038111020110099100100100001000071013163319969100001002003920039200392003920039

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0038

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200381501561993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640316331996910000102003920039200392003920039
10024200381503061991825100221010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640216221996910000102003920039200392003920039
1002420038150061993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640216331996910000102003920039200392003920039
1002420038150061993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640316331996910000102003920039200392003920039
1002420038150061993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640216221996910000102003920039200392003920039
1002420038150061993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100040640216331996910000102003920039200392003920039
10024200381502761993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640316331996910000102003920039200392003920039
1002420038150061993725100101010000101000050707984120019200382006118660318768100102010000203000020038200381110021109101010000100000640316331996910000102003920039200392003920039
1002420038150361993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640216221996910000102003920039200392003920039
1002420038150061993725100101010000101000050707984120019200382003818660318768100102010000203000020038200381110021109101010000100000640216321996910000102003920039200392003920039

Test 3: Latency 1->2

Code:

  sha512h2 q0, q0, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722461199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722561199012510100100100071001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722461199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722461199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722561199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722461199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722461199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300843003711102011009910010010000100107102162229920100001003003830038300383003830038
102043003722561199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722561199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038
102043003722561199012510100100100001001000050010675000300183003730037285890328745101002001000020030000300373003711102011009910010010000100007102162229920100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000306402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372240000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000310006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038
10024300372250000061199012510010101000010100005010675003001803003730037286113287671001020100002030000300373003711100211091010100001000000006402162229920010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  sha512h2 q0, q1, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250611990125101001001000010010000500106750010300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710211611299200100001003003830038300383003830038
10204300372240611990125101001001000010010000500106750010300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710211611299200100001003003830038300383003830038
10204300372250611990125101001001000010010000500106750002300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710211611299200100001003003830038300383003830038
10204300372250611990125101001001000010010000500106750010300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710011611299200100001003008630038300383003830038
102043003722506121990125101001001000010010000500106750010300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710011611299200100001003003830038300383003830038
10204300372259611990125101001001000010010000500106750010300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710011611299200100001003003830038300383003830038
10204300372250611990125101001001000010010000500106750000300183003730037285893287451010020010000200300003003730037111020110099100100100001000300710211611299200100001003003830038300383003830038
10204300372250611990125101001001000010010000500106750010300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710211611299200100001003003830038300383003830038
10204300372246611990125101001001000010010000500106750002300183003730037285893287451010020010000200300003003730037111020110099100100100001000000710011611299200100001003003830038300383003830085
10204300372250611990125101001001000010010000500106750000300183003730037285893287451010020010000200300003003730037111020110099100100100001000001710211611299200100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250021291990125100101010000101000050106750030018300373003728611328767100102010000203000030037300371110021109101010000100064381610102992010000103003830038300383003830038
100243003722500223419901251001010100001010000501067500300183003730037286113287671001020100002030000300373003711100211091010100001000643101610102992010000103003830038300383003830038
10024300372250021041990125100101010000101000050106750030018300373003728611328767100102010000203000030037300371110021109101010000100064310166102992010000103003830038300383003830038
1002430037225002129419901251001010100001010000501067500300183003730037286113287671001020100002030000300373003711100211091010100001000643111611112992010000103003830038300383003830038
10024300372250021921990125100101010000101000050106750030018300373003728611328767100102010000203000030037300371110021109101010000100064310161052992010000103003830038300863003830038
10024300372250028519901251001010100001010000501067500300183003730037286113287671001020100002030000300373003711100211091010100001000643101610102992010000103003830038300383003830038
100243003722500219219901251001010100001010000501067500300183003730037286113287671001020100002030000300373003711100211091010100001000643101610102992010000103003830038300383003830038
100243003722500221319901251001010100001010000501067500300183003730037286113287671001020100002030000300373003711100211091010100001000643101610102992010000103003830038300383003830038
1002430037225300210041990125100101010000101000050106750030018300373003728611328767100102010000203000030037300371110021109101010000100064310161152992010000103003830038300383003830038
100243003722500215019901251001010100001010000501067500300183003730037286483287671001020100002030000300373003711100211091010100001000643816882992010000103008630038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha512h2 q0, q8, v9.2d
  movi v1.16b, 0
  sha512h2 q1, q8, v9.2d
  movi v2.16b, 0
  sha512h2 q2, q8, v9.2d
  movi v3.16b, 0
  sha512h2 q3, q8, v9.2d
  movi v4.16b, 0
  sha512h2 q4, q8, v9.2d
  movi v5.16b, 0
  sha512h2 q5, q8, v9.2d
  movi v6.16b, 0
  sha512h2 q6, q8, v9.2d
  movi v7.16b, 0
  sha512h2 q7, q8, v9.2d
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204160038119800090617993725801001008000010080000500567798401600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100002000001011091684159981160000100160039160039160039160039160039
1602041600381198000002447993725801001008000010080000500567798411600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000001011091694159981160000100160039160039160039160039160039
1602041600381199000600617993725801001008000010080000500567798411600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000001011091694159981160000100160039160039160039160039160039
160204160038119800000617993725801001008000010080000500567798401600193160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000001011091699159981160000100160039160039160039160039160039
1602041600381199000906179937258010010080000100800005005677984016001901600381600381398883139996801002008000020024000016003816003811160201100991001001600001000000001680101109161010159981160000100160039160039160039160039160039
1602041600381198000007267993725801001008000010080000500567798401600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000001011091688159981160000100160039160039160039160039160039
16020416003811990000024147993725801001008000010080000505567798411600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000001011091699159981160000100160039160039160039160039160039
1602041600381199000002337993725801001008000010080000500567798411600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000001011081699159981160000100160039160039160039160039160039
160204160038119900000617993725801001008000010080000500567798411600190160038160038139888313999680100200800002002400001600381600381116020110099100100160000100000000180010110416910159981160000100160039160039160039160039160039
1602041600381198000006179937258010010080000100800005005677984016007901600381600381398883139996801002008000020024000016003816003811160201100991001001600001000000100010110101699159981160000100160039160039160039160039160039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600241600381199001110005057993725800101080000108000050567798411516005601600381600381399201214001880276208022620240681160087160278411600211091010160000102421903200100228112316211164415998311513816000010160039160039160039160086160039
160024160038128500000000737993725800101080000108000050567798401016001901600381600381399103140018800102080000202400001600381600381116002110910101600001000000000100223111416211371215998301513816000010160039160039160039160039160039
1600241600381199000000006779937258001010800001080000505677984110160019016003816003813991031400188001020800002024000016003816003811160021109101016000010000002001002231115162114018159983015132416000010160039160039160039160039160039
1600241600381198000000007657993725800101080000108000050567798411016001901600381600381399103140018800102080000202400001600381600381116002110910101600001000000000100223111616211341615998301513816000010160039160039160039160039160039
160024160038119900000009677993725800101080000108000050567798411016001901600381600381399103140018800102080000202400001600381600381116002110910101600001000000000100223111516211341415998301513816000010160039160039160039160039160039
160024160038119800000000677993725800101080012108000050567798411016001901600641600381399103140018800102080000202400001600981600381116002110910101600001000000000100503221316422131515998303013816000010160039160039160039160039160039
1600241600381198000000006779937258001010800121080000505677984110160019016003816003813991031400188001020800002024000016003816009811160021109101016000010000000001002231113172113711159983015261616000010160039160039160039160039160039
1600241600381199000000007379937258001010800001080000505677984010160019016003816003813991031400188001020800002024000016003816003811160021109101016000010000000001002262214164224315159983030261616000010160039160039160039160039160039
160024160038119900000000677993725800101080000108000050567798411016001901600381600381399103140018800102080000202400001600381600381116002110910101600001000000000100223111516211361415998301513816000010160039160039160039160039160039
160024160038119800000000100799372580010108001210800445056779841101600190160038160038139910371400188001020800002024000016003816003811160021109101016000010000000001002231115172321131315998301513816000010160039160039160039160039160039

Test 6: throughput

Count: 16

Code:

  sha512h2 q0, q16, v17.2d
  sha512h2 q1, q16, v17.2d
  sha512h2 q2, q16, v17.2d
  sha512h2 q3, q16, v17.2d
  sha512h2 q4, q16, v17.2d
  sha512h2 q5, q16, v17.2d
  sha512h2 q6, q16, v17.2d
  sha512h2 q7, q16, v17.2d
  sha512h2 q8, q16, v17.2d
  sha512h2 q9, q16, v17.2d
  sha512h2 q10, q16, v17.2d
  sha512h2 q11, q16, v17.2d
  sha512h2 q12, q16, v17.2d
  sha512h2 q13, q16, v17.2d
  sha512h2 q14, q16, v17.2d
  sha512h2 q15, q16, v17.2d
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204320038239700000611599372516010010016000010016000050011357984032001903200383200382998880329999616010020016000020048021632003832003811160201100991001001600001000000000000101102491131996902160000100320039320039320039320039320039
1602043200382397000708095315993725160100100160000100160000500113579840320019032018432003829988802129999616010020016000020048000032008732003811160201100991001001600001000000000000101101161132011630160000100320039320039320039320039320039
160204320038239700000611599372516010010016000010016000050011357984032001933200383200382998880329999616010020016000020048000032003832003811160201100991001001600001000000000000101101161131996900160000100320039320039320039320039320085
16020432003823970000061159937251601001001600001001600005001135798403200190320038320086299888032999961601002001600722004800003200383200382116020110099100100160000100000011900000101102161131996900160000100320039320039320039320039320039
160204320038243100000611599372516010011916000010016000050011357848032001903200383200382998880329999616010020016000020048000032003832003811160201100991001001600001000000000000101102161131996900160000100320039320039320039320039320039
1602043200382397000007261599372516010010016000010016000050011357984132001903200383200382998880329999616014720016000020048000032003832003811160201100991001001600001000000000000101101161131996900160000100320039320039320039320039320039
160204320038239700000611599372516010010016000010016000050011357984032001903200383200382998880329999616010020016000020048000032003832003811160201100991001001600001000000000000101102491131996900160000100320039320039320039320039320039
16020532003823970011507261599372516010010016000010016000050011357984132001903200383200382998880329999616010020016000020048000032003832003811160201100991001001600001000000600000101101161131996900160000100320039320039320039320039320039
16020432003823970002790611599372516010010016000010016000050011357984032001903200383200382998880329999616010020016000020048000032003832003811160201100991001001600001000000000000101102161131996900160000100320039320039320039320039320039
1602043200382397000180611599372516010010016000010016000050011357984132001903200383200382998880329999616010020016000020048000032003832003811160201100991001001600001000000000000101102160131996900160000100320039320039320039320039320039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600243200382397007315993725160010101600001016000050113579840153200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100000000010022135161611154319969021191016000010320039320039320039320039320039
16002432003823980067159937251600101016000010160000501135798411103200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100010000010022134151611137319969021191016000010320039320039320039320039320039
16002432003823980088159937251600101016000010160000501135798411103200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100000000010022135131611157319969021192216000010320039320039320039320039320039
160024320084239700671599372516001010160000101600005011357984111032001903200383200382999100330001816001020160000204800003200383200381116002110910101600001000000100100241662912022137319969042381016000010320039320039320523320136320426
16002432003823981067159937251600101016000010160046501135798411103200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100000000010022135261611147319969021191016000010320039320039320039320039320039
16002432003823970967159937251600101216000012160000601135798411103200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100000000010052136171611164319969021191016000010320039320039320039320039320039
160024320038239700738159937251600101016000010160000501135798411103200190320038320038299910033000181600102016000020480000320038320038311600211091010160000100000000010024167251622254319969042382016000010320039320039320039320039320039
16002432003823970067159937251600101016000010160000501135798411103200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100000000010022136151611165319969021191016000010320039320039320039320039320039
160024320038239700732159937251600101016000010160000501135798411103200190320038320038299910033000181600102016000020480000320038320038111600211091010160000100000000010022136131611158319969021191016000010320039320039320039320039320039
16002432003823970067159937251600101016000010160000501135798411103200190320038320086299910033000181600102016000020480000320086320038111600211091010160000100020010110042132661711165320043021221016000010320039320039320039320039320039