Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1P

Test 1: uops

Code:

  sha1p q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004403830001032865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840871110011000000073216223873100040394039403940394039
100440383100612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383000612865251000100010001401641401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383100612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039
1004403831045612865251000100010001398951401940384038369033896100010003000403840381110011000000073216223873100040394039403940394039

Test 2: Latency 1->1

Code:

  sha1p q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | branch cond mispred nonspec (c5) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204400383000029829865251010010110000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071004162239871100001004003940039400394003940039
10204400382990019329865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071002162239871100001004003940039400394003940039
10204400382990019129865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004003940039400394003940039
10204400383000059929865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004003940039400394003940039
10204400383000019129865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004003940039400394003940039
10204400382990025429865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004003940039400394003940039
10204400383000023529865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004003940039400394003940039
10204400383000023329865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071013162239871100001004003940039400394003940039
10204400383000021229865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004008840137400394003940039
10204400382990092129865251010010010000100100005001426895400194003840038385403387461010020010000200300004003840038111020110099100100100001000071012162239871100001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100244003830005122986525100101010000101000050142689504001940038400383856233876810010201000020300004008640038111002110910101000010000640216223991010000104003940039400394003940039
100244003830001452986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039
100244003830002332986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039
1002440038300181242986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039
100244003830002352986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039
10024400383000612986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039
100244003830002512986525100101010000101000050142689504001940038400383856233876810053201000020300004003840038111002110910101000010000640316223987210000104003940039400394003940039
10024400383000612986525100101010000101000050142689514001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039
10024400383000612986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216233987210000104003940039400394003940039
10024400383000612986525100101010000101000050142689504001940038400383856233876810010201000020300004003840038111002110910101000010000640216223987210000104003940039400394003940039

Test 3: Latency 1->2

Code:

  sha1p q0, s0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204500373750006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011611498240100001005003850038500385003850038
10204500373750006139829251010010010000100100435001786259150018500375003748491348745101002001018620030000500375003711102011009910010010000100000371011611498220100001005003850038500385008550038
10204500373750006139829251010010010000100100005001786259050018500375003748491348745101002001000020030000500375003711102011009910010010000100000071431723498240100001005003850038500385003850038
10204500373750006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011611498220100001005003850038500385003850038
10204500373750006139829251010010010000100100005001786259050018500375003748491348745101002001000020030000500375003711102011009910010010000100000071411623498240100001005003850038500385003850038
10204500373750016139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011711498220100001005003850038500385003850038
10204500373750006139829251010010010000100100005001786259050018500375003748491348745101002001005420030000500375003711102011009910010010000100000071011611498220100001005003850038500385003850038
102045003737500061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000712217124982425100001005003850038500385003850038
102045003737500025139829251010010010000100100005001786259050018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011611498220100001005003850038500385003850038
10204500373750006139829251012512510000100100006261786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071411611498220100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024500373750103398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010006404163449824010000105003850038500385003850038
10024500373750102398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010006403163349824010000105003850038500385003850038
10024500373750441398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010006403163349824010000105003850038500385003850038
1002450037375061398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010106403163349824010000105003850038500385003850038
1002450037375061398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010006403163349824010000105003850038500385003850038
1002450037375061398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010006403163349824010000105003850038500385003850038
1002450037375061398292510010101000013100005017865245001850037500374851334876710050201000020300005003750037111002110910101000010106403163149824010000105003850038500385003850038
1002450037375061398292510010101000010100005017862595001850084500374851334876710010201000020300005003750037111002110910101000010036403163349824110000105003850038500385003850038
1002450037374961398292510010101000010100005017862595001850037500374851334876710010201000020300005007050037111002110910101000010006403163349824010000105003850038500385003850038
1002450037375061398292510010101000010100005017862595001850037500374851334876710010201000020300005003750037111002110910101000010006403163349824010000105003850038500385003850038

Test 4: Latency 1->3

Code:

  sha1p q0, s1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 09 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020450037375000061398292510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000000071021611498220100001005003850038500385003850038
1020450037374000061398292510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000000071011611498220100001005003850038500385003850038
1020450037375000061398142510100100100101001000059117862591500530500845008548491748777101002001000020030000500375003711102011009910010010000100000100071011611498220100001005003850038500385003850038
10204500373740000103398292510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000000071011611498220100001005003850038500385003850038
1020450037374020108550398142510100110100051001003950017862591500180500845003748491348745101002001000020030000500375003711102011009910010010000100000130071011611498220100001005003850038500385003850038
10204500373750000103398292510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000132071011611498220100001005003850038500385003850038
10204500373750045082398292510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000000071011611498220100001005003850038500385003850038
1020450037375000061398292510100100100001001000050017862591500650500375003748491348745101002001000020030000500375003711102011009910010010000100000000071011611498220100001005003850038500385003850038
1020450037374000061398152510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000000071011611498220100001005003850038500385003850038
10204500373750000726398292510100100100001001000050017862591500180500375003748491348745101002001000020030000500375003711102011009910010010000100000000071011611498220100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100245008437500065406139829251001010100001010000501786259050018500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385003850038
1002450037376000006139829251001010100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385003850038
1002450037375000006139829251001010100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385003850038
1002450037374000006139829251001010100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385003850038
10024500373740000010739829251001010100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385003850038
10024500373740000099339829251001710100051010000601786524150018500375003748513034876710010201000020300005003750037111002110910101000010221733064021622498240010000105003850038500385003850038
1002450037375000606139829251001010100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385008550038
10024500373761201206139829251002110100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010003064021622498240010000105003850038500385003850038
10024500373750000010339829251001010100001010000501786259150018500375003748513034876710010201000020300005003750037111002110910101000010000064021622499630010000105003850038500385003850228
1002450037375000006139829251001010100001010000501786259050053500375003748513034876710010201000020300005003750037111002110910101000010000064021622498240010000105003850038500385003850038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1p q0, s8, v9.4s
  movi v1.16b, 0
  sha1p q1, s8, v9.4s
  movi v2.16b, 0
  sha1p q2, s8, v9.4s
  movi v3.16b, 0
  sha1p q3, s8, v9.4s
  movi v4.16b, 0
  sha1p q4, s8, v9.4s
  movi v5.16b, 0
  sha1p q5, s8, v9.4s
  movi v6.16b, 0
  sha1p q6, s8, v9.4s
  movi v7.16b, 0
  sha1p q7, s8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020432003823970000214207262398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113198860160000100320039320039320039320039320039
1602043200382397000000007262398202580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113198860160000100320039320039320039320039320280
160204320038239801000000258223986525880100100800001038000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113198860160000100320039320039320039320039320039
1602043200382398000000001152398652580100100800061008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113198860160000100320039320039320039320039320039
160204320038239800000036069832398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113199640160000100320039320039320039320039320039
1602043200382398000000003442398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113198860160000100320039320039320039320039320039
1602043200382397000000007262398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110116113198860160000100320039320039320039320039320039
1602043200382397000000006123986525801001008000010080000500114368951320019032003832003829979029299996801002008000020024000032003832003811160201100991001001600001000000000010110117113198860160000100320039320039320039320039320039
1602043200382397000000007262398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000010110216113199590160000100320097320039320039320039320039
16020432003823970000007590612398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160202100991001001600001000000000010110116113198860160000100320039320039320039320039320039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0005

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002432003823970000000067239865548001010800001080000501143689511532001932003832003829981233000188001020800002024000032003832003811160021109101016000010000000010022821121612202413319887016141316000010320039320039320039320039320039
1600243200382397000000009622398652580010108000010800005011437166015320019320038320038299812330001880010208000020240000320038320038111600211091010160000100000000100241132251622182126319887032142616000010320039320039320039320039320039
1600243200382398000000001002398652580010108000010800005011436895015320019320038320038299812330001880010208000020240000320038320038111600211091010160000100000000100241132251622152629319887032282616000010320039320039320039320039320039
160024320038239800000000738239865258001010800001080000501143689501532001932003832003829981233000188001020800002024000032003832003811160021109101016000010003000010022821122411141227319922032282616000010320039320039320039320039320039
160024320038239700000570067239865258001010800001080000501143689501532001932003832003829981233000188001020800002024000032003832003811160021109101016000010000000010022822131611152511319887032281316000010320039320039320039320039320039
160024320038239700000000765239865258001010800001080000501143689511532001932003832003829981233000188001020800002024000032003832003811160021109101016000010000000010022825131612151127319887016141316000010320039320039320039320039320039
160024320038239800000000962398502580010108000010800005011436895015320019320038320038299812283000188001020800002024000032003832003811160021109101016000010000000010024832101622182528319887032282616000010320039320039320039320232320039
16002432003823970000000017512398659980010108000010800005011436895015320019320038320038299812330001880010208000020240000320038320038111600211091010160000100000000100241142251622162513319887032282616000010320039320039320039320039320088
160024320038239700000300738239865258001010800001080000501143689501532001932003832003829981233000188001020800002024000032003832003811160021109101016000010200000010096842251622152613319887016282616000010320039320039320039320235320039
16002432003823970010412352014062398652580010108000010801205011436895015320019320038320038299812183000188001020800002024000032003832003811160021109101016000010004004010093832124922161927319887032282616000010320039320039320039320039320039

Test 6: throughput

Count: 16

Code:

  sha1p q0, s16, v17.4s
  sha1p q1, s16, v17.4s
  sha1p q2, s16, v17.4s
  sha1p q3, s16, v17.4s
  sha1p q4, s16, v17.4s
  sha1p q5, s16, v17.4s
  sha1p q6, s16, v17.4s
  sha1p q7, s16, v17.4s
  sha1p q8, s16, v17.4s
  sha1p q9, s16, v17.4s
  sha1p q10, s16, v17.4s
  sha1p q11, s16, v17.4s
  sha1p q12, s16, v17.4s
  sha1p q13, s16, v17.4s
  sha1p q14, s16, v17.4s
  sha1p q15, s16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0002

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204640075479400000066300536479865251601001001600001001600005002287689506400673640038640038619790036199961601482001600002004800006400386400381116020110099100100160000100000020010110117116398730160000100640039640087640039640076640039
1602046400384795000000000726479865251601001001600001001600005002287689516400190640038640038619790036199961601002001600002004802166400386400381116020110099100100160000100000000010110116116398730160000100640039640039640039640039640039
16020464003847940000106006947479865251601001001600001001600005002287689516400190640038640038619790036199961601002001600002004800006400386400381116020110099100100160000100100001010110116116398730160000100640039640039640039640039640039
1602046400384795000000930072647986525160100100160000100160000500228768950640019064003864003861979003619996160100200160000200480000640038640038111602011009910010016000010000237000010110116116398730160000100640039640039640039640039640666
1602046401854800100113394000726479847251601001001600001001600005002287689506400190640038640038619790036199961601002001600002004800006400386400381116020110099100100160000100000000010110116116398730160000100640039640039640039640039640039
1602046400384794000000000726479865251601001001600001001600005002287689506400190640038640038619790036199961601002001600002004800006400386400861116020110099100100160000100000000010110116116398730160000100640039640039640039640039640039
1602056400384794000000000726479865251601001001600001001600005002287689506400190640038640038619790036199961601822001600002004800006400386400381116020110099100100160000100009000010110116116398730160000100640039640039640039640077640039
160204640038479400000000061479865251601001001600001001600005002287716616400190640038640038619790036199961601002001600002004800006400386400381116020210099100100160000100000000010110016116398730160000100640039640039640039640039640039
1602046400384794000000000747479865251601001001600001001600005002287689506400190640038640038619790036199961601002001600722004800006400386400382116020110099100100160000100000000010110117116398730160000100640039640039640039640039640039
1602046400384794000000000726479865251601001001600001001600005002287689516400190640038640038619790036200321601002001600002004800006400386400381116020110099100100160000100000000010110116116398730160000100640039640039640039640039640039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0002

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600246400384794000000001337479865251600161016000010160000502287689511064001906400386400386198120362001816001020160000204800006400386400862116002110910101600001000000010022331042161413722639873016161316000010640039640039640039640039640039
1600246400384794101000002061479865251600101016000010160000502287689511064001906400386400386198120362001816001020160068204802046400386400381116002110910101600001000000110022331017161112534639873016161316000010640039640039640039640039640039
160024640038479410100000781479865251600101016000010160000502287689511064001906400386400386198120362001816001020160000204800006400386400381116002110910101600001008000010023331125181111728639873016161316000010640039640039640039640231640087
160024640038479410100000732479865251600101016000010160000502287689511064001936400386400386198123362001816001020160000204800006400386400381116002110910101600001000000010023331126161112624639873016162616000010640039640039640039640039640039
1600246400384795101000120116479865251600101016000010160000502287689511064001906400386400386198120362001816001020160000204800006400386400381116002110910101600001000000010022331025161111625639873016161316000010640039640039640079640039640039
160025640038479400000000732479865251600101016000010160000502287716511064001906400386400386198120362001816001020160000204800006400386400381116002210910101600001002000010023641132161112228639873232162616000010640039640232640039640039640232
160024640038479410104000781479865251600351016000010160000502287689511064016306400386402296198870362015016001020160000204806846400386400384116002110910101600001001034010023311126161111824639873116161316000010640039640232640039640231640039
1600246400384794101004007814798651041600101016002410160000502287770211064016306400386400386198120362001816001020160000204800006400386400385116002110910101600001000056670010084331126161212225639873016161316000010640039640232640231640039640039
160024640231479411100000781479865251600101016000010160160502287689501064001906400386400386198690362001816001020160000204806786400386400385116002110910101600001000000010083331125161112627639873016161316000010640039640039640039640039640039
160024640038479510100000781479865251600101116000010160000502287689511064001906400386400386198120362015116001020160000204800006400386400381116002110910101600001000000010103331128161222716639873116161316000010640039640039640232640039640039