Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCMLA (vector, by element, 8H)

Test 1: uops

Code:

  fcmla v0.8h, v1.8h, v2.h[1], #90
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373001289734072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
10044037300186134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730106134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730008234072510001000100053190804018403740373258338951000100030004037403711100110001073116113473100040384038403840384038
1004403730008234072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000079116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110001373116113473100040384038403840384038
1004403730006134072510001000100053190804018403740373258338951000100030004037403711100110000073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fcmla v0.8h, v1.8h, v2.h[1], #90
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003729900613940725101001001000010010000500570690814001840037400373811506387411010020010008200300244003740037111020110099100100100001000011171711601394900100001004003840038400384003840038
102044003730001843940725101001001000010010000500570690814001840037400373811506387411010020010008200300244003740037111020110099100100100001000011171801600394890100001004003840038400384003840038
102044003730001613940725101001001000010010000500570690804001840037400373811506387411010020010008200300244003740037111020110099100100100001000011171801600394890100001004003840038400384003840038
102044003730000823940725101001001000010010000500570690804001840037400373811507387401010020010008200300244003740037111020110099100100100001000011171701600394890100001004003840038400384003840038
102044003729900613940725101001001000010010000500570690804001840037400373811506387411010020010008200300244003740037111020110099100100100001000011171801600394900100001004003840038400384003840038
102044003729900613940725101001001000010010000500570690814001840037400373811506387411010020010008200300244003740037111020110099100100100001000011171801600394900100001004003840038400384003840038
102044003730000613940725101001001000010010000500570690804001840037400373811507387401010020010008200300244003740037111020110099100100100001000011171701600394900100001004003840038400384003840038
102044003730000613940725101001001000010010000500570690814001840037400373811506387411010020010008200300244003740037111020110099100100100001000311171801600394890100001004003840038400384003840038
1020440037300151823940725101001001000010010000500570690804001840037400373811506387411010020010008200300244003740037111020110099100100100001000011171701600394890100001004003840038400384003840038
102044003730000613940725101001001000010010000500570690804001840037400373811507387401010020010008200300244003740037111020110099100100100001000011171701600394890100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002440037300006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640316223947310000104003840038400384003840038
1002440037300006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216233947310000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100100640216223947310000104003840038400384003840038
1002440037300006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216233947310000104003840038400384003840038
1002440037300006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216233947310000104003840038400384003840038
1002440037300006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
1002440037299006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
1002440037300006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
10024400373000061393892510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001001000640216223947310000104003840038400384003840038
10024400373000072639407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fcmla v0.8h, v0.8h, v1.h[1], #90
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003730036139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021623394790100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038
1020440037300426139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038
102044003730026161394072510100100100001001000050057069084001840037400373810820387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038
1020440037300276139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000203071021622394790100001004003840038400384003840038
10204400373003944139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038
10204400373003516139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038
1020440037300336139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000003071021622394790100001004003840038400384003840038
10204400373004206139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038
102044003730066139407251010010010000100100005005706908400184003740037381083387451010020010000200300004003740037111020110099100100100001000000071021622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024400373000001509433940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
1002440037300000225067783940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
1002440037300000300613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640003162239473010000104003840038400384003840038
1002440037299000300613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
100244003730000020106139407251001010100001010000505706908104001840037400373813017387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
10024400373000001140613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
10024400373000004170613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
10024400373000003690613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
10024400373000002400613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038
10024400372990004950613940725100101010000101000050570690810400184003740037381303387671001020100002030000400374003711100211091010100001000000640002162239473010000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fcmla v0.8h, v1.8h, v0.h[1], #90
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003730039613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
102044003730007473940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
102044003730002053940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001840037400373812233874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
1020440037300411613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071021622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002440037299006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006405162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040158400854003738130338767100102010000203000040037400371110021109101010000100016402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040022400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037299006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fcmla v0.8h, v8.8h, v9.h[1], #90
  movi v1.16b, 0
  fcmla v1.8h, v8.8h, v9.h[1], #90
  movi v2.16b, 0
  fcmla v2.8h, v8.8h, v9.h[1], #90
  movi v3.16b, 0
  fcmla v3.8h, v8.8h, v9.h[1], #90
  movi v4.16b, 0
  fcmla v4.8h, v8.8h, v9.h[1], #90
  movi v5.16b, 0
  fcmla v5.8h, v8.8h, v9.h[1], #90
  movi v6.16b, 0
  fcmla v6.8h, v8.8h, v9.h[1], #90
  movi v7.16b, 0
  fcmla v7.8h, v8.8h, v9.h[1], #90
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042006515104025801001008000010080000500640000120046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
1602042006515004025801001008000010080000500640000120046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
1602042006515004025801001008000010080000500640000020046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
1602042006515004025801001008000010080000500640000120046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
1602042006515004025801001008000010080000500640000020046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
1602042006515104025801001008000010080000500640000020046020065200653238010020080000200240000200652006511160201100991001001600001000000031011111611200621600001002006620066200662006620066
1602042006515004025801001008000010080000500640000020046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
16020420065150070525801001008000010080000500640000020046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111611200621600001002006620066200662006620066
1602042006515104025801001008000010080000500640000020046020065200653238010020080000200240000200652006511160201100991001001600001000000001011111612200621600001002006620066200662006620066
1602042006515104025801001008000010080000500640000120046020065200653238010020080000200240000200652006511160201100991001001600001000001001011111611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420075151090052278001212800001280000626400001120033200522005232380012208010820240000200522005211160021109101016000010000001002531152541133200492202160000102005320053200532005320053
160024200521500480046278001212800001280000626400001120033200522005232380012208000020240000200522005211160021109101016000010000001002731133421133200492202160000102006220053200532005320053
16002420052150000046278001212800001280000626400001120033200612005232380012208000020240000200522006111160021109101016000010000001002631133421232200492201160000102005320053200532005320053
16002420052150000052278001212800001280000626400001120033200522005232380012208000020240000200522005211160021109101016000010009001002631132521142200492201160000102005320053200532005320053
160024200521500510046278001212800001280000626400001120033200522005232380012208000020240000200522005211160021109101016000010000001002631132521133200492201160000102005320137201362005320136
160024200521500150046278001212800001280000626400001120033200522005232380012208000020240000200522005211160021109101016000010000001002731132521123200492201160000102005320053200532005320153
16002420052150000046278001212800001280218626400001120033200522005232380012208000020240000200522005211160021109101016000010000001002731142521132200492201160000102005320053200532005320053
16002420052150000073278001212800001280000626400001120033200522005232380012208000020240000200522005211160021109101016000010000001002531122521124200492201160000102005320053200532005320053
160024200521500330046298001212800001280000626400001120033200522005232380012208000020240000201342014411160021109101016000010000001002531232522133200492202160000102005320053200532005320053
1600242005215004320046278001212800001280000626400001120033200522006132380012208000020240000200612005211160021109101016000010900001002531132522134200492201160000102005320053200532005320053

Test 6: throughput

Count: 16

Code:

  fcmla v0.8h, v16.8h, v17.h[1], #90
  fcmla v1.8h, v16.8h, v17.h[1], #90
  fcmla v2.8h, v16.8h, v17.h[1], #90
  fcmla v3.8h, v16.8h, v17.h[1], #90
  fcmla v4.8h, v16.8h, v17.h[1], #90
  fcmla v5.8h, v16.8h, v17.h[1], #90
  fcmla v6.8h, v16.8h, v17.h[1], #90
  fcmla v7.8h, v16.8h, v17.h[1], #90
  fcmla v8.8h, v16.8h, v17.h[1], #90
  fcmla v9.8h, v16.8h, v17.h[1], #90
  fcmla v10.8h, v16.8h, v17.h[1], #90
  fcmla v11.8h, v16.8h, v17.h[1], #90
  fcmla v12.8h, v16.8h, v17.h[1], #90
  fcmla v13.8h, v16.8h, v17.h[1], #90
  fcmla v14.8h, v16.8h, v17.h[1], #90
  fcmla v15.8h, v16.8h, v17.h[1], #90
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602044042730003900420251601171001600221001600005005715261411864119042285199730319998160100200160000200480000411904004311160201100991001001600001000151011031644400371600001004119142286400414004140041
160204412093000210082454125160100100160001100160000500128000040021422974122719973032114816010020016000020048000040040400401116020110099100100160000100001011041643400371600001004122041205400414004140041
160204400403000150061448525160100100160000100160000500128000040021411904120522222031999816010020016000020048000040040412041116020110099100100160000100001011041644400371600001004004140044407264004142286
160204400403090001271448525160100100160000100160000500128000040021423184004021095032114816010020016000020048000041205411901116020110099100100160000100001011031644412021600001004004140041400414004141191
160204400403000002242448525160117100160000100160000500128000040021411904120519973031999816010020016000020048000041190400401116020110099100100160000100001011041644400401600001004004140041400414119140041
1602044119030004200518375343160279100160102114160107500128000040021400404119019973032114816010020016000020048000040043400401116020110099100100160000100001011041634400371600001004119141206411914120641191
160204411903000902261025160100100160022100160000500571526140021400404004021095031999816010020016000020048000040040411901116020110099100100160000100001011031644422821600001004004440041400414119140041
1602044004030007502261025160100100160000100160000500131999940021411904004019973031999816010020016000020048000041190412091116020110099100100160000100001011041634400371600001004004141206411914004240041
1602044120430003300726025160100100160000100160000500131999940021400404004019973032114816010020016000020048000040040400401116020110099100100160000100001011041644412021600001004004141191412204120540041
160204411903170120042448525160100100160000100160000500128000041171400404004021095031999816010020016000020048000040040412611116020110099100100160000100001011041644400371600001004004241191422864004141191

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 18 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | st unit uop (a7) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024400973061000035047025160045101600531016000050128000011541200041226400401999603200231601192016021020480630400404004211160021109101016000010000141010022811916211117412260208160000104122740042400414004140041
1600244005130800000530670251600101016003510160000501280000015400210400404122619996032229816001020160000204800004004042318111600211091010160000100000010022841716211715400370209160000104004142319400414231940041
1600244004229900000350470251600101016005310160000501280000115400210400404231822244032002016001020160000204800004004040043111600211091010160000100001620100228411516211157423150208160000104228240041400424231940041
16002440055317000000047029316050112160172111600006114869341154120704004040814211410102089116001020160169204806244062240149211600211091010160000104000010022841716211117422821209160000104080741094400414122741313
1600244004030301110415981047460725160010101600001016000050572804711540021042281412191999603200201600102016000020480000400404009521160021109101016000010011001002284111162111111412230208160000104122040041400434230342282
1600244009230000000530490251600451016000110160000501280000115400210423184004019996032229816001020160000204800004122640040111600211091010160000100002370100228411116411715400370208160000104004140043412274004140041
160024422852990000000890251600101016003510160000501280000115400210400404122921146253212061600102016000020480000400404231811160021109101016000010000001002484111162111111423150208160000104004140041412274004140041
16002442281299000006005302516001010160000101600005012800001154120004121940040199960320020160010201600002048000040040400401116002110910101600001000000100228411116411117400390409160000104004142282412204122740041
16002440101300000000053025160010101600351016000050586833311540021041226412261999625320020160010201600002048000040041412191116002110910101600001000000100228411116211715400370208160000104004142319400414228641227
1600244010230900000104702516004510160001101600005057280471154002104004042281199960320020160010201600002048000041226400401116002110910101600001000033010022841716211711400370208160000104004140041400414004142319