Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLS (by element, 4S)

Test 1: uops

Code:

  fmls v0.4s, v1.4s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004403730126134072510001000100053190814018403740373258338951000100030004037403711100110002073116113473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403731040234072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384085403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730024134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
100440373066134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
10044037301266134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730216134072510001000100053190814018403740373258338951000100030004037403711100110000073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmls v0.4s, v1.4s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204400373000100007263940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071013162239479100001004003840038400384003840038
10204400373000000571613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010001371012162239479100001004003840038400384003840038
1020440037300000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071013162239479100001004003840038400384003840038
1020440037300000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071013160239479100001004003840038400384003840038
10204400373000000011073940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071013162239479100001004003840038400384003840038
10204400373000000120613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400372110201100991001001000010020695371013162239479100001004003840038400384003840038
1020440037300000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071013162239479100001004003840038400384003840038
1020440037300100000613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000073212162239479100001004003840038400384003840038
1020440037300000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071013162239479100001004003840038400384003840038
1020440037300000000613940725101001001000010210000516570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000071012162239479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002440037300000005637394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000306404162239473010000104003840038400384003840038
10024400372990000061394072510010101000010101485057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001000000006403162239473010000104003840038400384003840038
100244003730000000124394072510010101000010101485057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000000198394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000012100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006403482239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001002000006402162239473010000104003840038400384003840038
10024400372990000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006403162239473010000104003840038400384003840038
1002440037299000002581394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
10024400373000000061394072510010101000010100005057069081400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmls v0.4s, v0.4s, v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020440037300006139407251012210010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005708307140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300096139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071012411394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037299006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040037400371110201100991001001000010000071011611394790100001004003840038400384003840038
1020440037300006139407251010010010000100100005005706908140018040037400373810833874510100200100002003000040132400371110201100991001001000010000071011611394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100244003730006139407251001010100001010000505706908140018040037400373813033876710010201000020304804003740037111002110910101000010000640416553947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640616443947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640416543947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640516453947310000104003840038400384003840038
1002440037300072639407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640416663947310000104003840038400384003840038
100244003729906139407251001010100001010000505706908140018040037400373813033876710010201016220300004003740037111002110910101000010000640516563947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640616563947310000104003840038400384003840038
100244003729906139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640616653947310000104003840038400384003840038
100244003729906139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640416663947310000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018040037400373813033876710010201000020300004003740037111002110910101000010000640516653947310000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmls v0.4s, v1.4s, v0.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204400373003613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
102044008329901243940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400372990613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400372990613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001007101161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100244003730001033940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010010006402162239473010000104003840038400384003840038
100244003730001033940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038
100244003730007263940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038
10024400372990613940725100101010000101000050570690804001840037400373813033876710306201000020300004003740037111002110910101000010002006402162239473010000104003840038400384003840038
10024400373000713940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473110000104003840038400384003840038
10024400373000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038
10024400372990613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038
100244003730005363940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038
10024400372990613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038
10024400373000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000006402162239473010000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmls v0.4s, v8.4s, v9.s[1]
  movi v1.16b, 0
  fmls v1.4s, v8.4s, v9.s[1]
  movi v2.16b, 0
  fmls v2.4s, v8.4s, v9.s[1]
  movi v3.16b, 0
  fmls v3.4s, v8.4s, v9.s[1]
  movi v4.16b, 0
  fmls v4.4s, v8.4s, v9.s[1]
  movi v5.16b, 0
  fmls v5.4s, v8.4s, v9.s[1]
  movi v6.16b, 0
  fmls v6.4s, v8.4s, v9.s[1]
  movi v7.16b, 0
  fmls v7.4s, v8.4s, v9.s[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042008815000004025801001008000010080000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515100004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
16020420065151000018925801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000050001011111611200621600001002006620066200662006620066
16020420065150000016625801001008000010080000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000120046200652006532380342200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
160204200651510012092225801001008000010080000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000908225801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515100004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2507

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 18 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200541500000450075258001212800001280000626400001120028200472004732380012208010820240000200532005311160021109101016000010001005862212826322301820050231160000102005420054200542005420054
1600242005315120000187258001212800001280000626400000120034200532005332380012208000020240000200532005311160021109101016000010001005562212926322312920050231160000102005420054200542005420054
1600242005315021100075258001212800001280000626400000020034200532005332380012208000020240000200532005311160021109101016000010001005562212826322291820050231160000102005420054200542005420054
1600242005315021000187258001212800001280000626400000020034200532005332380012208000020240000200532005311160021109101016000010001005762212826322282820050231160000102005420054200542005420054
1600242005315121100175258001212800001280000626400000020034200532005332380012208000020240000200532005311160021109101016000010001005662212826322292920050231160000102005420054200542005420054
1600242005315011100064258001212800001280000626400000120034200532005332380012208000020240000200532005311160021109101016000010001005562212926322303120050231160000102005420054200542005420054
16002420053151211001372258001212800001280000626400000120034200532005332380012208000020240000200532005311160021109101016000010001005162212426322312520050231160000102005420054200542005420054
160024200531502110297187258001212800001280000626400000120034200532005332380012208000020240000200532005311160021109101016000010001005662212726322272620050231160000102005420054200542005420054
160024200531512110372087258001212800001280000626400000120034200532005332380012208000020240000200532005311160021109101016000010001005762213026322313120050231160000102005420054200542005420054
1600242005315021000187258001212800001280000626400000120034200532005332380012208000020240000200532005311160021109101016000010001005662213026322312820050231160000102005420054200542005420054

Test 6: throughput

Count: 12

Code:

  fmls v0.4s, v12.4s, v13.s[1]
  fmls v1.4s, v12.4s, v13.s[1]
  fmls v2.4s, v12.4s, v13.s[1]
  fmls v3.4s, v12.4s, v13.s[1]
  fmls v4.4s, v12.4s, v13.s[1]
  fmls v5.4s, v12.4s, v13.s[1]
  fmls v6.4s, v12.4s, v13.s[1]
  fmls v7.4s, v12.4s, v13.s[1]
  fmls v8.4s, v12.4s, v13.s[1]
  fmls v9.4s, v12.4s, v13.s[1]
  fmls v10.4s, v12.4s, v13.s[1]
  fmls v11.4s, v12.4s, v13.s[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
120204400392991230613568925120103100120001100120000500563064014002040039416912657732499712010020012000020036000040039416861112020110099100100120000100007610131622416831200001004004040040400404004040040
1202044003931230061379662512010110012000110012000050056306401400204003941691265823249971201002001200002003600004168640039111202011009910010012000010000761021622400301200001004004041692400404168740040
12020440039312108061379662512010110012000210012000050056306401416724169140039249323266491201002001200002003600004169140039111202011009910010012000010000761021622400301200001004004041687400404169240040
1202044003931254061356892512010310012000110012000050056306401400204003941691265823249971201002001200002003600004169140039111202011009910010012000010000761021622416831200001004004041692400404168740040
1202044003931214416199612512010010012000010012000050058519931400204168640039249323249971201002001200002003600004169140039111202011009910010012000010000761021622400301200001004169240040416924009141692
120204416913008706199612512015210012000010012000050056306401400204003941687265823249971201002001200002003600004003941691111202011009910010012000010000761021622400301200001004169240040416874004041687
12020441686300102361996125120100100120000100120000500585186914167241691400392493224249971201002001200002003600004169140039111202011009910010012000010000761021622400301200001004168740040416924004041692
1202044169130010216199612512010010012000010012000050058518691416674168640039249323249971201002001200002003600004003941686111202011009910010012000010000761021622416831200001004004041692400404169240040
120204400393129036199612512010010012000010012000050058518691400204003940039249323249971201002001200002003600004168640039111202011009910010012000010000761021622400301200001004004040040416924004040040
12020441686300126172699612512010010012000010012000050058518691400204003940039249323249971201002001200002003600004169140039111202011009910010012000010000761021622400301200001004169240040416924004041692

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01) | cycle (02) | 03 | 09 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
120024400393160016199612512001010120000101200005056306400040020400394003924955325019120010201200002036000040039400391112002110910101200001000000752000716344003000120000104004040040400404004040040
120024400393000006199612512001010120000101200005056306400540020400394003924955325019120010201200002036000040039400391112002110910101200001000000752000316444003000120000104004040040400404004040040
120024400393000906199612512001010120000101200005056306401540020400394003924955325019120010201200002036000040039400391112002110910101200001000000752053416454003000120000104004040040400404004040040
12002440039299000619961251200101012000010120000505630640004002040039416862495532501912001020120000203600004003940039111200211091010120000100079800752053416444003000120000104004040040400404004040040
120024400392990006199612512001010120000101200005058518690540020400394003924955325019120010201200002036000040039400391112002110910101200001000001752053416344003000120000104004040040400404004040040
120024400393000006199612512001010120000101200005056306401540020400394003924955325019120010201202282036000040039400391112002110910101200001000000752053316994003000120000104004040040400404004040040
120024400393000006199612512001010120000101200005056306401540020400394003924955325019120010201200002036000040039400391112002110910101200001000000752003516444003000120000104004040040400404004040040
120024400393000006199612512001010120000101200005056306401540020400394009024955325019120010201200002036000040039416911112002110910101200001000000752053516534003000120000104004040040400404004040040
120024400393000006199612512001010120000101200005058518690040020400394003924955325019120010201200002036000040039400391112002110910101200001000000752050416444003004120000104004040040400404004041687
120024400393000006199612512001010120000101200005056306400040020400394003924955325019120010201200002036000040039400391112002110910101200001000000752050516534003000120000104004040040400404004040040