Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLSL2 (vector, 2S)

Test 1: uops

Code:

  fmlsl2 v0.2s, v1.2h, v2.2h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)0309l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
10044037300000000314340725100010001000531908140184037403732583389510001000300040374084111001100000000073116113473100040384038403840384038
1004403730000000061340725100010001000531908140184037403732583391410001160305440734133211001100022103565094137113509100040864085408540384086
10044074301011000327339849100010001148533304140534084407332588390711391000348640854085211001100002022193073124113489100040864074408540744074
10044084300011144880958339849100010061000533304140184084403732627391511481160348340844073211001100002203550294140113473100040844038408640864084
10044084300111135881873339845100610001082531908140534073408532618391411481163348340374083211001100000100073116113473100040384038403840384038
1004403730000000061340725100010001000531908140184037403732583389510001000300040374037111001100000000073116113473100040384038403840384038
10044037300000000229340725100010001000531908140184037403732583389510001000300040374037111001100000100073116113473100040384038403840384085
1004403730000000082340725100010001000531908140184037403732583389510001000300040374037111001100000000073116113473100040384038403840384038
1004403730000000061340725100010001000531908140184037403732583389510001000300040374037111001100000000073116113473100040384038403840384038
1004403730000000061340725100010001000531908040184037403732583389510001000300040374037111001100000000073116113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmlsl2 v0.2s, v1.2h, v2.2h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)0318191e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)cdcfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
102044003730000006139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710021622394790100001004003840038400384003840038
102044003730000006139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038
102044003730000006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100100000710121622394790100001004003840038400384003840038
102044003730000016139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038
102044003729900006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038
102044003730000006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038
102044003730000006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102021009910010010000100000000712121622394790100001004003840038400384003840038
102044003730000006139407641010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038
102044003730000006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038
1020440037300000044139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100000000710121622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024400372990006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640316223947310000104003840038400384003840038
10024400373000006439407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
10024400373000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
100244003730000010639407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
10024400373000006139407641001010100001010000505706908040018401314008438130338767100102010000203000040037400371110021109101010000102200640216223947310000104003840038400384003840038
1002440037300001261394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001004100640216323947310000104003840038400384003840038
10024400373110006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100030640216223951110000104003840038400384003840038
100244003730000078839407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
10024400373000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038
100244003729900081339407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000640216223947310000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmlsl2 v0.2s, v0.2h, v1.2h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101160139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
1020440037300072639407251010010010000100100005005706908140018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
1020440037300031339407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
102044003730006139407251010010010000100100005005706908040018400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003729900006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000002006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000007242162239473010000104003840038400384003840038
10024400373000001836139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038
100244003730000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000006402162239473010000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmlsl2 v0.2s, v1.2h, v0.2h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204400373100000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400372990010613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000613940744101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000008001161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101161139509100001004003840038400384003840038
10204400372990000613940764101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100010007101161139479100001004003840038400384018040038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)acc2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640316343947310000104003840038400384003840038
10024400372990006139407251001010100001010000565706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640316443947310000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640316343947310000104003840038400384003840038
100244003730000010539407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640416343947310000104003840038400384003840038
100244003730000153639407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640316433947310000104003840038400384003840038
10024400372990006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640316443947310000104003840038400384003840038
100244003729912006139407251003610100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640416443947310000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640416443947310000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640416433954710000104003840038400384003840038
10024400373000006139407251001010100001010000505706908400184003740037381303387671001020100002030000400374003711100211091010100001000000640316443947310000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmlsl2 v0.2s, v8.2h, v9.2h
  movi v1.16b, 0
  fmlsl2 v1.2s, v8.2h, v9.2h
  movi v2.16b, 0
  fmlsl2 v2.2s, v8.2h, v9.2h
  movi v3.16b, 0
  fmlsl2 v3.2s, v8.2h, v9.2h
  movi v4.16b, 0
  fmlsl2 v4.2s, v8.2h, v9.2h
  movi v5.16b, 0
  fmlsl2 v5.2s, v8.2h, v9.2h
  movi v6.16b, 0
  fmlsl2 v6.2s, v8.2h, v9.2h
  movi v7.16b, 0
  fmlsl2 v7.2s, v8.2h, v9.2h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)031e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020420091150480402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001013811611200621600001002006620066200662006620066
1602042006515000402580100100800001188000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
160204200651503390402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
160204200651504320842580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
16020420065150150402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
160204200651502970402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
1602042006515000402580233100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
160204200651513360402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
160204200651504650402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066
160204200651502190402580100100800001008000050064000020046200652006532380100200800002002400002006520065111602011009910010016000010000001011111611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)03mmu table walk instruction (07)1e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)c3cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242006915000462780012128000012800006264000011020033200522005232380012208000020240000200522005211160021109101016000010001002731192522156200492201160000102005320053200532005320062
1600242005215000512780012128000012800006264000011020033200522005232380012208000020240000200522005211160021109101016000010001002831152521135200492201160000102005320053200532005320053
1600242005215100462780012128000012800006264000011020033200522005232380012208000020240000200612006111160021109101016000010001003361172541274200492201160000102005320053200532005320121
1600242005215000462780012128000012800006264000011020033200522005232380012208000020240000200522005211160021109101016000010001002831132521155200492201160000102005320053200532005320053
1600242005215000462780012128000012800006264000011020033200522005232380012208000020240000200522005211160021109101016000010001002831132521135200492202160000102005320053200622006220053
160024200611500147462780012128000012800006264000011020033200522005232380012208000020240000200522005211160021109101016000010001002882132521157200582201160000102005320122200622005320053
1600242005215000462780012128000012800006264000011520033200522005232380012208000020240000200522005211160021109101016000010001002882162421135200492201160000102005320053200532005320053
160024200521500210462780012128000012800006264000011520033200522005232380012208000020240000200522005211160021109101016000010001002882152521135200492201160000102005320053200532005320053
16002420052150003312980012128000012800006264000011520033200522006132380012208000020240000200522005211160021109101016000010101002682182521156200492201160000102005320053200532005320053
16002420052150001412780012128000012800006264000011520033202332005232380012208000020240000200522005211160021109101016000010001002682132521135200492201160000102005320053200532005320053

Test 6: throughput

Count: 12

Code:

  fmlsl2 v0.2s, v12.2h, v13.2h
  fmlsl2 v1.2s, v12.2h, v13.2h
  fmlsl2 v2.2s, v12.2h, v13.2h
  fmlsl2 v3.2s, v12.2h, v13.2h
  fmlsl2 v4.2s, v12.2h, v13.2h
  fmlsl2 v5.2s, v12.2h, v13.2h
  fmlsl2 v6.2s, v12.2h, v13.2h
  fmlsl2 v7.2s, v12.2h, v13.2h
  fmlsl2 v8.2s, v12.2h, v13.2h
  fmlsl2 v9.2s, v12.2h, v13.2h
  fmlsl2 v10.2s, v12.2h, v13.2h
  fmlsl2 v11.2s, v12.2h, v13.2h
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01)cycle (02)031e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
12020440039312120037335689251201011001200001001200005005966386140020040265416862493232664412010020012022320036060940039400391112020110099100100120000100007610116114003001200001004168740040416874004041687
12020441691300003619961251201011001200001001200005005630640042451041686400392493232499712010020012000020036000041686400391112020110099100100120000100007610116114246101200001004009240040400404004040040
12020441686300001619961251201001001200001001200005005851993041672040039400392493232664912010020012000020036000041686400391112020110099100100120000100007610116114003001200001004004041692424714004040040
12020440039299000619961251201031001200031001200005005851993040020040039416862493232664912010020012000020036000041686400391112020110099100100120000100007610116114167701200001004169240040400404168740040
1202044168630000353640274251201011001200001001200005005630640041669040039400392657732499712010020012000020036000041691400391112020110099100100120000100007610116114003001200001004004041687400404168740040
120204416863000006135689251201001001200001001200005005630640140020040039400392493232499712010020012000020036000040039400391112020110099100100120000100007610116114003001200001004169240040400404004041687
120204416863000006137966251201001001200021001200005005630640141672040039416862493232499712010020012000020036000040039400391112020110099100100120000100007610116114167701200001004004040040400404004040040
120204400393120036137966251201031001200031001200005005630640140020040039400392493232499712010020012000020036000041686400391112020110099100100120000100207610116114168301200001004004041692400404247140040
12020441691299003619961251201001001200001001200005005630640041667041686400392657732664412010020012000020036000041686400391112020110099100100120000100107610116114003001200001004247140040400404247140040
1202044003931200115635689251201011001200001001200005005630640040020040039400392493232499712010020012000020036000041686400391112020110099100100120000100007610116114168301200001004004041692400404004041687

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01)cycle (02)031e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)acc2branch mispred nonspec (cb)cfd0d5map dispatch bubble (d6)d9daddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
12002440039300006199612512001010120000101200005058605081400200400394003924955325019120010201200002036000040039400391112002110910101200001000000075200516003540030120000104004041703400404004040040
1200244003929900149799612512001010120000101200005056306401400200400394003924955325019120010201200002036000040039400391112002110910101200001000000075200316003540030120000104169240040400404004040040
12002440039300006199612512001010120000101200005056306401416690400394003924955325019120010201200002036000040039400391112002110910101200001000000075550516005340030120000104004040040400404004040040
120024400393120461356892512001010120000101200005056306401400200400394003924955325019120010201200002036000040039416861112002110910101200001000000075200316005340030120000104004040040400404004040040
12002440039299006199612512001010120000101200005056306401400200400394003924955325019120010201200002036000040039400391112002110910101200001000000075200516005340030120000104004040040400404004040040
12002440039300006199612512001010120000101200005056306401400200400394003924955325019120010201200002036000040039400391112002110910101200001000000075200316003540030120000104004040040400404170340040
12002440039300006199612512001010120000101200005056306401400200400394003924955325019120010201200002036000040039400391112002110910101200001000000075200516005341693120000104004040040417034004040040
12002440039300006199612512001010120000101200005058605081400200400394003924955325019120010201200002036000040039416861112002110910101200001000000075200316003541678120000104004040040400404004040040
120024424703001206199382512001010120000101200005056306401400200400394003926615325019120010201200002036000040039400391112002110910101200001000000075200516005340030120000104004040040400404004040040
12002440039299006199612512001010120000101200005056306401400200400394003924955325019120010201200002036000040039400391112002110910101200001000000075200316005541693120000104004040040416874004041692