Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLA (by element, 2D)

Test 1: uops

Code:

  fmla v0.2d, v1.2d, v2.d[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03091e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
100440373000823407251000100010005319084018403740373258338951000100030004037403711100110000073216123473100040384038403840384038
100440373000613407251000100010005319084018403740373258338951000100030004037403711100110000073116223473100040384038403840384038
1004403730006134072510001000100053190840184037403732583389510001000300040374037111001100001873216213473100040384038403840384038
1004403730003093407251000100010005319084018403740373258338951000100030004037403711100110000073116223473100040384038403840384038
100440373000843407251000100010005319084018403740373258338951000100030004037403711100110000073116213473100040384038403840384038
100440373000613407251000100010005319084018403740373258338951000100030004037403711100110000073116113473100040384038403840384038
1004403730006134072510001000100053190840184037403732583389510001000300040374037111001100001573216113473100040384038403840384038
100440373100823407251000100010005319084018403740373258338951000100030004037403711100110000073216113473100040384038403840384038
100440373000823407251000100010005319084018403740373258338951000100030004037403711100110000073216213473100040384038403840384038
1004403730001523407251000100010005319084018403740373258338951000100030004037403711100110000073216113473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla v0.2d, v1.2d, v2.d[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020440037300000061394072510100100100001001000050057069080400530400374003738108033874510100200100002003000040037400371110201100991001001000010000071004163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069081400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071003163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069081400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071013163339479100001004003840038400384003840038
1020440037299000061394072510100100100001001000050057069081400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071014164339549100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069081400180400374003738108033874510100200100002003000040037400851110201100991001001000010000071013163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069080400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071013163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069080400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071013163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069080400180400374003738108033874510100200100002003000040037400372110201100991001001000010000071014163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069081400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071013163339479100001004003840038400384003840038
1020440037300000061394072510100100100001001000050057069080400180400374003738108033874510100200100002003000040037400371110201100991001001000010000071013163339479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03181e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037300000613940743100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010001000640216223947310000104003840038400384003840038
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037299000613940725100101010000101000050570830414001840037400373813033876710010201000020300004003740037111002110910101000010000000640216233947310000104003840038400384003840038
1002440037299000613940725100101010000101000050570690814001840037400373813033876710157201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000000640216223947310000104003840038400384003840038
1002440037300000613940725100101010000101000050570690814001840037400373813033876710010201017220300004003740037111002110910101000010000000640216223947310000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmla v0.2d, v0.2d, v1.d[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)09181e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204400373000000823940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161140034100001004003840038400384003840038
1020440037300000029973940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
10204400372990000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000017101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
10204400372990000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
102044003730000007263940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
10204400373000000613940744101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101171139479100001004003840038400384003840038
10204400373000000613940725101001001000010010000500570690814001840037400373810803387451010020010000200300004003740037111020110099100100100001000007101161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002440037299061394072510010101000010100005057069084001840037400373813003387671001020100002030000400374003711100211091010100001000000726402162239473010000104003840038400384003840038
100244003729906139407251001010100001010000505706908400184003740037381300338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
10024400373000613940725100181010000101000050570690840018400374003738130033876710010201000020300004003740037111002110910101000010000001416402162239473010000104003840038400384003840038
10024400373000613940725100101010006111000050570690840018400374003738130033876710010201000020300004003740037111002110910101000010000001416402162239473010000104003840038400384003840038
10024400372990613940725100101010000101000050570690840018400374003738130033876710010201000020300004003740037111002110910101000010000001356402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908400184003740037381300338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813003387671001020100002030000400374003711100211091010100001000079036402162239473010000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813003387671001020100002030000400374003711100211091010100001000018096402162239473010000104003840038400384003840038
100244008530006139398251001010100001010000505706908400184003740037381300338767100102010000203000040037400371110021109101010000100000036402163339473010000104003840038400384003840038
100244003729906139407251001010100001010000505706908400184003740037381300338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038

Test 4: Latency 1->3

Code:

  fmla v0.2d, v1.2d, v0.d[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03181e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
102044003730000061394072510100100100001001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100937101161139479100001004003840038400384003840038
1020440037299000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001004737101160139479100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001005007101161139479100001004003840038400384003840038
1020440037299000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001005067101161139479100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001004597101161139479100001004003840038400384003840038
102044022730000061394072510100100100061001000050057069081400180400374003738108338745101002001000020030000400374003711102011009910010010000100837101161139479100001004003840038400384003840038
1020440037300000613940725101001001000010010000500570690814001804003740037381083387451010020010000200300004003740037111020110099100100100001000667101161139479100001004003840038400384003840038
102044003729900061394072510100100100001001000050057069080400180400374003738108338745101002001000020030000400374003711102011009910010010000100007101161139479100001004003840038400384003840038
10204400373000006139407251010010010000100100005005706908140018040037400373810826387451010020210000200300004008440037111020110099100100100001003937101161139479100001004008540038400384003840038
102044003730000061394072510100100100001001000050057069080400183400374003738108338745101002001000020030000400374003711102011009910010010000100107101161139479100001004003840038400384003840185

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002440037299010339407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100026402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908140018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003729906139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003729906139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100006402162239473010000104003840038400384003840038
100244003730006139407251001010100001010000505706908040018400374003738130338767100102010000203000040083400371110021109101010000100006402162239473010000104003840038400384003840038
1002440037300061394072510010101000010100005057069080400184003740037381303387671001020100002030000400374003711100211091010100001062306402162239473010000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla v0.2d, v8.2d, v9.d[1]
  movi v1.16b, 0
  fmla v1.2d, v8.2d, v9.d[1]
  movi v2.16b, 0
  fmla v2.2d, v8.2d, v9.d[1]
  movi v3.16b, 0
  fmla v3.2d, v8.2d, v9.d[1]
  movi v4.16b, 0
  fmla v4.2d, v8.2d, v9.d[1]
  movi v5.16b, 0
  fmla v5.2d, v8.2d, v9.d[1]
  movi v6.16b, 0
  fmla v6.2d, v8.2d, v9.d[1]
  movi v7.16b, 0
  fmla v7.2d, v8.2d, v9.d[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1602042008815000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620171201712015920066
1602042006515000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000011011111611200621600001002006620066200662006620066
16020420065151000040258010010080000100800005006400000200462006520065402380100200800002002400002006520065111602011009910010016000010000020001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515100004025801001008000010080000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000011480000500640000020046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066
1602042006515000004025801001008000010080000500640000120046200652006532380100200800002002400002006520065111602011009910010016000010000000001011111611200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)031e3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6erob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)acc2branch mispred nonspec (cb)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
160024200861510029227800121280000128000062640000100200342005220052032380012208000020240000200522005211160021109101016000010000000100348111425211118200492601160000102005320053200532005320053
16002420052151002112780012128000012800006264000010520033200522005203238001220800002024000020052200521116002110910101600001000202010034331725211711200492401160000102005320053200532005320053
16002420053150120123727800121280000128000062640000115201102005220052032380012208000020240000200522005211160021109101016000010021000100348311125211813200832201160000102005320053200532005420054
160024200521500027227800121280000128000062640000100200332005220052032380012208000020240000200522005211160021109101016000010000600100348111125211117200492201160000102005320053200542005320053
160024200521500027427800121280000128000062640000115200332005220052032380012208000020240000200522005211160021109101016000010000000100308311125211117200492201160000102005320053200532005320053
16002420052150002352780012128000012800006264000011520033200522005203238001220800002024000020052200521116002110910101600001000000010030831725211711200492401160000102005320053200532005320053
1600242005215000263278001212800001280000626400001152003320052200525323800122080000202400002005220052111600211091010160000100000001003483113252111111200492201160000102005320053200532005320053
16002420052150004627800121280000128000062640000115200332005220052032380012208000020240000200522005211160021109101016000010000000100308311125211117200492201160000102005320053200532005320053
160024200521500021627800121280000128000062640000115200332005220052032380012208000020240000200522005211160021109101016000010000300100308311125211119200492201160000102005320053200532005320053
1600242005215000130278001212800001280000626400001152003320052200610323800122080000202400002005220052111600211091010160000100000001003083111252111111200492201160000102005320053200532005320053

Test 6: throughput

Count: 12

Code:

  fmla v0.2d, v12.2d, v13.d[1]
  fmla v1.2d, v12.2d, v13.d[1]
  fmla v2.2d, v12.2d, v13.d[1]
  fmla v3.2d, v12.2d, v13.d[1]
  fmla v4.2d, v12.2d, v13.d[1]
  fmla v5.2d, v12.2d, v13.d[1]
  fmla v6.2d, v12.2d, v13.d[1]
  fmla v7.2d, v12.2d, v13.d[1]
  fmla v8.2d, v12.2d, v13.d[1]
  fmla v9.2d, v12.2d, v13.d[1]
  fmla v10.2d, v12.2d, v13.d[1]
  fmla v11.2d, v12.2d, v13.d[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01)cycle (02)031e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acc2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
12020440039308002339961251201011001200011001200005005630640424514003940039249323249971201002001200002003600004169140039111202011009910010012000010000007610116114003001200001004004040040400404004040040
1202044003930000619961251201001001200001001200005005630640400204247040039273593274281201002001200002003600004169140039111202011009910010012000010000007610116114003001200001004004042471400404169240040
12020440039300001249961251201001001200001001200005005630640400204003940039249323249971201002001200002003600004003940090111202011009910010012000010000007610116114003001200001004004040040424714004040040
1202044168630001849961251201001001200001001200005005630640400204003940039249323249971202962001200002003600004247040039111202011009910010012000010000007610116114003001200001004004040040400404004040040
1202044003931200619961251201001001200001001200005005630640400204003942470273593274281201002001200002003600004003942470111202011009910010012000010000007610116114003001200001004004040040400404004040040
12020440039318002089961251201001001200001001200005005966386424514247040039249323249971201002001200002003600004003940039111202011009910010012000010000007610116114246101200001004004040040400404004040040
12020440039300002379961251201011001200001001200005005630640400204003941675249323266491201002001200002003600004247040039111202011009910010012000010000007610116114003001200001004247140040424714004040040
1202044003930000619961251201001001200011001200005005630640400204247040039249323274281201002001200002003600004247040039111202011009910010012000010000007610116114003001200001004004040040400404004042471
120204424703000010337966251201021001200001001200005005630640400204003940039249323249971201002001200002003600004003940039111202011009910010012000010000007610116114003001200001004004040040400404004040040
12020440039300007069961251201001001200011001200005005630640400204247040039249323249971201002001200002003600004003940039111202011009910010012000010000007610116114003001200001004247141702424714004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3337

retire uop (01)cycle (02)03mmu table walk instruction (07)l2 tlb miss data (0b)181e1f373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaebec? simd retires (ee)? int retires (ef)f5f6f7f8fd
1200244110430000000006199612512001010120000101200005056306400140020400394003924955032501912001020120000203600004003941691111200211091010120000100000000761800181600024400300000120000104004040040400404004041692
1200244168629900000006199612512001010120000121200006056306400141672400394003924955032501912001020120000203600004003940039111200211091010120000100000000752000051600042400300000120000104004040040400404004040040
1200244003930000000006199612512001010120000101200005056306400140020400394003924955032501912001020120000203600004003941701111200211091010120000100000000752000031600034400300000120000104004040040400404004040040
12002440039300000000083799612512001010120000101200005056306400140020400394003924955032501912001020120000203600004003940039111200211091010120000100001020840752000041600034400300000120000104169240040400404004040040
12002440039300100120006199612512001010120000101200005056306400140020400394003924955032501912001020120000203600004003940039111200211091010120000100000000752000021600026400300000120000104004040040400404004040040
1200244003930000000006199612512001312120000121200006056306400140020400394003924955032501912001020120000203600004003940039111200211091010120000100002000752000151620062400300000120000104004040040400404004040040
120024400393130001201061356892512001210120000101200005056306400140020400394003924955032666812001020120000203600004003940039111200211091010120000100000030752000041600042400300000120000104004040040400404004040040
1200244003930000000006199612512001010120000101200005056306400142451400394003924955032501912001020120000203600004003940039111200211091010120000100000000752000021600057400300000120000104004040040400404004040040
1200244003930000000006199612512001010120000101200005056306400140020400394003924955032667112001020120000203600004003940039111200211091010120000100000000752000041600024400300000120000104004040040401094177240040
1200244003930000000006199612512001010120000101200005056306400140020400394003924955032666812001020120000203600004003940039111200211091010120000100000000752000021600042403280000120000104004040040400404004040040