Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLA (vector, 4S)

Test 1: uops

Code:

  fmla v0.4s, v1.4s, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
1004403730020934072510001000100053190814018403740833258338951000100030004037403711100110000073216223473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
1004403730010834072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
100440373106134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038
100440373006134072510001000100053190814018403740373258338951000100030004037403711100110000073216223473100040384038403840384038

Test 2: Latency 1->1

Code:

  fmla v0.4s, v1.4s, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204400373000000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710031622394790100001004003840038400384003840038
102044003729900000613940725101001001000010010000500570690840018400374003738108338745101002001000020030000400374003711102011009910010010000100000007101216223947919100001004003840038400384003840038
10204401813000000061394074410100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000003710121622394790100001004003840038400384003840038
102044003730000000103394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121632394790100001004003840038400384003840038
102044003730000018161394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110202100991001001000010000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384003840038
10204400373000000161394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000103710121622394790100001004003840038400384003840038
10204400372990000061394072510100100100001001000050057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710121622394790100001004003840038400384003840038
10204400373000000061394072510100100100001001014750057069084001840037400373810833874510100200100002003000040037400371110201100991001001000010000000710131622394800100001004003840038400384003840038
10204400373000000061394072510100100100001001000053657073154001840037400373810833874510100200100002003000040037400371110201100991001001000010000100739121622394790100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024400372990613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000640416223947310000104003840038400384003840038
100244003730001243940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010030640216223947310000104003840038400384003840038
10024400373000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400373000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400373000613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640216233947310000104003840038400384003840038
10024400373000613940725100101010000101000050570690814001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440037299013983940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
100244003730007263940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400372990613940725100101010000101000050570690804001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400373000613940725100101010000101000050570690804001840037400373813033876710157201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038

Test 3: Latency 1->2

Code:

  fmla v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020440037300000002603940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840038400384003840038
102044003730000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840038400384003840038
102044003729900000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139513100001004003840038400384003840038
102044003729900000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840086400854003840038
1020440037299011002083940725101001001000010010000500570868904001840084401323810833874510100200103272063048340134401323110201100991001001000010000007101161139479100001004003840038400384003840038
102044003730000000613940725101001001000010010000522570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840038400384003840038
102044003730000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840038400384003840038
102044003730000000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840038400384003840038
102044003730000000613940725101001001000010010000500570690814001840037400373810833874510100200100002003000040037400371110201100991001001000010000007101161139479100001004003840038400384003840038
102044003729900000613940725101001001000010010000500570690804001840037400373810833874510100200100002003000040037400371110201100991001001000010000017101161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)acc2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
100244003729900028517612439407251001010100001010000505706908040053400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
1002440037300000006139407251001010100001010000505706908040018402264003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
10024400373001002706139407251001010100001010000505712492040018400374003738130338767100102010000203000040037400371110021109101010000100016006402162239473010000104003840038400384003840038
1002440037300000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100003006402162239473010000104003840038400384013340038
1002440037300000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100013006402163339473010000104003840038400384003840038
1002440037299000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100020206402162239473010000104003840038400384003840038
1002440037299000006139407251001010100001010000505708304040018400374003738130338767100102010000203000040037400371110021109101010000100000006402242239473010000104003840038400384003840038
10024400373000000012439407251001012100001010148505706908040018400844003738135338786100102010000203048340037400371110021109101010000100000006402162239473010000104003840038400384003840038
1002440037299000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840038
1002440037299000006139407251001010100001010000505706908040018400374003738130338767100102010000203000040037400371110021109101010000100000006402162239473010000104003840038400384003840227

Test 4: Latency 1->3

Code:

  fmla v0.4s, v1.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204400373000613940725101001001000010010000500570690804001804003740037381157387411010020410008200300244003740037111020110099100100100001000071001161139479100001004003840038400384003840038
102044003730007103940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840081400384003840038
102044003730006139407251010010010000100100005775706908040018040037400373810833874510100200100002003000040037400371110201100991001001000010018371001161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038
102044003729901053940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038
102044003730007263940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038
1020440037300028663940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038
10204400373000613940725101001001000010010000500570690804001804003740037381083387451010020010000200300004003740037111020110099100100100001000071001161139479100001004003840038400384003840038

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002440037300061394072510010101000010100005057069084001840037400373813033878910010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400372992761394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440113300061394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400373000145394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
10024400373000631394074410010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038
1002440037300061394072510010101000010100005057069084001840037400373813033876710010201000020300004003740037111002110910101000010000640216223947310000104003840038400384003840038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmla v0.4s, v8.4s, v9.4s
  movi v1.16b, 0
  fmla v1.4s, v8.4s, v9.4s
  movi v2.16b, 0
  fmla v2.4s, v8.4s, v9.4s
  movi v3.16b, 0
  fmla v3.4s, v8.4s, v9.4s
  movi v4.16b, 0
  fmla v4.4s, v8.4s, v9.4s
  movi v5.16b, 0
  fmla v5.4s, v8.4s, v9.4s
  movi v6.16b, 0
  fmla v6.4s, v8.4s, v9.4s
  movi v7.16b, 0
  fmla v7.4s, v8.4s, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)acbranch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020420091150040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011641643200621600001002006620066200662006620066
16020420065151040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011531633200621600001002006620066200662006620066
16020420065150040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011631643200621600001002006620066200662006620066
16020420065150040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011531633200621600001002006620066200662006620066
16020420065150040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011531623200621600001002006620066200662006620066
16020420065151340258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000021011631633200621600001002006620066200662006620066
16020420065150040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011631633200621600001002006620066200662006620066
16020420065150061258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011231633200621600001002006620066200662006620066
16020420065150040258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000001011531633200621600001002006620066200662006620066
16020420065150063258010010080000100800005006400000200462006520065323801002008000020024000020065200651116020110099100100160000100000201011631633200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)0918191e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242007615000000240462780012128000012800006264000011020033200522005232380012208000020240000200522005211160021109101016000010001003632232344221220200582402160000102006220062200622006220062
160024200611510000000522980012128000012800006264000001520033200612005232380012208000020240000200612006111160021109101016000010001004384113252111719200492201160000102005320053200532005320053
160024200521500000000522980012128000012800006264000001520033201352005232380012208000020240000200522005211160021109101016000010001003484111252111311201162201160000102005320053200532005320053
160024200521500100000522980012128000012800006264000011520033200522006132380012208000020240000200612006111160021109101016000010131004085211344221016201372402160000102006220062200622006220062
16002420086151011113688522980012128000012800006264000001520042200612006132380012208000020240000200612006111160021109101016000010001004185111344221114200492401160000102006220062200532005320062
1600242006115000000004627800121280000128000062640000115200332006120052323800122080000202400002006120061111600211091010160000100010039115213344221219200582402160000102006220062200622006220062
1600242006115000000006162780012128000012800006264000001520033200612006132380012208000020240000200612006111160021109101016000010001003685213344221313200492401160000102006220053200532005320053
1600242005215000000180462780012128000012800006264000011520033200612005232380012208000020240000200612006111160021109101016000010001004086217344211311200582402160000102006220062200622006220062
160024200611500000000462980012128000012800006264000011520033200522005232380012208000020240000200522005211160021109101016000010001003685114252111411200492201160000102005320053200532005320053
16002420052150000002704627800121280000128000062640000115200332005220052323800122080000202400002005220061111600211091010160000100010039116214254221411200582402160000102006220062200622006220062

Test 6: throughput

Count: 16

Code:

  fmla v0.4s, v16.4s, v17.4s
  fmla v1.4s, v16.4s, v17.4s
  fmla v2.4s, v16.4s, v17.4s
  fmla v3.4s, v16.4s, v17.4s
  fmla v4.4s, v16.4s, v17.4s
  fmla v5.4s, v16.4s, v17.4s
  fmla v6.4s, v16.4s, v17.4s
  fmla v7.4s, v16.4s, v17.4s
  fmla v8.4s, v16.4s, v17.4s
  fmla v9.4s, v16.4s, v17.4s
  fmla v10.4s, v16.4s, v17.4s
  fmla v11.4s, v16.4s, v17.4s
  fmla v12.4s, v16.4s, v17.4s
  fmla v13.4s, v16.4s, v17.4s
  fmla v14.4s, v16.4s, v17.4s
  fmla v15.4s, v16.4s, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03091e373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204410763002017310251601431001600641001600195005705933040021411904228521169619991160119200160024200480072422854004011160201100991001001600001000001111011811600400371600001004228640041411914120641191
160204400403090043314486251601081001600801001600195005715386040021400404004021115619991160123200160024200480072412054004011160201100991001001600001006001111011801600412061600001004120640041411914120642286
1602044004030802422614541251601001001600011001600005001280000040021400404004021111321163160100200160000200480000400434004011160201100991001001600001000000001011011611411871600001004004141191412064119141206
16020441205300000614485251601221001600221001600005001319999042266400404004019973321163160100200160000200480000400404004011160201100991001001600001000000001011011611411871600001004004140041400414004141206
160204400403000043420251601001001600001001600005005715261041185411904120519973321148160100200160000200480000400404004011160201100991001001600001000000001011011611400371600001004004140041400414119141206
160204412053090043614485251601221001600221001600005005705774040021400404004021095319998160100200160000200480000411904120511160201100991001001600001000000001011011611412021600001004004140041411914228640041
160204411903170017610251601431001600001001600005001280000041171400404004019973321148160100200160000200480000400404004011160201100991001001600001000300001011011621400371600001004228640041400414004140041
16020440040300000420251601001001600171001600005005705774041171400404004019973319998160100200160000200480000400404004011160201100991001001600001000000001011011611400371600001004004140041411914120640041
16020440040300000420521601001001600171001600005005715261040021400434004019973321148160100200160000200480000411904120911160201100991001001600001000000001011011611400371600001004120641191400424004140041
160204400403000270610251601001001600001001600005001280000041186412094004021111319998160100200160000200480000400404004011160201100991001001600001000000001011011611400371600001004004140041411914120640041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)accfd0d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)dfe0? int output thing (e9)eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
160024400623000000000000674589251600101016005110160000505728047110400214004040040199960321206160010201600002048000042318400401116002110910101600001000008110026112107164113434003704027160000104231940041400414004140041
16002441226308000000000175402516001110160000101600005057280471154120741226412262114103212061600102016000020480000423184004011160021109101016000010000001002511310416211333423150208160000104004142319400414004340041
1600244004030000000000016702516007010160002101600005012800001154226240040423182224403200201600102016000020480000400404122611160021109101016000010000031002611410316211433412020207160000104122741227412274122741227
1600244228530000000000060470251600701016000110160000505868333115412004231840040199960322261160010201600002048000040041412191116002110910101600001000001021002611410316211683400400207160000104004140041400434004140041
1600244004031700000000060680251600101016000110160000501280000115412074004041226199960321199160010201600002048000041205412191116002110910101600001000001201002611510416211443412020208160000104004142319400414004441220
160024400403160100000000732457644160070101600171016000050572804711541200400404122619996032002016001020160000204800004004240040111600211091010160000100000181002611510316211443423150207160000104122741227412274122741206
160024400403170000000000670251600281016000010160000505868333115422994004042318222440320020160010201600002048000041226400401116002110910101600001000001261002511510316211443400370208160000104004142319400414231940041
16002440040300000000000048025160028101600001016000050128000011542299400404004019996032002016001020160000204800004004042318111600211091010160000100001871002511510516241553412160208160000104004140043423024004140041
160024400402990001000000498970251600631016000110160000505868333115400234004042318199960320020160010201600002048000042318400401116002110910101600001000001831002611510316211423423150208160000104004142319400414004340041
16002440040317000000000067884425160028101600001016000050571065411540021400404122619996032120616001020160000204800004230240043111600211091010160000100000871002611510516211553423150208160000104004340042400414004140041