Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UMLAL (vector, 2D)

Test 1: uops

Code:

  umlal v0.2d, v1.2s, v2.2s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100430372206125482510001000100039831313018303730372415328951000100030003037303711100110000073216112630100030383038303830383038
100430372308225482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372206125482510001000100039831303018303730372415328951000100030003037303711100110001073116112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372296125482510001000100039831313018303730372415328951000116830003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372366125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  umlal v0.2d, v1.2s, v2.2s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500000010329548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121623296340100001003003830038300383003830038
102043003722500000014729548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121623296340100001003003830038300383003830038
102043003722500000044029548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121622296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121623296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121623296340100001003003830038300383003830038
10204300372250000016129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121623296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121623296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100030710121622296340100001003003830038300383003830038
102043003722500000034629548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000710121622296340100001003003830038300383003830038
10204300372240000006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100002754123222297063100001003013230133301343013430085

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000000027829548251001010100001010000504277313130018300373003728287328767100102010162203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037224000000012429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
10024300372240000120041929548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
100243003722500000007129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000019129548251001010100001010000504277313130018300373003728287728767100102010000203000030037300371110021109101010000100000000006402163229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000012429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000042329548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000030006402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  umlal v0.2d, v0.2s, v1.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722502332954844101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722501242954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722501242954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010040071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
10204300372250612954825101001001000010010000500427731330018030037300372826532874510100200100002003000030037300371110201100991001001000010000171011611296345100001003003830038300383003830038
10204300372250612954825101001001000010010000627427731330018030084300372826532874510100204100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250003092954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162329630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162329630010000103003830038300383003830038
10024300372250003992954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010200006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162329630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
10024300372250007262954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162329630010000103003830038300383003830038
100243003722501201702954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162329630010000103003830038300383003830038
10024300372250013802954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010001006402242229738010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
1002430037225100612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  umlal v0.2d, v1.2s, v0.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250061295482510100100100001001000050042773130300183003730037282653287451010020010167200300003003730037111020110099100100100001000003071011611296340100001003003830086300383003830038
102043003722500612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710116112963428100001003003830038300383003830038
102043003722506961295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
102043003722400145295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250061295482510100100100001001014950042773130300183003730037282653287451010020010182200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372240061295482510100100100001001000050042773130300183003730037282653287451010020010000204300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000040071011611296340100001003003830038300383003830086
10204300372240061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164429630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164429630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000101000006403163429630010000103003830038300383003830038
10024300372240251295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000024006404164429630010000103003830038300383003830038
1002430037225972629548251001010100001010000504277313030018300373003728287328767100102010000203050130037300371110021109101010000100000006404164429630010000103003830038300383003830038
1002430037225072629548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164429630010000103003830038300383003830038
100243003722406129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006403163429630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000006404164429630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000006403164429630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164329630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  umlal v0.2d, v8.2s, v9.2s
  movi v1.16b, 0
  umlal v1.2d, v8.2s, v9.2s
  movi v2.16b, 0
  umlal v2.2d, v8.2s, v9.2s
  movi v3.16b, 0
  umlal v3.2d, v8.2s, v9.2s
  movi v4.16b, 0
  umlal v4.2d, v8.2s, v9.2s
  movi v5.16b, 0
  umlal v5.2d, v8.2s, v9.2s
  movi v6.16b, 0
  umlal v6.2d, v8.2s, v9.2s
  movi v7.16b, 0
  umlal v7.2d, v8.2s, v9.2s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420088150000392580100100800001008000050064000000200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
160204200641500021392580100100800001008000050064000000200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
160204200641500039622580100100800001008000050064000000200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
16020420064150000392580100100800001008000050064000001200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
16020420064150000392580100100800001008000050064000001200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
16020420064151000392580100100800001008000050064000000200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065201462006520065
16020420064151000392580100100800001008000050064000001200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
16020420064150000392580100100800001008000050064000000200452006420064322801002008000020024000020145200641116020110099100100160000100001011111611200611600001002006520065200652006520065
160204200641500030392580100100800001008000050064000001200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
16020420064150000392580100100800001008000050064000001200452006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200941500004527800121280000128000062640000110200322005120051322800122080000202400002005120051111600211091010160000100001004081114252111111200482201160000102005220052200522005220052
160024200511510004527800121280000128000062640916100200322005120051322800122080000202400002005120051111600211091010160000100061110030361725211157200482201160000102005220052200522005220052
16002420051150000452780012128000012800006264000010020032200512005132280012208000020240000200512005111160021109101016000010000100301311725211108200482201160000102005220052200522005220052
16002420051150000452780012128000012800006264000010020032200512005132280012208000020240000200512005111160021109101016000010000100303111125211117200482201160000102005220052200522005220052
16002420051150000452780012128000012800006264000010020032200512005132280012208000020240000200512005111160021109101016000010000100303511125211138200482202160000102005220052200522005220052
1600242005115000045278001212800001280000626400001010200322005120051322800122080000202400002005120051111600211091010160000100001003436111252111511200483201160000102005220052200522005220052
16002420051151000452780012128000012800006264000011020032200512005132280012208000020240000200512014011160021109101016000010000100341361825211117200482201160000102005220052200522005220052
160024200511500004527800121280000128000062640000111020032200512005132280012208000020240000200512005111160021109101016000010000100361311725211117200482201160000102005220052200522005220052
1600242005115000045278001212800001280000626400001110200322005120051322800122080000202400002005120051111600211091010160000100001003013611225211157200482201160000102005220052200522005220052
160024200511500006362780012128000012800006264000011102003220051200513228001220800002024000020051200511116002110910101600001000010038136111252111511200482201160000102005220052200522005220052

Test 6: throughput

Count: 16

Code:

  umlal v0.2d, v16.2s, v17.2s
  umlal v1.2d, v16.2s, v17.2s
  umlal v2.2d, v16.2s, v17.2s
  umlal v3.2d, v16.2s, v17.2s
  umlal v4.2d, v16.2s, v17.2s
  umlal v5.2d, v16.2s, v17.2s
  umlal v6.2d, v16.2s, v17.2s
  umlal v7.2d, v16.2s, v17.2s
  umlal v8.2d, v16.2s, v17.2s
  umlal v9.2d, v16.2s, v17.2s
  umlal v10.2d, v16.2s, v17.2s
  umlal v11.2d, v16.2s, v17.2s
  umlal v12.2d, v16.2s, v17.2s
  umlal v13.2d, v16.2s, v17.2s
  umlal v14.2d, v16.2s, v17.2s
  umlal v15.2d, v16.2s, v17.2s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602044004030000000900412516010110016000010016000050012800000400294003940040199733199981601002001600002004800004003940039111602011009910010016000010000000010110116114004501600001004004040040400404004040040
1602044003930000000300412516010010016000010016000050012800000400204003940039199733199971601002001600002004800004003940048111602011009910010016000010000000010110124114003601600001004004040040400494004040040
1602044003930000000000412516010010016000010016000050013199971400204004840048199733199971601002001600002004800004003940039111602011009910010016000010000000010110116114003601600001004004040040400904004940040
1602044003930000000000412516010010016000010016000050012800001400204003940048199733199971601002001600002004800004004840048111602011009910010016000010000000010110116114003601600001004004040049400404004040040
1602044003930000000000412516010010016000110016000050012800001400304003940040199733199971601002001600002004800004004040039111602011009910010016000010000000010110116114003601600001004004040040400404004040040
1602044004830000000000412516010010016000010016000050013199971400294003940039199733200071601002001600002004800004003940039111602011009910010016000010000000010110116114003601600001004004040040400404004040040
1602044003930000000000412516010010016000010016000050012800001400204003940039199733199971601002001600002004800004003940039111602011009910010016000010000000010110116114003601600001004004040040400494004040040
1602044003930000000000422516010010016000010016000050023990270400204004840039199733199971601002001600002004800004004840039111602011009910010016000010000000010110116114003601600001004004040040400404004040040
1602044004830000000000412516011710016000010016000050012800001400204003940039199733199971601002001600002004800004003940039111602011009910010016000010000000010110116114003601600001004004040040400404004040040
1602044004030000000000412516010010016000010016000050023989990400204004840039199733199971601002001600002004800004003940039111602011009910010016000010000000010110116114003601600001004004040040400404004940040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | 1e | 1f | 37 | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600244004830030175525160010101600001016000050239899911400204003940039199963200191600102016000020480000400394003911160021109101016000010010022311181621117640036155160000104004040040400404004040040
16002440039300951004625160027101600171016000050239899911400204003940048199963200191600102016000020480000400394003911160021109101016000010010022311161621121740036155160000104004940040400494004040040
1600244004829935400472516001010160000101600005012800001140029400394003919996320028160010201600002048000040039400481116002110910101600001001002231161621161640045155160000104004040040400404004040040
1600244003930092700551161601951016001710160000501280000114002040039400481999632002816001020160000204800004004940039111600211091010160000100100223111616211161640036155160000104004040040400404004940040
1600244003930090901746251600101016000010160000501280000114002040039400391999632001916001020160000204800004003940048111600211091010160000100100223111616211161640036155160000104004040049400404004940040
16002440048300924005525160010101600001016000050128000011400294003940048199963200191600102016000020480000400484003911160021109101016000010010022311161621116640045155160000104004040049400404004940049
16002440039300000462516001010160000101600005012800001140029400394004820016320028160010201600002048000040039400391116002110910101600001001002231161621151640045155160000104004140049400494004040040
160024400393000006125160010101600001016000050239899901400294004840039199963200191600102016000020480000400394003911160021109101016000010010024622616422166400453010160000104004040040400494004040049
1600244004829971701752251600271016013210160000501280000014002040039400391999632001916001020160000204800004003940048111600211091010160000100100246221616422166400453010160000104004040049400404004940040
160024400393000005225160027101600171016000050128000001400204004840039199963200191600102016000020480000400394003911160021109101016000010010024622616422166400363010160000104004040049400404004940040