Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UMLSL2 (by element, 4S)

Test 1: uops

Code:

  umlsl2 v0.4s, v1.8h, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004303723014725482510001000100039831303018303730372415328951000100030003037303711100110000073116222630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372396125482510001000100039831313018303730372415328951000100030003037303711100110000073216222630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073216212630100030383038303830383038
1004303723940125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
1004303723426125482510001000100039831303054303730372415328951000100030003037303711100110000073216122630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100030003037303711100110000073216112630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100030003037303711100110000073216212630100030383038303830383038
10043037221514925482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073116222630100030383038303830383038

Test 2: Latency 1->1

Code:

  umlsl2 v0.4s, v1.8h, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372240014529548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071013162229634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012163229634100001003003830038300383003830038
1020430037225606129548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037224006129548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372250017729548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000671012162229634100001003003830038300383003830038
10204300372250110329548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372250042429548251010010010000100100005004277313130018330037300372826532874510100200100002003000030037300371110201100991001001000010000071013162229634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722501147295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001005000064271610929630010000103003830038300383003830038
100243003722511822954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000642101671029630010000103003830038300383003830038
10024300372251261295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000000064410168829690010000103003830038300383003830038
10024300372251161295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000000064291610929630010000103003830038300383003830038
1002430037225116129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000006426167729630010000103003830038300383003830038
10024300372251158229548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100000006429166829630010000103003830038300383003830038
100243003722511172295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000000064210169929630010000103003830038300383003830038
100243003722511632954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000642816101029630010000103003830038300383003830038
10024300372251161295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000000064491691029630010000103003830038300383003830038
1002430037224116129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006429168629630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  umlsl2 v0.4s, v0.8h, v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500000006129548251010010010000100100005004277313030018300373003728265328745101002121000020030000300373003711102011009910010010000100000000000710264112998937100001003046730327305143046830469
10204304642280007101332704328329467245102201551001613611043608428545503027030369303722828736288581118822611156226330453037030367811020110099100100100001002201121965300856186112966727100001003036730327302773037030371
10204303722261117494261665052944016210241156100881511163972042908070304143056030323283062728938116382181179323634992305623046710110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250010000612954825101001001000010010000500427731303001830037300372826532876310100200100002003000030037300371110201100991001001000010000002000071011611296340100001003003830038300383003830038
1020430037225000005460612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000004000071011611296340100001003003830038300383003830038
102043003722500000002322954825101001001000010010000500427731303001830037300372826532874510100208100002003000030037300371110201100991001001000010000000000071011611296340100001003022930038300383003830038
102043003722500010001032954825101001001000010010000500427731303001830132300372826532874510100200100002003000030037302251110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250000003521032954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000001030071011623296340100001003003830038300383003830038
10204300372250000000612954825101001231000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011612296340100001003003830079300383017930038
10204300372250000000612954825101001001000010010000500427731303016230037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500000025029548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402164229630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402163229630010000103003830038300383003830038
100243003722500000044129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
100243003722400000033229548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402163229630010000103003830038300383003830076
10024300372350000006129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018030037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
10024300372250000906129548251001010100001010000504277313030018330037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  umlsl2 v0.4s, v1.8h, v0.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
10204300372253306129548251010010010000100100005004277313130018300853003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
102043003722439906129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296700100001003003830038300383003830133
102043008422500756295482510100100100001001000050042773131300183003730037282723287451010020010000200300003003730037111020110099010010010000100000755133112963416100001003008630133301353008530133
1020430037225606129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
10204300372251806129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
102043003722592646129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
10204300372245406129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
102043003722530053629548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009901001001000010000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006403162229630010000103003830038300383003830038
100243003722515061295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001006402162229700010000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630210000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038
10024300372250061295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038
10024300372240061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001006402162229630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  umlsl2 v0.4s, v8.8h, v9.h[1]
  movi v1.16b, 0
  umlsl2 v1.4s, v8.8h, v9.h[1]
  movi v2.16b, 0
  umlsl2 v2.4s, v8.8h, v9.h[1]
  movi v3.16b, 0
  umlsl2 v3.4s, v8.8h, v9.h[1]
  movi v4.16b, 0
  umlsl2 v4.4s, v8.8h, v9.h[1]
  movi v5.16b, 0
  umlsl2 v5.4s, v8.8h, v9.h[1]
  movi v6.16b, 0
  umlsl2 v6.4s, v8.8h, v9.h[1]
  movi v7.16b, 0
  umlsl2 v7.4s, v8.8h, v9.h[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204200781500392580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000001011131611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
16020420064150534392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
1602042006415018392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
16020420064150318392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
1602042006415015392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065
1602042006415024392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000001011111611200611600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420052150200420014482580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000010060138104722111263520045216160000102004920049200492004920049
1600242011915010015006382580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000010061138103922111373820045216160000102004920049200492004920049
160024200481501000004702580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000210049138102522111402720045216160000102004920049200492004920049
160024200481501001201742580012128000012800006264000011102002920048200483228001220800002024000020052200521116002110910101600001000000010065169202626322382620049231160000102005320053200532005320053
16002420052150100000742580012128000012800006264000001102003320052200523228001220800002024000020052200521116002110910101600001000000010057169202426322314120049231160000102005320053200532005320053
1600242005215010047101742580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000010058138103922111383220045216160000102004920049200492004920049
1600242005215010060015492580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000010064138114022111303820045216160000102004920049200492004920049
16002420048150200000682580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000010060138103822111393920045216160000102004920049200492004920049
1600242004815010000028792580012128000012800006264000011102002920048200483228001220800002024000020048200481116002110910101600001000000010061169104022111403920045216160000102004920049200492004920049
16002420048150100000682580012128000012800006264000011102002920048200483228001220800002024000020048200521116002110910101600001000000010060168202522111383820045216160000102004920049200492004920049

Test 6: throughput

Count: 12

Code:

  umlsl2 v0.4s, v12.8h, v13.h[1]
  umlsl2 v1.4s, v12.8h, v13.h[1]
  umlsl2 v2.4s, v12.8h, v13.h[1]
  umlsl2 v3.4s, v12.8h, v13.h[1]
  umlsl2 v4.4s, v12.8h, v13.h[1]
  umlsl2 v5.4s, v12.8h, v13.h[1]
  umlsl2 v6.4s, v12.8h, v13.h[1]
  umlsl2 v7.4s, v12.8h, v13.h[1]
  umlsl2 v8.4s, v12.8h, v13.h[1]
  umlsl2 v9.4s, v12.8h, v13.h[1]
  umlsl2 v10.4s, v12.8h, v13.h[1]
  umlsl2 v11.4s, v12.8h, v13.h[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 37 | 3f | 4e | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
12020430059225000041671302512010110012000110012000050042836220317293174830039149733149971201002001200002003600003174830039111202011009910010012000010000000761021611300361200001003174930040300403004030040
1202043003923800004100251201181001200181001200005009900000300203003931748166533149971201002001200002003600003003930040111202011009910010012000010000000761011611300361200001003004030040300403174930040
120204300392250001872600251201001001200001001200005009600000317293174831748149733149971201002001200002003600003003930039111202011009910010012000010000600761011611317451200001003004031749300403174930040
12020430039238000010367130251201181001200181001200005009600000317293004030039149733149971201002001200002003600003174830039111202011009910010012000010000000761011611300361200001003174930040300403174930040
1202043174822500016100251201001001200001001200005009600000317293004030039149733149971201002001200002003600003003931748111202011009910010012000010000000761011611300371200001003174930040317493004031749
1202043174822500004100251201001001200001001200005009900000300203003931748149733149971201002001200002003600003004030039111202011009910010012000010000000761011611317451200001003004031749300403174930041
1202043004023800004167130251201181001200181001200005009600000300203003931748166533167061201002001200002003600003174830039111202011009910010012000010000000761011611317451200001003004031749300403174930040
12020430039225000070634050251201001001200001001200005009600000317293003931748166533167061201002001200002003600003174830039111202011009910010012000010000000761011611317451200001003174930040317493004030040
1202043003922500006100251201001001200001001200005009900000317293004030039149733149971201002001200002003600003003930040111202011009910010012000010000000761011611317451200001003004031749300403004130040
1202043003923800006167130251201011001200181001200005009600001300203003931748166533149971201002001200002003600003003930039111202011009910010012000010000000761011611317451200001003004031749300403174930040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
120024300482250006903507302512001110120000101200005096000001300203003930039149960315019120010201200002036000030040300401112002110910101200001000000000752231113162111517300360155120000103004030040300403004030040
1200243003922500000004602512001010120000101200005096000011300203003930039149960315019120010201200002036000030039300391112002110910101200001000000000752231114162111516300360155120000103004030040300403004030041
120024300392240006000460251200101012000010120000504399524113002030039300401499603150191200102012000020360000300393003911120021109101012000010000071200752231113162111315300360165120000103004030040300403004030040
12002430039231000150004602512001010120000101200005096000011300203095130039149960315019120010201200002036042030039300391112002110910101200001000000000752231113161111313300360165120000103004030040300403004030040
120024300392250002700028702512020610120000101200005096000011305923004130951149960815019120010201200002036000030039300391112002110910101200001000000000752231113161111314300360157120000103004030040300403004030040
120024300392250001500067341025120010101200001012000050428391511300203003930039149960315019120010201200002036000030040300391112002110910101200001000000000752231115162111716300360155120000103004030040300403004030041
12002430039226000240004602512001010120000101200005096000011309233004230039149960315019120010201200002036000030039300391112002110910101200001000000000752231114162111411300360155120000103004030040300403004030943
12002430039232000120104602512001010120000101200005096000011300203003930039149960315019120010201200002036000030039300391112002110910101200001000000000752231114162111310300360165120000103004030040300423092330040
1200243003922500000004602512001010120000101200005096000011300203003930039149960315019120010201200002036000030039300391112002110910101200001000000000752231117162111516300360165120000103004030943300403004030040
1200243003922500090006702512001010120000101200005096000011300203003930040149960315019120010201200002036000030039300401112002110910101200001000000000752231114162111414300360165120000103004130041300403094330040