Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SMLAL2 (vector, 8H)

Test 1: uops

Code:

  smlal2 v0.8h, v1.16b, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03mmu table walk data (08)191e1f3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)ld unit uop (a6)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
1004303722000061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303723000061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303722000061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
100430372300111061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303723000061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303723000061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303723000061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
10043037220015061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303723000061254825100010001000398313130183037303724153289510001000300030373037111001100000073116112630100030383038303830383038
1004303722000061254825100010001000398313030183037303724153289510001000300030373037111001100000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  smlal2 v0.8h, v1.16b, v2.16b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372251200612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037225300612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
102043003722525200612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030085300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372253300612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372251800612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037225000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000075712162229634100001003003830038300383003830038
10204300372253900612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372251800612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162329634100001003003830038300383003830038
10204300372252700612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372251200612954825101001001000010010000555427731303001830037300372826532874510100214100002003000030037300371110201100991001001000010002071212162229634100001003003830038300383018230038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006403162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830131301792828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010007262162229630010000103003830038300383003830038
1002430037224000612954825100101010000101044750427731313001830037300372828732876710010201000020300003003730037111002110910101000010006403163229630010000103003830038301333008530038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
10024300372250120612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
1002430037225000612954825100101010000101000050428138413001830037300372828732876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  smlal2 v0.8h, v0.16b, v1.16b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03181e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372255061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372240061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372240061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250082295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250061295482510100100100071001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100243003722500061295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
100243003722500061295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
100243003722500061295482510010101000010100005042773130300180300373003728287328767100102010000203000030037300371110021109101010000100001640316332963010000103003830038300383003830038
100243003722500061295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332970010000103003830038300383003830038
100243003722400061295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
100243003722400082295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
100243003722500061295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
100243003722500084295482510010101000010100005042773130300180300373003728287328767100102010000203000030037300371110021109101010000100100640316332963010000103003830038300383003830038
1002430037224000807295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100000640316332966810000103003830038300383003830038
100243003722500061295482510010101000010100005042773131300180300373003728287328767100102010000203000030037300371110021109101010000100100640316332963010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  smlal2 v0.8h, v1.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)1e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372250006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100016271011611296340100001003007230038300383003830038
102043003722500061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730070111020110099100100100001000071011611296340100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100016271042511296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730070111020110099100100100001000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100015071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001051000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
10024300372250576329548251001012100001210000504277313130018300373003728287032876710012201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010336402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010096402162229630010000103003830038300383003830038
100243003722406129548251001010100001010000604277313130018300373003728287032876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010066402162229630010000103003830038300383003830038
100243003722506129548251001010100001010000504277313130018300373003728287032876710010201000020304833008430084111002110910101000010006406162229658010000103003830038300383003830038
1002430037225021829548251001010100001010000504277313130018300373003728287032876710010201000020300003003730037111002110910101000010006402162229630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  smlal2 v0.8h, v8.16b, v9.16b
  movi v1.16b, 0
  smlal2 v1.8h, v8.16b, v9.16b
  movi v2.16b, 0
  smlal2 v2.8h, v8.16b, v9.16b
  movi v3.16b, 0
  smlal2 v3.8h, v8.16b, v9.16b
  movi v4.16b, 0
  smlal2 v4.8h, v8.16b, v9.16b
  movi v5.16b, 0
  smlal2 v5.8h, v8.16b, v9.16b
  movi v6.16b, 0
  smlal2 v6.8h, v8.16b, v9.16b
  movi v7.16b, 0
  smlal2 v7.8h, v8.16b, v9.16b
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)03mmu table walk data (08)1e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fst unit uop (a7)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1602042008915000292680116100800161008002850064019612004520065200656128012820080028200240084200652006511160201100991001001600001000001111012021622200621600001002006620066200662006620066
1602042006515500292680116100800161008002850064019612004520064203863228010020080000200240000200642006411160201100991001001600001000000001011331632200611600001002016720065201552006520065
16020420064150003925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010005330001011321632200611600001002006520065200652006520065
16020420064150005142580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000000001011231633200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000000001011231623200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000030001011231633200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000000001011321633200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000030001011221623200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000000001011331623200611600001002006520065200652006520065
16020420064150003925801001008000010080000500640000020045200642006432280100200800002002400002006420064111602011009910010016000010004630001011231632200611600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2524

retire uop (01)cycle (02)03mmu table walk instruction (07)l2 tlb miss instruction (0a)1e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)a9accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)ea? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002420357151000051258001212800001280000626400001052002720050200503228001220800002024000020050200501116002110910101600001000001003262232244229720047230160000102005120047200512004720051
160024203121500000512580012128000012800006264000000520031200502005032280012208000020240000200502005011160021109101016000010000010036622620422151020047230160000102005120051200472005120051
160024202201500000512580012128000012800006264000000520031200502004632280012208000020240000200462004611160021109101016000010010961003182252042215820047230160000102005120051200512005120051
160024202521500000452580012128000012800006264000000520027200502004632280012208000020240000200502004611160021109101016000010000010032842724221101020047215160000102005120051200512004720051
160024201981500000452580012128000012800006264000001020031200502004632280012208000020240000200462005011160021109101016000010000721003763262042171420047215160000102005120051200512005120051
16002420214150000045258001212800001280000626400000002003120046200463228001220800002024000020050200461116002110910101600001000001003563192422191420047215160000102005120051200512005120051
16002420186150001505125800121280000128000062640000000200312004620046322800122080000202400002004620046111600211091010160000100000100346218204125620043215160000102005120047200512004720051
1600252021115000005125800121280000128000062640000010200312005020046322800122080000202400002004620050111600211091010160000100000100318219242118920047215160000102004720047200512005120051
16002420426150001204525800121280000128000062640000015200312004620050322800122080000202400002005020050111600211091010160000100000100398524242117820047230160000102005120051200472005120051
1600242019915000004525800121280000128000062640000010200312005020050322800122080000202400002005020050111600211091010160000100000100358427202216720043230160000102005120047200472005120051

Test 6: throughput

Count: 16

Code:

  smlal2 v0.8h, v16.16b, v17.16b
  smlal2 v1.8h, v16.16b, v17.16b
  smlal2 v2.8h, v16.16b, v17.16b
  smlal2 v3.8h, v16.16b, v17.16b
  smlal2 v4.8h, v16.16b, v17.16b
  smlal2 v5.8h, v16.16b, v17.16b
  smlal2 v6.8h, v16.16b, v17.16b
  smlal2 v7.8h, v16.16b, v17.16b
  smlal2 v8.8h, v16.16b, v17.16b
  smlal2 v9.8h, v16.16b, v17.16b
  smlal2 v10.8h, v16.16b, v17.16b
  smlal2 v11.8h, v16.16b, v17.16b
  smlal2 v12.8h, v16.16b, v17.16b
  smlal2 v13.8h, v16.16b, v17.16b
  smlal2 v14.8h, v16.16b, v17.16b
  smlal2 v15.8h, v16.16b, v17.16b
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)031e373f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020440059300001456251601001001600001001600005001280000140020400394003919973319998160100200160000200480000400394003911160201100991001001600001008001011021622400451600001004004040040400404004940040
1602044003930000412516010010016000010016000050012800001400204003940039199733199971601002001600002004800004003940039111602011009910010016000010011601011021622400361600001004004040040400404004040040
1602044003930000412516010110016000010016000050012800001400204003940039199733200061601002001600002004800004003940039111602011009910010016000010011901011021622400451600001004004940040400404004040049
160204400393130041251601001001600181131600005002438865140020400394003919973319997160100200160000200480000400394003911160201100991001001600001000601011031622400361600001004004040040400404004940040
1602044003930000412516011710016001810016000050012800001400304004940039199733199971601002001600002004800004003940039111602011009910010016000010013601011021622400361600001004004040040400404004040040
1602044003930000412516010010016000010016000050012800001400294003940039199733199971601002001600002004800004003940039111602011009910010016000010011901011021622400361600001004004040040400404004040050
160204400393000041251601001001600181001600005001280000140020400394003919973319997160100200160000200480000400394003911160201100991001001600001008601011021622400361600001004004040040400404004040040
160204400393000174125160100100160000100160000500128000014002040039400391997331999716010020016000020048000040039400391116020110099100100160000100111201011021622400361600001004004040040400404004940041
1602044003930000706251601001001600171001600005001280000140020400394003919973319997160100200160000200480000400484003911160201100991001001600001000601011021622400361600001004004040040400404004040040
1602044003930000412516010110016000110016000050023989991400204005140048199893199971601002001600002004800004003940039111602011009910010016000010011301011021624400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)031e373a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)st unit uop (a7)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
160024400483000170465316001010160001101600005023989991104002040048400391999632001916001020160000204800004004040048111600211091010160000100000010024134121162112020400362012160000104004940040400404004040040
1600244003929901704647160010101600001016000050128000011104002040039400391999632002816001020160000204800004003940039111600211091010160000100011180100221381816211101940036206160000104004140049400404004140040
1600244003930001046481600271016000011160000501280000111040020400394004019996320019160010201600002048000040048400391116002110910101600001000230100221381916211202140037206160000104004040049400504004940040
160024400393000004645160027101600011016000050239899911104002940048400481999632001916001020160000204800004003940040111600211091010160000100083010022138171621120940037209160000104004940040400404004040049
1600244003930000053481600101016000010160000501320000111040020400394003919996320019160010201600002048000040039400391116002110910101600001000990100221381191621119940036209160000104004140040400404004140040
1600244003929901052431600111016000010160000502398999110400214004040039199963200291600102016000020480000400404003911160021109101016000010004730100221381191621119740036206160000104004040040400414004040040
1600244004830000046461600101016001710160000502398999111040020400404003919996320020160010201600002048000040039400481116002110910101600001000090100221382201622171940036206160000104004040040400494004040049
1600244003930000046471600101016000013160000501280000111040029400394003919996320019160010201600002048000040039400481116002110910101600001000590010022138191622191940036207160000104004040040400404004040049
16002440039299010464616001110160017101600005012800001110400204004840039199963200281600102016000020480000400394003911160021109101016000010008601002213811916211191940045209160000104005040040400404004040050
1600244003930000061531600271016001710160000502399082011040020400404003920011320020160010201600002048000040039400391116002110910101600001000890100241381191621219740045206160000104004940040400494004040049