Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SQRDMLAH (by element, S)

Test 1: uops

Code:

  sqrdmlah s0, s1, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
10043037230612548251000100810003983131301830373037241532895100010003000303730371110011000000073216112630100030383038303830383038
10043037220612548251000100010003983130301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037220612548251000100010003983130301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
100430372201442548251000100010003983130301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037220612548251000100010003983131301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010003000303730371110011000000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  sqrdmlah s0, s1, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch mispred nonspec (cb)cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010030071014162229634100001003003830038300383003830038
10204300372250000612953025101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071013162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162329634100001003003830038300383003830038
10204300372240000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071212163229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372240000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
10204300372250000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010161223000030037300371110021109101010000100000640316432963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316432963010000103003830038300383003830038
1002430037224006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316432963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640317342963010000103003830038300383003830038
10024300372250246129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316332963010000103003830038300383003830038
1002430037224006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000640316342963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  sqrdmlah s0, s0, v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
102043003722500005100612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372240000300612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300853003830038
1020430037225000036300612954825101001001000010010000500427731313001830037300372826532874510100200100002043000030037300371110201100991001001000010000000000171011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038
102043003722500012100612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)0318191e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002430037225004561295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064005165529630010000103003830038300383003830038
100243003722500661295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000064005165529630010000103003830038300383003830038
1002530037225001261295484410010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064004165429630010000103003830038300383003830038
100243003722500061295482510010101000010100005042773130300183003730037282873287671001020100002030000300843008411100211091010100001000064004164529630010000103003830038300383003830038
1002430037225001261295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064005164529630010000103003830038300383003830038
1002430037225003961295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064005164529630010000103003830038300383003830038
100243003722500661295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064005165429630010000103003830038300383003830038
100243003722500061295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064005165529630010000103003830038300383003830038
100243003722500285145295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000664005165529630010000103003830038300383003830038
1002430037225002161295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000064004165529630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  sqrdmlah s0, s1, v0.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
102043003722533536295482510100100100001001000050042773130300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
10204300372249726295482510100100100001001000050042773130300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722563251295482510100100100001001000050042773130300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722551726295482510100100100001001000050042773131300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
10204300372252761295482510100100100001001000050042773131300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773130300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773131300183300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773130300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
10204300372252761295482510100100100001001000050042773131300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
1020430037225661295482510100100100001001000050042773130300180300373003728265328745101002001000020030000300373003711102011009910010010000100007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)1e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024300372250061295482510010101000010100005042773133001830037300372828732876710010201000020300003003730037111002110910101000010009640216222963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000129640216222963010000103003830038300383003830038
10024300372330088229548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000141640216222963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000132640216222963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000153640216222963010000103003830038300383003830038
1002430037225106129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000114640216222963010000103003830038300383003830038
100243003722500536295482510010101000010100005042773133001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372240061295482510010101000010100005042773133001830037300372828732876710010201000020300003003730037111002110910101000010009640216222963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000144640216222963010000103003830038300383003830038
10024300372250025129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000144640216222963010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sqrdmlah s0, s8, v9.s[1]
  movi v1.16b, 0
  sqrdmlah s1, s8, v9.s[1]
  movi v2.16b, 0
  sqrdmlah s2, s8, v9.s[1]
  movi v3.16b, 0
  sqrdmlah s3, s8, v9.s[1]
  movi v4.16b, 0
  sqrdmlah s4, s8, v9.s[1]
  movi v5.16b, 0
  sqrdmlah s5, s8, v9.s[1]
  movi v6.16b, 0
  sqrdmlah s6, s8, v9.s[1]
  movi v7.16b, 0
  sqrdmlah s7, s8, v9.s[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)03mmu table walk data (08)1e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1602042006515100812580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100031011111611200611600001002006520065200652006520065
160204200641500011182580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415200392580100100800001008000050064000002004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415010392580100100800001008000050064000012004502006420064322802282008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065
1602042006415000392580100100800001008000050064000012004502006420064322801002008000020024000020064200641116020110099100100160000100001011111611200611600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)03181e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9acbranch mispred nonspec (cb)cdcfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002420059150004527800121280000128000062643352112003220051200513228001220800002024000020051200511116002110910101600001010000100333111225211107200482201160000102005220273200652005220052
16002420051151004527800121280000128000062640000112003220051200513228001220800002024000020051200511116002110910101600001020000100303111025211128200482201160000102005220266200872005220052
160024200511500014027800121280000128000062640000112003220051200513228001220800002024000020051200511116002110910101600001000000100313117252111012200482201160000102005220238200652006120061
16002420060150005127800121280000128000062640000112003220051200513228001220800002024000020051200511116002110910101600001000000100353118252111112200482201160000102005220231200652005220052
16002420051150004527800121280000128000062640000112003220051200513228001220800002024000020051200511116002110910101600001000000100353128252121213200482201160000102005220234200892005220061
16002420051156104527800121280000128000062640000112003220051200513228001220800002024000020051200511116002110910101600001000300100353111025211128200482201160000102005220245200652005220052
16002420051150004527800121280000128000062640000112003220051200513228001220800002024000020051200511116002110910101600001000000100353118252111010200482402160000102006120246200652005220052
160024200511500045278001212800001280000626400001120032200512005132280012208000020240000200512005111160021109101016000010170300100356221034422107200572402160000102006120247200652005220052
1600242005115000452780012128000012800006264000011200322005120051322800122080000202400002005120051111600211091010160000100000010030311734211810200482201160000102005220242200652005220052
16002420051150006152780012128000012800006264000011200322005120051322800122080000202400002005120051111600211091010160000100027001003431110252111212200482201160000102005220233200872005220061

Test 6: throughput

Count: 12

Code:

  sqrdmlah s0, s12, v13.s[1]
  sqrdmlah s1, s12, v13.s[1]
  sqrdmlah s2, s12, v13.s[1]
  sqrdmlah s3, s12, v13.s[1]
  sqrdmlah s4, s12, v13.s[1]
  sqrdmlah s5, s12, v13.s[1]
  sqrdmlah s6, s12, v13.s[1]
  sqrdmlah s7, s12, v13.s[1]
  sqrdmlah s8, s12, v13.s[1]
  sqrdmlah s9, s12, v13.s[1]
  sqrdmlah s10, s12, v13.s[1]
  sqrdmlah s11, s12, v13.s[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
12020430852225101000000230002512010010012000010012000050099000013172930040300391497331499712010020012000020036000030039317481112020110099100100120000100000007614716593174501200001003092330040317493004031749
120204300392371010000002716713251201181001200181001200005009600001300203174830039149733149971201002001200002003600003003930039111202011009910010012000010000000761410167103174501200001003004030041300403174930040
120204300392251010000002516713251201181001200181001200005004399524130020300393003914973314997120100200120000200360000300403003911120201100991001001200001000000076149169103003601200001003174930040317493004031749
12020430040225101000000251671325120100100120000100120000500960000130021300393174816653815063120100200120000200360000300403003911120201100991001001200001000000076144161093174501200001003004030040300403255930040
1202043003923810100000021130251201001001200001001200005009900001300233174830039149733149971201002001200002003600003174830039111202011009910010012000010000000761441610103174501200001003004031749300403174930040
1202043003922510100000182239025120100100120001100120000500960000130020300393003916653314998120100200120000200360000300423003911120201100991001001200001000000076149161093003601200001003004030040300403174930041
12020431748225101000001828520251201001001200001001200005009600001300203003931748149733149971201002001200002003600003003931748111202011009910010012000010000000761410161073003601200001003004031749300403174931749
120204300392381010000002720251201001001200001001200005009600001300203003930042166533167061201002001200002003600003174830039111202011009910010012000010000000761491610103003601200001003174930040300403174930923
1202043004022510100000127167132512011810012001810012000050099000013002030039317481665331670612010020012000020036000030039317481112020110099100100120000100000007614101610103003601200001003004130040300413004031749
12020430039237101000001827167132512010010012000110012000050096000013002030039317481497331670612010020012000020036000030040300391112020110099100100120000100000007614916493003601200001003004031749300403174930040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)181e1f373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)acc2cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
1200243003922500000004740251200101012000010120000509600001153002030039300391499631501912001020120000203600003003930039111200211091010120000100000075228215162114530036155120000103004030040300403004030040
120024300392250000000880251200101012000010120000509600001153002030039300391499631501912001020120000203600003003930039111200211091010120000100000075228214162114430036155120000103092330040300403004030040
12002430039224000000084902512001010120000101200005042834000153002030039300391499631501912001020120000203600003003930039111200211091010120000100000075228214162114430036475120000103004030040300403004030040
1200243003922500000006720251200101012000010120000509600000153002030039300391499631501912001020120000203600003003930039111200211091010120000100000075228213162114530036155120000103004030040300403004030040
1200243003922500000009650251200101012000010120000509600001153002030039300391499631501912001020120000203600003092230039111200211091010120000100000075228214162114430036155120000103004030040300403004030040
1200243003922500000009020251200101012000010120000509600001153002030039300391499631501912001020120000203600003003930039111200211091010120000100000075228214162114430036155120000103004030040300403004030040
12002430039225000000084302512001010120000101200005096000011530020300393003914996315019120010201200002036000030039300391112002110910101200001000000752411314164223530036155120000103004030040300403004030040
1200243003922500000009202512001010120000101200005096000011530020300393003914996315019120010201200002036000030039300391112002110910101200001000000752283131621234300361510120000103004030040300403004030040
1200243003922400000004680251200101012000010120000509600001153002030039300391499631501912001020120000203600003003930922111200211091010120000100000075228214162114430036155120000103004030040300403004030040
1200243003922500000001780251200101012000010120000509600001153002030039300391499631501912001020120000203600003003930039111200211091010120000100000075228223162113430036155120000103004030040300403004030040