Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SQRDMLSH (by element, 8H)

Test 1: uops

Code:

  sqrdmlsh v0.8h, v1.8h, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10043037220006125482510001000100039831313018303730372415328951000116330003083303711100110000073216112630100030383038303830383038
10043037220006125482510001000100039831303018303730372415328951000100030003037308411100110000073116112630100030383038303830383038
100430372311010325482510001000100039831303018303730372415328951000100030003037303711100110000073116222630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037220006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100034833037303711100110000073116112630100030383038303830383038
10043037230008225482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372200025125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  sqrdmlsh v0.8h, v1.8h, v2.h[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225906129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071002162229634100001003003830038300383003830038
10204300372251806129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
1020430037225016129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
102043003722515061295482510100100100001001000050042773133001830037300372826522287451010020010000200300003003730037111020110099100100100001000071012162329634100001003003830038300383003830038
10204300372256606129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
10204300372255706129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
10204300372251806129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
102043003722532706129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
10204300372251506129548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071013162229634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225025861295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037225011461295484410010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037225033961295482510010101000010100005042773131300183003730037282873287671031022100002030000300373003711100211091010100001000640216432966710000103003830038300383003830038
10024300372251061295482510010101000010100005042773131300183003730037282873287671015920100002030000300373003721100211091010100001000640216222963010000103003830038300383003830038
10024300372250061295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722509661295482510010101000010100005042773131300183003730037282873287671001020100002030000300373008011100211091010100001000640216222963010000103003830038300383003830038
1002430037225010861295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037224031561295482510010101000010100005042773131300183003730037282877287861001020100002030000300373003711100211091010100001019640216222963010000103003830038300383003830038
1002430037225033961295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722406361295482510010101000010100005042773131300183003730037282873287671001020100002030000300373003711100211091010100001000640316222963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  sqrdmlsh v0.8h, v0.8h, v1.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225078061295482510100100100001001000050042773131300183003730037282653287441012520010000200300003003730037111020110099100100100001000071021611296340100001003003830038300383003830038
1020430037225024061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225033061295482510100100100001001000062642773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225015061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037224018061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225033061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071021711296340100001003003830038300383003830038
1020430037224015061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000371211611296340100001003003830038300383003830038
1020430037225000726295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
10204300372250150251295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250216129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037225006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037224006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001020640216222963010000103003830038300383003830038
10024300372250246129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001010640224222963010000103003830038300383003830038
100243003722512826129548251001010100001010000504277313300183008430037282877287671016020101612031188301323003711100211091010100001000640216222963010000103003830038300383003830038
10024300372250276129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001003640216222963010000103003830038300383003830038
100243003722503816129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038
1002430037225006129548251001910100001210000714277313300183003730037282873287671001020100002030984300373003731100211091010100001000640216222963010000103003830038300383003830038
10024300372250310329548251001010100001010000504277313300183003730037282873288031001020100002030000300373003711100211091010100001000640216222963010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  sqrdmlsh v0.8h, v1.8h, v0.h[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225061295482510100100100001001000050042773133001830037300372827262874110100200100082003002430037300371110201100991001001000010001117180160029646100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372827262874110100200100082003002430037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133005430037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133012630037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000006061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006403162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
10024300372250000027061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037224000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730179282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037224000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000061295482510010101000010100005042773130300183003730037282870328767100102010000203000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sqrdmlsh v0.8h, v8.8h, v9.h[1]
  movi v1.16b, 0
  sqrdmlsh v1.8h, v8.8h, v9.h[1]
  movi v2.16b, 0
  sqrdmlsh v2.8h, v8.8h, v9.h[1]
  movi v3.16b, 0
  sqrdmlsh v3.8h, v8.8h, v9.h[1]
  movi v4.16b, 0
  sqrdmlsh v4.8h, v8.8h, v9.h[1]
  movi v5.16b, 0
  sqrdmlsh v5.8h, v8.8h, v9.h[1]
  movi v6.16b, 0
  sqrdmlsh v6.8h, v8.8h, v9.h[1]
  movi v7.16b, 0
  sqrdmlsh v7.8h, v8.8h, v9.h[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204200891500000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000110111616112006101600001002006520065200652006520065
160204200641500000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641500000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641510000039258010010080000100800005006400001200452006420064322802322008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
1602042006415000000514258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641500000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641510000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641500000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641510000039258010010080000108800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065
160204200641500000039258010010080000100800005006400001200452006420064322801002008000020024000020064200641116020110099100100160000100000000010111116112006101600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420102150100270621278001212800001280000626400001152004120060200513228001220800002024000020051200511116002110910101600001001004282127254111818200482201160000102005220052200522005220052
160024200511501100057278001212800001280000626400001152004120051200513228001220800002024000020051200511116002110910101600001031004182119252111420200482201160000102005220052200522005220052
1600242005115011012051278001212800001280000626400000152003220051200513228001220800002024000020051200511116002110910101600001001004082119252111918200482201160000102005220052200522005220052
16002420051150111004567800121280000128010462640000115200322005120051322800122080000202400002005120051111600211091010160000105431004382118252111818200482201160000102005220052200522005220052
1600242005115010012057278001212800001280000626400001152003220051200513228001220800002024000020051200511116002110910101600001001004082117252111819200482201160000102005220052200522005220052
160024200511501100051278001212800001280000626400001152003220051200513228001220800002024000020051200511116002110910101600001001004382121252112021200482201160000102005220052200522005220052
160024200511501100045278001212800001280000626400001152003220051200513228001220800002024000020051200511116002110910101600001001003582120252111919200482201160000102005220052200522005220052
160024200511501100068278001212800001280000626400001152003220051200513228011820800002024000020051200511116002110910101600001001004282119252111918200482201160000102005220052200522005220052
160024200511500100045278001212800001280000626400001152003220051200513228001220800002024000020060200601116002110910101600001001004182120252111817200482201160000102005220052200522005220052
160024200511501100057278001212800001280000626400001152003220051200513228001220800002024000020051200511116002110910101600001001003982112252111617200482201160000102005220052200522005220052

Test 6: throughput

Count: 12

Code:

  sqrdmlsh v0.8h, v12.8h, v13.h[1]
  sqrdmlsh v1.8h, v12.8h, v13.h[1]
  sqrdmlsh v2.8h, v12.8h, v13.h[1]
  sqrdmlsh v3.8h, v12.8h, v13.h[1]
  sqrdmlsh v4.8h, v12.8h, v13.h[1]
  sqrdmlsh v5.8h, v12.8h, v13.h[1]
  sqrdmlsh v6.8h, v12.8h, v13.h[1]
  sqrdmlsh v7.8h, v12.8h, v13.h[1]
  sqrdmlsh v8.8h, v12.8h, v13.h[1]
  sqrdmlsh v9.8h, v12.8h, v13.h[1]
  sqrdmlsh v10.8h, v12.8h, v13.h[1]
  sqrdmlsh v11.8h, v12.8h, v13.h[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1202043044022500616713251201001001200001001200005004399524130020031748300391497331499712010020012000020036000030039300401112020110099100100120000100000761011611300361200001003174930040317493004030041
1202043174822500440251201001001200001001200005004399524130020030039317481665331499812010020012000020036000031748300391112020110099100100120000100000762711611300361200001003004030040300403004031749
120204317482250041025120118100120001100120000500960000130020030039317481497331670612010020012000020036000031748300391112020110099100100120000100000761011611300361200001003174930040317493004031749
1202043174822501861025120100100120000100120000500960000130021030039317481665331499812010020012000020036000031748300391112020110099100100120000100000761011611300361200001003004031749300403004130040
120204300392250041671325120100100120001100120000500960000130020030039317481665331670612010020012000020036000031748300391112020110099100100120000100000761011611300371200001003004031749300403174930040
1202043003922400410251201001001200001001200005004399524131729031748300391497331499712010020012000020036000030039317481112020110099100100120000100000761011611317451200001003004030041300403174930040
1202043003922501841025120118100120001100120000500960000130020030039300401665331499812010020012000020036000030039317481112020110099100100120000100000761011611300361200001003174930040300413004031749
1202043174822500726025120100100120000100120000500990000130020030039317481497331499712010020012000020036000030039317481112020110099100100120000100000761011611317451200001003004031749300403004130040
120204300392250141025120101100120001100120000500960000131729030039300421497331499712010020012000020036000030039317481112020110099100100120000100000761011611317451200001003004030041300403174930040
12020430039225018610251201001001200001001200005004399524130020030039300401665331670612010020012000020036000030039300391112020110099100100120000100000761011611317451200001003004030040300403004130040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
120024300392250000000400251200101012000010120000509600000130020300393003914996315019120010201200002036000030039300391112002110910101200001000000002752021655300360120000103004030040300403004030040
120024300392250000000400251200101012000010120000509600000130020300393003914996315019120010201200002036000030039300391112002110910101200001000000000752051654300360120000103004030041300403004030040
120024300392250000000610251200451012000010120000509600000130021300393003914996315019120010201200002036000030039300391112002110910101200001000000000752041655300360120000103004030040300403004030041
120024300392250000000400251200101012000010120000509600000130020300393003914996315019120010201200002036000030039300391112002110910101200001000000000752031644300360120000103004030040300403004030040
120024300392250000000400251200101012000010120000509600001130020300393003914996315019120010201200002036000030039300391112002110910101200001000000000752021633300360120000103004030040300403004030040
120024300392250000000400251200101012000010120000509600000130020300393003914996315019120010201200002036000030039300391112002110910101200001000000000752041644300360120000103004030040317493004030043
120024300392240000000400251200101012000010120000509600000131731300393003914996315019120010201200002036000030039300391112002110910101200001000000000752051634300360120000103004030040300403004030040
120024300392250000000400251200101012000010120000509600000130021300393003914996315019120010201200002036000030039300391112002110910101200001000000000752041644300360120000103004030040300403004030040
120024300392250000000400251200101012000010120000509600000130020300393003914996315019120010201200002036000030039309221112002110910101200001000000000752031694300360120000103004030040300403004030040
120024300392250000000400251200101012000010120000509600000130932300393003914996315019120010201200002036000030039300391112002110910101200001000000000752041655300360120000103004030040300403004030040