Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

RADDHN2 (2D)

Test 1: uops

Code:

  raddhn2 v0.4s, v1.2d, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)0309191e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
10043037220006125482510001000100039831313018303730372415328951000100030003037303711100110000073216112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110004073116112701100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110001973116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110001373116112630100030383038303830383038
100430372300061254825100010001000398313130183037303724153289510001000300030373037111001100002773116112630100030383038303830383038
10043037230006125482510001000100039831313018303730372415328951000100030003037303711100110006073116112630100030383038303830383038
10043037220006125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  raddhn2 v0.4s, v1.2d, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acc2cfd0d2d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372250000000612954825101001001000010010000500427731315300183003730037282653287451010020010000200300003003730037111020110099100100100001000000007106021622296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731315300183003730037282653287451010020010000200300003003730037111020110099100100100001000000007101121622296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731315300183003730037282653287451010020010000200300003003730037111020110099100100100001000000007106122722296340100001003003830038300383003830038
102043003722400005122006129548251010010010000100100005004277313153001830037300372826532874510100200100002003000030037300371110201100991001001000010000001700148311126425296347100001003032330324303253032130229
102043032522811668045281407029494125101001141004011510745610428545515302703003730322282872928837108612221098521832961303233027671102011009910010010000100220013888284260364322983410100001003032630275302743027630328
1020430311226115593652804533294821021016612110048119110435984286812153027030373303602829040288711101122210498222334803037330370811020110099100100100001004221194984957112104322990718100001003032230518305163055630515
1020430566237010220700612954825101001001000010010000500427731315300543008630037282653287451010020010000202300003003730037111020110099100100100001000000007106121622296340100001003003830038300383003830038
102043003722500000001032954825101001001000010010000500427731315300183003730037282653287451010020010000200300003003730037111020110099100100100001000000007101124922296340100001003003830038300383003830038
1020430037225000018300612954825101001001000010010000500427731315300183003730037282653287451010020010000200300003003730037111020110099100100100001000000007106021622296340100001003003830038300383003830038
10204300372250000900612954825101001001000010010000500428096415300183003730037282653287451010020010000200300003003730037111020110099100100100001000000607101021622296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640316322963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640316332963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002030486300853017841100211091010100001060301282280640216232963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640316332963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640316332963010000103003830038300383003830038
10024300372250013642954825100101010000101000050427731313001833003730037282873287671001020100002030000300373003711100211091010100001000000000640316332963010000103003830038300383003830038
100243003722400612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640416332963010000103003830038300383003830038
100243003722400612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640216422963010000103003830038300383003830038
100243003722590612954825100101010000101000050427731313001803003730037282873287671001020100002030000300373003711100211091010100001000000000640216332963010000103003830038300383003830038
100243003722500612954825100101010000101000050427731313001803003730037282873287671001020100002030000300843003711100211091010100001000000000640316332963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  raddhn2 v0.4s, v0.2d, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)0318191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372250000168295482510100100100001001000050042773130300183008030037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
1020430037225000084295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250000421295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071015911296340100001003003830038300383003830038
10204300372250000168295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250000210295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372250000183295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
10204300372240000250295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000000080311611296340100001003003830038300383003830038
10204300372250000498295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038
1020430037225000061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
100243003722502622954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001000006403163329630010000103003830038300383003830038
100243003722501872954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001000006403163329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731303001830037300372828703287671001020100002030000300373003711100211091010100001000006402163329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001000006403163329630010000103003830038300383003830038
10024300372240612954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001002006403163329668010000103003830038300383003830038
10024300372250612954825100101010000101000050427731303001830037300372828703287671001020100002030000300373003711100211091010100001000306403163329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001000006403163329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001000006403163329630010000103003830038300383003830038
100243003722503432954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001000006403163329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828703287671001020100002030000300373003711100211091010100001010006403163329630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  raddhn2 v0.4s, v1.2d, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)1e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020430037225096129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
10204300372240012029548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101162129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037224006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)181e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024300372240000061295482510010101000010100006642773130300183003730037282873287671001020100002030000300373003711100211091010100001002000640216222963310000103003830038300383003830038
1002430037225000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100001290640216222963010000103003830038300383003830038
1002430037225000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100002130640216222963010000103003830038300383003830038
1002430037225000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100001350640216222963010000103003830038300383003830038
10024300372250000040202954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000330640216222963010000103003830038300383003830038
10024300372250000061295482510010101000010100005042773130300183003730037282873287671001020100002030000300373003711100211091010100001000000640216222963010000103003830038300383003830038
1002430077225000006129548251002812100001210149504277313030018300373003728287328767100102010000203000030037300371110021109101010000100001320640216222966310000103003830038300383003830038
1002430037225000006129548251001010100001010000504277313030090300373003728287328767100102010494203000030037300371110021109101010000100001470640216222965310000103003830038300383003830038
1002430037233000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100001500640216222963010000103003830038300383003830038
1002430037225000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100001440640216222963010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  raddhn2 v0.4s, v8.2d, v9.2d
  movi v1.16b, 0
  raddhn2 v1.4s, v8.2d, v9.2d
  movi v2.16b, 0
  raddhn2 v2.4s, v8.2d, v9.2d
  movi v3.16b, 0
  raddhn2 v3.4s, v8.2d, v9.2d
  movi v4.16b, 0
  raddhn2 v4.4s, v8.2d, v9.2d
  movi v5.16b, 0
  raddhn2 v5.4s, v8.2d, v9.2d
  movi v6.16b, 0
  raddhn2 v6.4s, v8.2d, v9.2d
  movi v7.16b, 0
  raddhn2 v7.4s, v8.2d, v9.2d
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9faccfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1602042008815008125801001008000012480000500640000002004520064200643228010020080000200240000200642006411160201100991001001600001003310111011611200611600001002006520065200652006520065
1602042006415003925801001008000010080000500640000002004520064200643228010020080000200240000200642006411160201100991001001600001007210111011611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000010200452006420064322801002008000020024000020064200641116020110099100100160000100010111011611200611600001002006520065200652006520065
1602042006415003925801001008000010080000500640000102004520064200643228010020080000200240000200642006411160201100991001001600001001510111011611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000000200452006420064322801002008000020024000020064200641116020110099100100160000100010111011611200611600001002006520065200652006520065
160204200641500392580100100800001008000050064000010200452006420064322801002008000020024000020064200641116020110099100100160000100010111011611200611600001002006520065200652006520065
1602042006415115392580100100800001008000050064000010200452006420064322801002008000020024000020064200641116020110099100100160000100010111011611200611600001002006520065200652006520065
1602042006415003925801001008000010080000500640000002004520064200643228010020080000200240000200642006411160201100991001001600001009010111511611200611600001002006520065200652006520065
160204200641510392580100100800001008000050064000005200452006420064322801002008000020024000020064200641116020110099100100160000100010111511611200611600001002006520065200652006520065
1602042006415003925801001008000010080000500640000102004520120200643228010020080000200240000200642006411160201100991001001600001002410111011611200611600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2507

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6erob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaec? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242008715004527800121280000128000062640000112003220051200510322800122080000202400002005120051111600211091010160000100610030311102521247200482201160000102005220052200522005220052
1600242006015004527800121280000128000062640000112003220051200510322800122080000202400002005120051111600211091010160000100211002931273421174200482201160000102005220065200522005220052
160024200511500452980012128000012800006264000011200322005120051032280012208000020240000200512005111160021109101016000010001002762143422147200572401160000102005220052200522006120052
1600242005115104529800121280000128000062640000112003220051200510322800122080000202400002006020051111600211091010160000100211003032172541157200572201160000102006120052200522005220052
16002420051150023529800121280000128000062640000002004120051200520322800122080000202400002006020051111600211091010160000100121003031152521159200482201160000102005220052200612005220061
1600242005115004529800121280000128000062640000112003220060200510322800122080000202400002005120051111600211091010160000100151003331242521177200482201160000102005220061200522006120052
1600242005115004527800121280000128000062640000102004120060200510322800122080000202400002005120051111600211091010160000100010030312173442147200482201160000102005220052200522005220052
160024200601500452780012128000012800006264168010200412005120051032280012208000020240000200602005111160021109101016000010001002731142521277200482201160000102006120061200522005220052
1600242006015004529800121280000128000062640000012003220051200510322800122080000202400002005120051111600211091010160000100121003031142521159200482201160000102006120052200522005220052
160024200511500452780012128000012800006264000010200322005120060032280012208000020240000200512005111160021109101016000010091003032172522147200482201160000102006120052201242005220061

Test 6: throughput

Count: 16

Code:

  raddhn2 v0.4s, v16.2d, v17.2d
  raddhn2 v1.4s, v16.2d, v17.2d
  raddhn2 v2.4s, v16.2d, v17.2d
  raddhn2 v3.4s, v16.2d, v17.2d
  raddhn2 v4.4s, v16.2d, v17.2d
  raddhn2 v5.4s, v16.2d, v17.2d
  raddhn2 v6.4s, v16.2d, v17.2d
  raddhn2 v7.4s, v16.2d, v17.2d
  raddhn2 v8.4s, v16.2d, v17.2d
  raddhn2 v9.4s, v16.2d, v17.2d
  raddhn2 v10.4s, v16.2d, v17.2d
  raddhn2 v11.4s, v16.2d, v17.2d
  raddhn2 v12.4s, v16.2d, v17.2d
  raddhn2 v13.4s, v16.2d, v17.2d
  raddhn2 v14.4s, v16.2d, v17.2d
  raddhn2 v15.4s, v16.2d, v17.2d
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)031e373f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020440049300005125160108100160008100160020500239913104002040039400391997761999116012020016003220048009640039400391116020110099100100160000100001111011811611400361600001004004940049400494004040040
16020440039300004125160101100160017100160000500128000004002040048400481997331999716010020016000020048000040039400481116020110099100100160000100000001011021612400361600001004004040049400404004040040
160204400393000042251601001001600171001600005001280000040020400394003919973262000616010020016000020048000040039400391116020110099100100160000100000001011011611400361600001004004940049400494004940040
160204400393000174125160100100160000100160000500128000004002040039400391997332000616010020016000020048000040039400391116020110099100100160000100000001011011611400361600001004004940049400404004040040
16020440039300004125160117100160000100160000500239899904002040039400391997331999716010020016000020048000040048400481116020110099100100160000100000001011011611400451600001004004940049400404004040040
16020440039300014125160100100160000100160000500131999904002040039400391997331999716010020016000020048000040039400391116020110099100100160000100000001011011611400361600001004004940049400494004040049
160204400393000174125160100100160017100160000500239908204002040039400391997331999716010020016000020048000040048400391116020110099100100160000100000001011021611400451600001004004040040400494004040040
160204400393000174125160100100160017100160000500128000004002140048400481997331999716010020016000020048000040039400481116020110099100100160000100000001011011611400361600001004004940049400404004040049
160204400393000175025160117100160000100160000500128000014002040039400391997331999716010020016000020048000040039400391116020110099100100160000100000001011021611400361600001004004040040400404004040050
160204400393000175025160117100160000100160000500239908204002940048400391997332000616010020016000020048000040039400391116020110099100100160000100000001011001621400361600001004004040040400404004040049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f373a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaebec? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002440039299000000000046025160010101600011016000050128000011040020400484004819996032002916001020160000204800004003940039111600211091010160000100000000010022311816212544003602070160000104004040040400404004040049
16002440039300000000000056025160010101600001016000050239902710540020400394003919996032001916001020160000204800004003940048111600211091010160000100000000010022311416211434004502090160000104032640049400414004040040
16002440048300000000000047025160010101600001016000050131999811540029400394003920028032001916001020160000204800004004940039111600211091010160000100002309760010022851516211664003702090160000104004940050400504004940049
16002440039300000000017617047025160027101600001016000050239899911540020400394004819996032001916001020160000204800004003940039111600211091010160000100000000010022851516211554003602060160000104009940198400404004040040
1600244003930000000060002663324716129012161035111609745517893881054062740981409702028108720495161825201619442048543940039400481116002110910101600001026314815020102998515138212354073012070160000104114240922410824061240772
16002440994304102111162112149617413276113761615331116220413161618652374102100400294003940039199960320019160688201617902048611440669411202116002110910101600001004012694000102663515168211444077602090160000104004940040400404004940847
16002440801314010017004906620251600271016001710160000502398999015400204003940048199960320019160010201600002048000040049400391116002110910101600001000000000100241152616422354003604060160000104005040049400404004040040
16002440039300000000010800520251600271016000010160000501280000015400294003940048199960320028160010201600002048000040039400481116002110910101600001000000000100248626164225440036040180160000104004040041400494004040040
1600244003930000000000170610251600101016000010160000501280000115400204003940048199960320019160010201600002048000040048400391116002110910101600001000000000100246526162215440045040120160000104004940040400494004040050
1600244003930000000000170610251600111016001710160000502398999015400204003940048199960320019160010201600002048000040049400391116002110910101600001000000000100241161416221444004504090160000104004040040400494004040049