Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

MLS (vector, 8H)

Test 1: uops

Code:

  mls v0.8h, v1.8h, v2.8h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073216112630100030383038303830383038
1004303723025125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372206125482510001000100039831303018303730372415328951000100030003037303711100110001073116112630100030383038303830383038
10043037239612548251000100010003983130301830373037241532895100010003000303730371110011000022373116112630100030383038303830383038
1004303722012025482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372208225482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
1004303723027425482510001000100039831303018303730372415328951000100030003037303711100110001073116112630100030383038303830383038
1004303723023025482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372206125482510001000100039831303018303730372415328951000100030003037303711100110001073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  mls v0.8h, v1.8h, v2.8h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ac | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
10204300372250004412954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
1020430037225000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121632296340100001003003830038300383003830038
10204300372251107262954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
10204300372250007262954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
1020430037225000612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
10204300372250007262954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
102043003722500053629548251010010010000100100005004277313030018300373003728265262874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
1020430037225000612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000710121622296340100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313030018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101216222963414100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372330000906129539251001010100001010000504277313030018300373003728292328767101592010000203000030037300371110021109101010000100010306403164329630010000103003830038300383003830038
100243003723300000030329548251001010100001010000504277313130018300373003728287328767100102010000203000030225301311110021109101010000100000006403163429630010000103003830038300383003830038
100243003723300000053729548251001010100001010000504290929030018300373003728287328786100102010000203000030037300371110021109101010000100000006405164329669210000103041630416303703036830466
100243051623510501323105666294588210086141007210123845042881691303423045230367283224228933113552411152243391830370304159110021109101010000100210549026403414429767010000103013330512302293013430416
100243013123401222731761876295126510026101001614102985042800270301263013230132282961328802103082210331223096930275304633110021109101010000100302306823164329630010000103045230426304653041730454
100243046023600912164410742049294851611007012100561211192834285505030270300373003728338462897611802221196924359013060530592141100211091010100001022003590006404163329630010000103003830038300383003830085
10024300372250000006129548251001010100001010000504277313030018300843003728287328767100102010000243247230037300371110021109101010000100200006404164429630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404165429630010000103003830038300853003830038
10024300372250000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006403164429630010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000026403163329630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  mls v0.8h, v0.8h, v1.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500000014142954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000001710011611296340100001003003830038300383003830038
102043003722500000011352954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
10204300372250000009902954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
102043003722500000010402954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
1020430037225000000612954825101001001000810410447500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
10204300372240000009302954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
102043003722500000010952954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
10204300372250000002352954825101261001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
102043003722500000010632954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038
1020430037225000000612954825101001001000010010000500427731303001830037300372826532874510100200101802003000030037300371110201100991001001000010000000000710011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000012429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722500008429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722500008429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722400008429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722500008429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225000017029548251001010100001010000504277313130018300373008428287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
1002430037225000018929548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722500008429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722400008429548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100006402162229630010000103003830038300383003830038
100243003722500008429548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000101006402162229630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  mls v0.8h, v1.8h, v0.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020430037225077429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722512344129548251010010010000100100005004277313130018030084300842826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
1020430037225014429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722508429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722508429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722508429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
1020430037225014929548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
1020430037225017029548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722508429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722508429548251010010010000100100005004277313130018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225002292954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
100243003722500612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
1002430037225001262953925100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
100243003722500612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
100243003722400612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
100243003722500612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
100243003722500612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
1002430037225104912954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000006402162229630010000103003830038300383003830038
1002430037225001912954864100101010000101000050427731303001830037300372828732876710010201000020304833008430037111002110910101000010000306402162229630010000103003830038300853003830086
1002430037225004842954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010230006402162229630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  mls v0.8h, v8.8h, v9.8h
  movi v1.16b, 0
  mls v1.8h, v8.8h, v9.8h
  movi v2.16b, 0
  mls v2.8h, v8.8h, v9.8h
  movi v3.16b, 0
  mls v3.8h, v8.8h, v9.8h
  movi v4.16b, 0
  mls v4.8h, v8.8h, v9.8h
  movi v5.16b, 0
  mls v5.8h, v8.8h, v9.8h
  movi v6.16b, 0
  mls v6.8h, v8.8h, v9.8h
  movi v7.16b, 0
  mls v7.8h, v8.8h, v9.8h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020420065151000292580116100800161008014050064019620045200652006561280128200800282002400842006520065111602011009910010016000010000000011110119149002006201600001002006620066200662006620066
160204200651510003982580116100800161008002850064019620045200652006561280100200800002002400002006420064111602011009910010016000010000000000010111116112006101600001002006520065200652006520065
160204200641500001882580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000000000010111116412006101600001002006520065200652006520065
160204200641500004322580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000100000010111116212006101600001002006520065200652006520065
1602042006415000042425801001008000010080000500640000200452006420064112280100200800002002400002006420064111602011009910010016000010000000000010111116212006101600001002006520065200652006520065
16020420064150000602580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000000000010111116212006101600001002006520065200652006520065
160204200641500004942580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000000000010111116212006101600001002006520065200652006520065
160204200641510004992580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000013000010111116112006101600001002006520065200652006520065
160204200641500001022580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000000000010111116112006101600001002006520065200652006520065
160204200641500004192580100100800001008000050064000020045200642006432280100200800002002400002006420064111602011009910010016000010000000000010111116112006101600001002014620065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420078150000004525800121280000128000062640000111020027200462004603228001220800002024000020046200461116002110910101600001000001004213213340211171920043215160000102004720047200472004720047
16002420046150000004525800121280000128000062640000111020027200462004603228001220800002024000020046200461116002110910101600001000001004213513340211151920043215160000102004720047200472004720047
1600242004615000000452580012128000012800006264000011102002720046200460322800122080000202400002004620046111600211091010160000101000100423512732412152420043215160000102004720047200472004720051
1600242004615000001210825801171280000128000062640000111020027200462004603228001220800002024000020046200461116002110910101600001000001003913113132221152420043215160000102004720047200472005120047
1600242004615000000144025800121280000128000062640000101020027200462004603228001220800002024000020046200461116002110910101600001000001003813513227211202020043215160000102004720047200472004720047
16002420046150000004525800121280000128000062640000111020027200462004603228001220800002024000020046200461116002110910101600001000001004313512540211161920043215160000102004720047200472004720047
160024200461500000045258001212800001280000626400001102002720046200460322800122080000202400002004620046111600211091010160000100000100433513132211151920043215160000102004720047200472004720047
16002420046150000004525800121280000128000062640000101020027200462004603228001220800002024000020046200461116002110910101600001000001004313513528211201620043215160000102004720047200472004720047
16002420046150000004525800121280000128000062640000101020027200462004603228001220800002024000020046200461116002110910101600001000001003813513228211151920043215160000102004720047200472004720047
160024200461500000077225800121280000128000062640000111020027200462004669183228001220800002024000020046200461116002110910101600001000001003813112640211151920043215160000102004720047200472004720047

Test 6: throughput

Count: 16

Code:

  mls v0.8h, v16.8h, v17.8h
  mls v1.8h, v16.8h, v17.8h
  mls v2.8h, v16.8h, v17.8h
  mls v3.8h, v16.8h, v17.8h
  mls v4.8h, v16.8h, v17.8h
  mls v5.8h, v16.8h, v17.8h
  mls v6.8h, v16.8h, v17.8h
  mls v7.8h, v16.8h, v17.8h
  mls v8.8h, v16.8h, v17.8h
  mls v9.8h, v16.8h, v17.8h
  mls v10.8h, v16.8h, v17.8h
  mls v11.8h, v16.8h, v17.8h
  mls v12.8h, v16.8h, v17.8h
  mls v13.8h, v16.8h, v17.8h
  mls v14.8h, v16.8h, v17.8h
  mls v15.8h, v16.8h, v17.8h
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 1e | 37 | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204400583000141251601001001600001001600005001280000140020400394003919973319997160100200160000200480000400394004811160201100991001001600001000001011041611400361600001004004040040400494004040040
1602044003930001750251601171001600171001600005002399027140020400394003919973319997160100200160000200480000400394003911160201100991001001600001000001011021621400461600001004004040049400404005040040
160204400483000142251601171001600171001600005002399027140020400494003919973320006160100200160000200480000400404004011160201100991001001600001000001011031611400361600001004004040040400404004140040
160204400393000041251601001001600171001600005002399027140030400394003919973319997160100200160000200480000400394003911160201100991001001600001000001011061611400361600001004004040040400404004040040
160204400393000051251601001001600011001600005001280000140020400494004819973320006160100200160000200480000400484003911160201100991001001600001003001011061611400361600001004004040050400534005040040
1602044004030001742251601001001600001001600005001280000140020400404003919973320006160100200160000200480000400394004811160201100991001001600001000001011061611400361600001004004040040400404005040040
160204400482990051251601001001600011001600005002399027140021400484003919973319997160100200160000200480000400394003911160201100991001001600001000001011031611400361600001004004940040400494004040040
160204400493000041251601001001600171001600005002399027140030400394004019973320007160100200160000200480000400394004811160201100991001001600001000001011021611400361600001004004040040400494004040040
160204400393000041251601001001600171001600005001280000140020400394003919973319997160100200160000200480000400394004811160201100991001001600001000001011061611400371600001004004040040400494004040040
1602044004829901741251601001001600001001600006441280000140020400394004819973319997160100200160000200480000400484003911160201100991001001600001000001011061611400361600001004004040049400404005040050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 1e | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600244004830000174602516001010160000101600005012800001104002040039400711999632001916001020160000204800004003940048111600211091010160000100001002438113162111127400360206160000104004040040400404004040040
1600244003930000046025160071101600001016000050128000011540020400714003919996320019160010201600002048000040048400481116002110910101600001000010022811125162112511400360206160000104007240050400494007240040
16002440039299001746025160027101600171016000050239899911540020400394004819996320019160010201600002048000040039400391116002110910101600001000010022811125622112614400360206160000104004040040400404004040040
1600244003929900073025160027101600011016000050128000011540020400394003919996320019160010201600002048000040039400391116002110910101600001000010022811112162112513400360209160000104004040040400404004040040
16002440039300006146025160010101600611016000050128000011540029400484003919996320019160010201600002048000040039400711116002110910101600001000010022811125162111227400360206160000104004040040400404004040040
1600244003930000175502516002710160017101600005053871881154002040039400391999632001916001020160000204800004007140039111600211091010160000100001002411112251642225274003604017160000104004040040400404004040072
16002440039300000460251600101016006110160000501280000115400204003940039199963200511600102016000020480000400394003911160021109101016000010000100241182261642212274003604012160000104004040040400404004040040
160024400393000002220251600101016000010160000501280000015400294004840048199963200281600102016000020480000400394003911160021109101016000010000100228112251642212234003604012160000104004040040400404004040040
160024400393000005502516002710160017101600005012800000154002040039400391999632001916001020160000204800004003940039111600211091010160000100301002411112251642212274003604012160000104004040040400404004040040
1600244003930000184602516002810160017101600005023990820154002040039400711999632001916001020160000204800004003940039111600211091010160000100001002288110162112613400680206160000104004040040400404004940040