Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UMLAL (vector, 8H)

Test 1: uops

Code:

  umlal v0.8h, v1.8b, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073216112630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
1004303723010325482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100030003037303711100110000673116112630100030383038303830383038
100430372206125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372206125482510001000100039831303018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372306125482510001000100039831313018303730372415328951000100030003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  umlal v0.8h, v1.8b, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch mispred nonspec (cb)cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020430037225001872954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037225002142954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
102043003722501612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
102043003722500612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
102043003722400612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000073412162229634100001003003830038300383003830038
102043008422500612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
1020430037224003232954825101001261000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
102043003722500612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038
102043003722500612954825101001001000010010000500427731303001830037300372826532874510100200100002003000030037300371110201100991001001000010000071212162229634100001003003830038300383003830038
102043003723500612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010000071012162229634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1002430037225010092954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
100243003722409322954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
10024300372240822954825100101010000101000050427731330018330037300372828732876710010201000020300003003730037111002110910101000010001640316332963010000103003830038300383003830038
100243003722509552954825100101010000101000050427731330018030037300372828732876710010201000020300003003730084111002110910101000010000640316332963010000103003830038300383003830038
100243003722508962954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
100243003722409552954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
1002430037225010462954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
1002430037225010012954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
100243003722508742954825100171010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038
100243003722509082954825100101010000101000050427731330018030037300372828732876710010201000020300003003730037111002110910101000010000640316332963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  umlal v0.8h, v0.8b, v1.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
102043003722506810657040599329476165102081471006414811341746428816903030630464303712829439288741133623011331224339813037030415811020110099100100100001000010223187792803329958100001003003830038300383003830038
102043003722500000018729548251010010010008100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101171129634100001003003830038300383003830038
10204300372240000006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000037101161129634100001003003830038300383003830038
10204300372250000006129548251010010010000100100006124277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313130018300373007428265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383008530038
10204300372250000006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100201027567101161129634100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
102043003722500000018929548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038
10204300372240000006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk instruction (07)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024300372250000010529548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100230006404163429630010000103003830038300383003830038
10024301312250000010529548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164329630010000103003830038300383003830038
10024300372250000050729548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164429630010000103003830038300383003830038
10024300372250000029129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006403164329630010000103003830038300383003830038
10024300372250000012629548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000306404333429630010000103003830038300383003830038
10024300372250000012629548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100040606404164329630010000103003830038300383003830038
10024300372250000017129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006403163429630010000103003830038300383003830038
1002430037225000026410529548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404164329630010000103003830038300383003830038
100243003722500012060529548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100010277806403164429630010000103003830038300383003830038
10024300372250000020429548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100000006404163429630010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  umlal v0.8h, v1.8b, v0.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03mmu table walk data (08)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020430037225000900612954825101001001000010010000500427731330018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071011611296340100001003003830038300383003830038
1020430037225087105670411401294761631021111410064149110437634288169303063041830417282924328918113372261132522233969304213041741102011009910010010000100222002229608983105212998335100001003041830422302313046830417
102043018022819952870425248294761811020814310072133111927054289526301623041730417282924228889114912281132323034482304203046981102011009910010010000100442102521309161106222978226100001003003830038300383003830038
10204300372250000007262954825101001001000010010000500427731330018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071011611296340100001003003830038300383003830038
10204300372250000007262954825101001001000010010000500427731330018300373003728265328745101002001000020030000300373008511102011009910010010000100000000071011611296340100001003003830038300383003830038
10204300372250000004472954825101001001000010010000500427731330018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071011611296340100001003003830038300383003830038
10204300372250000006129548251010010010000100100005004277313300183003730037282652128745101002001000020030000300373003711102011009910010010000100000000071011611296340100001003003830038300383003830038
1020430037225000600612954825101001001000010010000500427731330018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071011611296340100001003003830038300383003830038
1020430037225000000612954825101001001000010010000500427731330018300373003728265328745105862001000020030000300373003711102011009910010010000100000000073511611296340100001003003830038300383003830038
1020430037225000000612954825101001001000010010000500427731330018300373003728265328745101002001000020030000300373003711102011009910010010000100000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fa9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
10024300372250612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006404164429630010000103003830038300383003830038
1002430037225064142954825100101010000101000050427731303001830037300372828732876710010201000020300003003730082111002110910101000010006403163429630010000103003830038300383003830038
100243003722501352954825100101010000101000050427731303001830037300372828732876710609201000020300003003730037111002110910101000010028266404163429630010000103003830038300383003830038
10024300372240612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010006403162329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006403164429630010000103003830038300383003830038
10024300372240612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010006403164429630010000103003830038300383003830038
1002430037225063432954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010006403164429630010000103003830038300383003830038
10024300372250612954825100101010000101000050427767403001830037300372828732876710010201000020300003003730037111002110910101000010006403162429630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006403164329630010000103003830038300383003830038
10024300372250612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010006403163429630010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  umlal v0.8h, v8.8b, v9.8b
  movi v1.16b, 0
  umlal v1.8h, v8.8b, v9.8b
  movi v2.16b, 0
  umlal v2.8h, v8.8b, v9.8b
  movi v3.16b, 0
  umlal v3.8h, v8.8b, v9.8b
  movi v4.16b, 0
  umlal v4.8h, v8.8b, v9.8b
  movi v5.16b, 0
  umlal v5.8h, v8.8b, v9.8b
  movi v6.16b, 0
  umlal v6.8h, v8.8b, v9.8b
  movi v7.16b, 0
  umlal v7.8h, v8.8b, v9.8b
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)a9acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204200891500000603925801001008000010080000500640000120045200642006436580100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
1602042006415100002403925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
16020420064150000068403925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
160204200641500000003925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
1602042006415000003303925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
1602042006415100003303925801001008000010080000500640000120045200642006432280531200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652014520065
1602042006415000001203925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065201352006520065
1602042006415000002703925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
1602042006415000002403925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065
160204200641510000003925801001008000010080000500640000120045200642006432280100200800002002400002006420064111602011009910010016000010000000010111116112006101600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)ea? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242006315000000450045258001212800001280000626400001152002720046200463228001220800002024000020046200461116002110910101600001000000000100318218202117720043215160000102004720047200472004720047
160024200461500000000045258001212800001280000626400001152002720046200463228001220800002024000020046200461116002110910101600001000000000100298415202116720043215160000102004720047200472004720047
160024200461500000030045258001212800001280000626400001152002720046200463228001220800002024000020046200461116002110910101600001000000000100318518202118520043215160000102004720047200472004720047
1600242004615000000540045258001212800001280000626409401152002720046200463228001220800002024000020046200461116002110910101600001000000000100288515202116720043215160000102004720047200472004720047
1600242004615000000900512580012128000012800006264000001520031200522005232280012208000020240000200502005011160021109101016000010000000001003111627264227820047230160000102005120051200512004720051
16002420050150000004800512580012128000012800006264000001520031200502005032280012208000020240000200462004611160021109101016000010000000001003211628242228720047231160000102005120051200512005320051
16002420052150000004500452580012128000012800006264000001520031200502004632280012208000020240000200502005011160021109101016000010000000001003111618242227820047230160000102005120051200512005320051
1600242005015000000000512580012128000012800006264000011520033200522005032280012208000020240000200522005211160021109101016000010000000001003211627243228620049231160000102005120051200512005320051
1600242005215000000000452580012128000012800006264000001520031200502005032280012208000020240000200522005011160021109101016000010000000001003111626242225620047230160000102005320051200472005120053
1600242005015000000000512580012128000012800006264000001520031200522005032280012208000020240000200502005011160021109101016000010000010001003211525243228820047230160000102005120051200512005120051

Test 6: throughput

Count: 16

Code:

  umlal v0.8h, v16.8b, v17.8b
  umlal v1.8h, v16.8b, v17.8b
  umlal v2.8h, v16.8b, v17.8b
  umlal v3.8h, v16.8b, v17.8b
  umlal v4.8h, v16.8b, v17.8b
  umlal v5.8h, v16.8b, v17.8b
  umlal v6.8h, v16.8b, v17.8b
  umlal v7.8h, v16.8b, v17.8b
  umlal v8.8h, v16.8b, v17.8b
  umlal v9.8h, v16.8b, v17.8b
  umlal v10.8h, v16.8b, v17.8b
  umlal v11.8h, v16.8b, v17.8b
  umlal v12.8h, v16.8b, v17.8b
  umlal v13.8h, v16.8b, v17.8b
  umlal v14.8h, v16.8b, v17.8b
  umlal v15.8h, v16.8b, v17.8b
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)0318191e1f373f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)a9acc2cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9daddfetch restart (de)e0? int output thing (e9)eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204400723000078005002516010010016000010016000050023989990154003040039400401997332000716010020016000020048000040039400401116020110099100100160000100000000101105003160033400360001600001004005040040400404005040072
16020440039300000005002516011710016000010016000050023989990154002040039400711997332002516010020016000020048000040039400491116020110099100100160000100000000101105003160023400460001600001004004040040400504004040040
16020440039300000008302516010010016000010016000050012800000154012140039400391999061999716010020016009920048000040159401511116020110099100100160000100000200101105002160032400360001600001004004040050400404004040040
16020440049300000004102516011710016000010016000050023989990154002040048400391997332002916010020016000020048000040039400391116020110099100100160000100001000101128413160023400360001600001004005040049400404004140050
160204400483000033008402516011710016000010016000050012800000004002140052400401997332002916010020016000020048000040039400491116020110099100100160000100000000101100002160033400460001600001004005040040400494004040040
160204400393000027004102516011710016000010016000050012808240004005240049400481997332002316010020016021020048060640048400712116020110099100100160000100000006101108003160033400450001600001004004040040400404004040040
160204400493000000041025160117100160000100160000500239908201540020400394004919973320007160100200160000200480000400404003911160201100991001001600001000010001011051031620334004602101600001004004040040400404007240040
160204400392990033009302516011710016000010016000050023990270054003040048400401997331999716010020016000020048000040039400391116020110099100100160000100000002101105002160223400360001600001004004940040400494004040040
160204400483000000614902516011710016001710016000050012800000154002040040400391997331999716010020016000020048000040049400391116020110099100100160000100000060101108002160033400370001600001004004940040400504004040040
160204400403000000070602516010010016001710016000050013200000104002040039400391997331999716010020016000020048000040049400481116020110099100100160000100000000101100003160032400450001600001004004040050400404004040049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03l1i tlb fill (04)1e373a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f6061696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)acc2cfd0d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002440060300142005525160010101600001016000050128000011540020400484003919985320028160010201600002048000040048400391116002110910101600001000001002385114316111373640036165160000104005340049400404004940040
16002440039299130900692516002710160017101600005012800001110400204003940048199943200191603292016000020480000400394004811160021109101016000010200010024137113516111383840045166160000104004940040400494004040049
160024400393001001782516002710160017101600005012800000110400204003940048199943200191600102016000020480000400484003911160021109101016000010000010023138113116111393940036165160000104004040049400404004940040
1600244003930013900692516001010160000101600005023989991110400294004840039199953200191600102016000020480000400394004811160021109101016000010000010023138113516111263540036165160000104004940040400494004040049
16002440048300139171782516001010160000101600005023989991110400204003940048199953200281600102016000020480000400394003911160021109101016000010000010023138113816121392440036165160000104004940040400494004040040
1600244003930012071717525160027101600171016000050128000011104002040048400391999532001916001020160000204800004004840039111600211091010160000100000100261392139161113936400361611160000104004040049400404004940040
16002440048300115170782516001010160000101600005012800001110400204003940048199953200191601162016000020480000400494003911160021109101016000010000010024139114016111243940036166160000104004040040400404004040049
160024400483001391707525160027101600171016000050128000011104002040039400391999432002816001020160000204800004004840039111600211091010160000100000100241310112416322373840036165160000104004040040400494004040040
1600244003930003300752516001010160000101600005012800000110400204003940048199953200191600102016000020480000400484003911160021109101016000010000010023139112616111393940036166160000104004940040400404004040049
160024400483001270171842516002710160017121600005012800000110400204003940048199953200281600102016000020480000400394004811160021109101016000010000010023139112416111373840045165160000104004940040400494004040049