Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UMLSL (vector, 2D)

Test 1: uops

Code:

  umlsl v0.2d, v1.2s, v2.2s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10043037220612548251000100010003983133018303730372415328951000100030003037303711100110003073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372384612548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
100430372269612548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037220612548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037220612548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100030003037303711100110000973116112630100030383038303830383038
10043037230662548251000100010003996723018303730372415328951000100030003037303711100110000073116112630100030383038303830383038
10043037230822548251000100010003983133018303730372415328951000100030003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  umlsl v0.2d, v1.2s, v2.2s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001001071000162229634100001003003830038300383003830038
102043003722500612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010056971212162229634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071013162229634100001003003830038300383003830038
10204300372240061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
102043003722400612954825101001001000010010000500427731313001830037300372826532874510100200100002003000030037300371110201100991001001000010044671012162229634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773130300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003003830038300383003830038
1020430037225006129548251010010010000100100005004277313130018300373003728265328745101002001000020030000300373003711102011009910010010000100410271012162229634100001003003830038300383003830038
10204300372250061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001000071012162229634100001003018630038300383003830038
10204300372240061295482510100100100001001000050042773131300183003730037282653287451010020010000200300003003730037111020110099100100100001001071012162229634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | l1d tlb miss nonspec (c1) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037278010242954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000640416332963010000103003830038300383003830038
10024300372780612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000640316332963010000103003830038300383003830038
10024300372590612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000640416332963010000103003830038300383003830038
10024300372600612954825100101010000101000050428274103001830037301312829232876710010201000020300003003730037111002110910101000010001000640316332963010000103003830038300383003830038
10024300372600822954844100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000000640316332963010000103003830038300383003830038
10024300372410612954825100101010000101000050427731303001830037300372828732878610010201000020300003003730037111002110910101000010000000640316332963010000103003830038300383003830038
1002430037241127262954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010003000640316332963010000103003830038300383003830038
10024300372410612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010002000640316332963010000103003830038300383003830038
100243003724206129548251001010100001010000504277313130018300373003728287328767100102010000203000030037300371110021109101010000100040000640316332963010000103003830038300383003830038
100243003723206129548251001010100001010000504277313030018300373003728287328767100102010000203000030037300371110021109101010000100011200640316332963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  umlsl v0.2d, v0.2s, v1.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250061295482510125125100001001000050042773133001830037300372826532874510100200100002003000030037300371110201100991001001000010000710116112963425100001003003830038300383003830038
1020430037225016329548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225006129539441010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000971011611296340100001003003830038300383003830038
10204300372250010329548251010010010000100100006264277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071031733296340100001003003830038300383003830038
10204300372250116629548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
10204300372240014529548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
10204300372240016629548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296840100001003003830038300383003830038
1020430037225006129539251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
10204300372250014529548251010010010000100100005004277313300183003730037282653287451010020010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383008630038
10204300372250016629548251010010010000100100005004277313300183003730037282653287451012520010000200300003003730037111020110099100100100001000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500996129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250096129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300842250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730084282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313300183003730037282873287671001020100002030000300373003711100211091010100001000640316332963010000103003830038300383003830038

Test 4: Latency 1->3

Code:

  umlsl v0.2d, v1.2s, v0.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722506129548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383008530038
1020430037225276129548251010012810000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722406129548251010011910000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010040071011611296340100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
102043003722406129548251010013110000118102985814277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003018430038300383003830038
1020430037225072629548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010001071011611296340100001003003830038300383003830038
102053003722506129548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038
1020430037225010329548251010010010000100100005004277313030018030037300372826532874510100210100002003000030037300371110201100991001001000010020071011611296340100001003003830038300383003830038
102043022922406129521251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071021611296340100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313030018030037300372826532874510100200100002003000030037300371110201100991001001000010000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000612954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372250001242954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372240001662954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
1002430037225000612954825100101110000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372250001452954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372250001452954825100101010000101000050427731303001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372250001662954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372250005112954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
1002430037225000612954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038
10024300372250006272954825100101010000101000050427731313001830037300372828732876710010201000020300003003730037111002110910101000010000640216222963010000103003830038300383003830038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  umlsl v0.2d, v8.2s, v9.2s
  movi v1.16b, 0
  umlsl v1.2d, v8.2s, v9.2s
  movi v2.16b, 0
  umlsl v2.2d, v8.2s, v9.2s
  movi v3.16b, 0
  umlsl v3.2d, v8.2s, v9.2s
  movi v4.16b, 0
  umlsl v4.2d, v8.2s, v9.2s
  movi v5.16b, 0
  umlsl v5.2d, v8.2s, v9.2s
  movi v6.16b, 0
  umlsl v6.2d, v8.2s, v9.2s
  movi v7.16b, 0
  umlsl v7.2d, v8.2s, v9.2s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042007815012602580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001001010111116112006101600001002006520065200652006520065
160204200641500602580100100800001008000050064000012004520064200643438022820080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
160204200641510602580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
1602042006415001232580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
1602042006415101632580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000310111116112006101600001002006520065200652006520065
1602042006415002822580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
160204200641500602580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
160204200641500392580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
160204200641500392580100100800001008000050064000002004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065
1602042006415001022580100100800001008000050064000012004520064200643228010020080000200240000200642006411160201100991001001600001000010111116112006101600001002006520065200652006520065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024200641500008725800121280000128000062640000112002720050200503228001220800002024000020050200461116002110910101600001000100266218244214920047230160000102005120051200512005120051
160024200461500007225800121280000128000062640000112003120050200463228001220800002024000020050200501116002110910101600001000100306225242213520043230160000102005120047200512005120047
1600242005015000011425800121280000128000062640000112003120046200463228001220800002024000020050200501116002110910101600001000100313123244229320047230160000102005120047200472005120047
160024200501500009325800121280000128000062640000012003120050200503228001220800002024000020050200501116002110910101600001000100283225204226520047215160000102005120051200512005120051
160024200501500009325800121280000128000062640000012002720050200503228001220800002024000020050200501116002110910101600001000100296213244123520047230160000102005120051200472005120047
160024200501500008725800121280000128000062640000012003120050200503228001220800002024000020050200461116002110910101600001000100306129204213920047230160000102005120047200512005120051
1600242005015000055225800121280000128000062640000012003120050200463228001220800002024000020050200461116002110910101600001000100286225204216420043230160000102005120051200512005120051
1600242004615000013525800121280000128000062640000012003120050200463228001220800002024000020046200461116002110910101600001000100356123204113520043230160000102005120051200472005120051
160024200501510009325800121280000128000062640000012003120046200503228001220800002024000020050200501116002110910101600001000100283123244219520047230160000102005120047200472005120051
1600242004615000010825800121280000128000062640000112003120050200503228001220800002024000020046200461116002110910101600001000100316229202229320047315160000102005120051200512004720051

Test 6: throughput

Count: 16

Code:

  umlsl v0.2d, v16.2s, v17.2s
  umlsl v1.2d, v16.2s, v17.2s
  umlsl v2.2d, v16.2s, v17.2s
  umlsl v3.2d, v16.2s, v17.2s
  umlsl v4.2d, v16.2s, v17.2s
  umlsl v5.2d, v16.2s, v17.2s
  umlsl v6.2d, v16.2s, v17.2s
  umlsl v7.2d, v16.2s, v17.2s
  umlsl v8.2d, v16.2s, v17.2s
  umlsl v9.2d, v16.2s, v17.2s
  umlsl v10.2d, v16.2s, v17.2s
  umlsl v11.2d, v16.2s, v17.2s
  umlsl v12.2d, v16.2s, v17.2s
  umlsl v13.2d, v16.2s, v17.2s
  umlsl v14.2d, v16.2s, v17.2s
  umlsl v15.2d, v16.2s, v17.2s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01) | cycle (02) | 03 | 0e | 19 | 1e | 1f | 37 | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204400723002000173002516010810016001710016002050012801324002040048400391997761999016012020016003220048009640039400481116020110099100100160000100000011110118116404003601600001004004040050400404005040049
16020440039300000003102516011710016000910016002050012801324003040049400391997761999116012020016003220048009640040400391116020110099100100160000100000011110118016404003601600001004004040040400504004040050
16020440040300000003002516010810016000810016002050023991314002040039400491997761999016012020016003220048009640039400481116020110099100100160000100000011110118016204003701600001004004040050400404004040040
16020440048300000014002516010810016000810016002050012801324002040039400391997761999116012020016003220048009640071400391116020110099100100160000100000011110118016504004601600001004004040050400404005040040
160204400393000000173902516010910016001710016002050012801324002040049400481997762000016012020016003220048009640049400481116020110099100100160000100000011110118016104003601600001004004040041400404004140050
160204400403000000173102516010910016001710016002050012801324002040039400481997761999016012020016003220048009640039400481116020110099100100160000100000011110118016304003601600001004004040041400404004040040
160204400493000000173002516010810016000810016002050013201324002040039400401997761999116012020016003220048009640049400481116020110099100100160000100000011110118016304003701600001004004040050400494007240040
16020440039300000003902516010810016001710016002050012801324003040049400391997762002216012020016003220048009640049400391116020110099100100160000100000011110118016404022701600001004004040041400404004040050
160204400493000000173002516010810016001710016002050012801324002940039400481997761999016012020016003220048009640039400481116020110099100100160000100000011110118016304003701600001004004040050400404004140040
16020440039300000014002516011710016000910016002050012801324002040039400481997762000016012020016003220048009640040400391116020110099100100160000100000011110118016304003701600001004004140040400414004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 37 | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002440091300000000001199251600101016001810160000501280000114002040039400391999632002816001020160000204800004004840039111600211091010160000100000001002231110162111010400450155160000104004040040400404004040050
16002440039299000000001692516001010160000101600005012800001140020400394003919996320019160010201600002048000040039400391116002110910101600001000000010022311816211107400360156160000104004140049400494004940040
160024400392990001000004625160010101600001016000050128000011400204003940039199963200191600102016000020480000400394003911160021109101016000010000000100223118162111010400360155160000104005040050400404004040040
160024400393000000000004625160027101600001016000050128000011400204003940039199963200191600102016000020480000400394003911160021109101016000010000000100223111016211107400360156160000104005040040400404004040040
160024400393000000000180462516001010160018101600005012800001140020400394004919996320019160010201600002048000040039400491116002110910101600001000000010024102213164221284003603010160000104004040040400404004040040
16002440039300000000018146251600101016000010160000501280000014002040049400391999632002816001020160000204800004003940039111600211091010160000100000001002462210162221074003603010160000104004040040400404004040040
1600244003930000000000071725160028101600001016000050128000001400304004840048199963200291600102016000020480000400394003911160021109101016000010000000100243237164228104003603010160000104004040040400504004040040
16002440039299000000000157251600101016000010160000501280000014003040039400391999632001916001020160000204800004003940039111600211091010160000100000001002232112164228124003601510160000104004040040400404004040040
1600244003930000000001802642516001010160000101600005012800000140030400394004919996320019160010201600002048000040039400391116002110910101600001000000010024622101642112124003603010160000104004040040400404004940040
1600244003930000000000052251600101016000010160000501280000014002040048400481999632001916001020160000204800004003940039111600211091010160000100000001002232210164228104004603010160000104004940040400404004040050