Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1M

Test 1: uops

Code:

  sha1m q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 50 | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100440383096128650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383106128654022251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
1004403830876128654022251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128654022251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
1004403830010328650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128650251000100010001398954019403840383690338961000100030004038403811100110000073216223873100040394039403940394039

Test 2: Latency 1->1

Code:

  sha1m q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204400383000000612986525101001001000010010000500142689504001940086400383854033874610100200100002003000040038400381110201100991001001000010000520071003163339871100001004003940039400394003940039
10204400383000000612986525101001001000010010000500142689514001940038400383854033874610100200100002003000040038400381110201100991001001000010000000071013163339871100001004003940039400394003940039
1020440038299000072629865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100003000071213163439871100001004003940039400394003940039
10204400383000000612986525101001001000010010000500142689514001940038400383854033874610100200100002003000040038400381110201100991001001000010000000071013163339871100001004003940039400394003940039
10204400383000000612986525101001001000010010000500142689504001940038400383854033874610100200100002003000040038400381110201100991001001000010000200071013163339871100001004003940039400394003940039
1020440038300001470612986525101001001000010010000500142689504001940038400383854033874610100200100002003000040038400381110201100991001001000010020000071013243339871100001004003940039400394003940039
10204400822990000612986525101001001000010010000500142689514001940038400383854033874610100200100002003000040038400381110201100991001001000010000000071013163339871100001004003940039400394003940039
10204400382990000612986525101001001000010010000500142689504001940038400383854033874610100200100002003000040038400381110201100991001001000010000000071013163339871100001004003940039400394003940039
10204400383000001612986525101001001000010010000500142689504001940038400383854033874610100200100002003000040038400381110201100991001001000010000100071213163339871100001004003940039400394003940039
10204400383000000612986525101001001000010010000500142689514001940038400383854033874610100200100002003000040038400381110201100991001001000010000600071013163339871100001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | st unit uop (a7) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024400383000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001002960640516333987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001003030640316433987210000104003940039400394003940039
1002440038300006129865251001010100001010000501426895040019400384003838562338768100102010000203000040038400381110021109101010000100200640316343987210000104003940039400394003940039
10024400383000072629865251001010100001010000501426895040019400384003838562338768100102010000203000040038400381110021109101010000100200640316343987210000104003940039400394003940039
1002440038299006129865251001010100001010000501426895040019400384003838562338768100102010000203000040038400381110021109101010000100100640316433987210000104003940039400394003940039
1002440038300006129865251001010100001010000501426895040019400384003838562338768100102010000203000040038400381110021109101010000100221260640316533987210000104003940039400394003940039
1002440038299006129865251001010100001010000501426895140019400384003838562338768100102010000203000040038400381110021109101010000100100640316353994610000104003940039400394003940039
1002440038299006129865251001010100001010000501426895040019400384003838562338768100102010000203000040038400381110021109101010000100100640316353987210000104003940039400394003940039
1002440038300006129865251001010100001010000501426895140019400384003838562338768100102010000203000040038400381110021109101010000100100640316343987210000104003940039400394003940039
1002440038300006129865251001010100001010000501426895140019400384003838562338768100102010068203000040038400381110021109101010000100100640316343987210000104003940039400394003940039

Test 3: Latency 1->2

Code:

  sha1m q0, s0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102045003737400613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500823982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
1020450037374001053982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
1020450037375001033982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625950018500375003748491348745101002001000020030000500375003711102011009910010010000100007101161149822100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002450037375000061398292510010101000010100005017862590500180500375003748513034876710010201000020300005003750037111002110910101000010000640216224982410000105003850038500385003850038
1002450037374000061398292510010101000010100005017862591500180500375003748513034876710010201000020300005003750037111002110910101000010003640216224982410000105003850038500385003850038
1002450037375000061398292510010101000010100005017862590500183500375003748513034876710010201000020300005003750037111002110910101000010000640216224982410000105003850038500385003850038
10024500373750009107398292510010101000010100005017862591500180500375003748513034876710010201000020300005003750037111002110910101000010000640216224982410000105003850038500385003850038
1002450037375000061398292510010101000010100005017862591500180500375003748513034876710010201000020300005003750037111002110910101000010000640216224982410000105003850038500385003850038
1002450037375000061398292510010101000010100005017862590500180500375003748513034876710010201000020300005003750037111002110910101000010000640316234982410000105003850038500385003850038
100245003737500012103398292510010101000010100005017862590500180500375003748513034876710010201000020300005003750037111002110910101000010003640216224982410000105008550038500385003850038
10024500373750110631398292510010101001012100005017862590500180500375003748513034876710010201000020300005003750037111002110910101000010000640216324982410000105003850038500385003850038
10024500373750001261398292510010101000010100005017862591500180500375003748513034876710010201000020300005003750037111002110910101000010003640216224982410000105003850038500385003850038
1002450037375000396541398292510010101000010100005017862590500180500375003748513034876710010201000020300005003750037111002110910101000010000640216234982410000105003850038500385003850038

Test 4: Latency 1->3

Code:

  sha1m q0, s1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102045003737400613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050230500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850085500385003850038
1020450037375007263982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737400613982925101001001000010010000500178625905001850037500374850934874510100200100002003000050037500371110201100991001001000010030430007101161149822100001005003850038500385003850038
102045003737400613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625915001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385003850038
102045003737500613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000000007101161149822100001005003850038500385008550133

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100245003737400006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100640006402162249824010000105003850038500385003850038
10024500373740000613982925100101010000101000050178625905001850037500374851334876710010201000020300005003750037111002110910101000010000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001850037500374851334876710010201000020300005022750037111002110910101000010007006402162249824010000105003850038500385003850038
10024500373750000613982925100101010004101000050178640505001850037500374851334876710010201000020300005003750037111002110910101000010000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001850037500374851334876710010201000020300005003750037111002110910101000010000006402162249824010000105003850038500385003850038
10024500373750030613982925100101010000101000050178625905001850037500374851334876710010201000020300005003750037111002110910101000010000006402162449824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001850037500374851334876710010201000020300005003750037111002110910101000010000606402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001850037500374851334876710010201000020300005022750037111002110910101000010000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001850037500374851334876710010201000020300005003750037111002110910101000010010006402162249824010000105003850038500385003850038
100245003737500002513982925100101010000101000050178625905001850037500374851334876710010201000020300005003750037111002110910101000010000006402162249824010000105003850038500385003850038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1m q0, s8, v9.4s
  movi v1.16b, 0
  sha1m q1, s8, v9.4s
  movi v2.16b, 0
  sha1m q2, s8, v9.4s
  movi v3.16b, 0
  sha1m q3, s8, v9.4s
  movi v4.16b, 0
  sha1m q4, s8, v9.4s
  movi v5.16b, 0
  sha1m q5, s8, v9.4s
  movi v6.16b, 0
  sha1m q6, s8, v9.4s
  movi v7.16b, 0
  sha1m q7, s8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204320038239700000270923986525801001008000010080000500114368951320019320038320038299790329999680100200800002002400003200383200381116020110099100100160000100000000010110416113198860160000100320039320039320039320039320039
1602043200382397000006123986525801001008000010080000500114368951320019320038320038299790329999680100200800002002400003200383200381116020110099100100160000100000000010110116113198860160000100320039320039320039320039320039
1602043200382397000006123986525801001008000010080000500114368950320019320038320038299790329999680100200800002002400003200383200381116020110099100100160000100000000010110116113198860160000100320039320039320039320039320039
160204320038239700000612398652580100100800001008000050011436895132001932003832003829979032999968010020080000200240000320096320038111602011009910010016000010010322850010110116113198860160000100320039320039320097320039320039
1602043200382398000004412398652580100100800001008000050011436895032001932003832003829979032999968010020080000200240000320038320038111602011009910010016000010000003690010110116113198860160000100320039320039320039320039320039
160204320038239800000203623986525801001008000010080000500114368951320019320038320038299790329999680100200800002002400003200383200381116020110099100100160000100000000010110117113198860160000100320039320039320096320039320039
1602043200382398000007262398652580100100800001008000050011436895132001932003832003829979032999968010020080000200240000320038320038111602011009910010016000010000004050010110116113198860160000100320039320097320039320039320039
160204320096239700000212239865258010010080000100800005001143689513200193200383200382997903299996801002008000020024000032003832003811160201100991001001600001000092000010110116113198860160000100320039320039320039320039320039
1602043200382397000006123986525801001008000010080000500114368951320019320038320038299826329999680100200800002002400003200383200951116020110099100100160000100000000010110116113198860160000100320039320039320039320039320039
160204320038239700000612398652580100106800001008000050011436895132001932003832009529979032999968010020080000200240000320038320038211602011009910010016000010000181000010110116113198860160000100320039320039320039320039320039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024320376240621110198008372398652580010108000010800005011436895113200193200383200382998123300018800102080000202400003200383200381116002110910101600001000000010022311231611116631988721191516000010320039320039320039320039320039
16002432003823980000000067239835258001010800001080000501143689511320019320038320038299812330001880010208000020240000320038320038111600211091010160000100000001009231161611161631988721191516000010320039320039320039320039320039
16002432003823970000000067239865258001010800001080000501143689511320019320038320038299812330001880010208000020240000320038320038111600211091010160000100003001002231161611161631988721191516000010320039320039320232320039320039
1600243200382397000003007322398652580010108000010800005011436895113200193200383200382998123300018800102080000202400003200383200381116002110910101600001000030010022311616111161731988721191516000010320039320039320039320039320137
1600243200382398000000006723986525800101080000108000050114368951132001932003832003829983633000188001020800002024000032003832013511160021109101016000010000000100223111616111171731988721191516000010320039320039320039320039320039
16002432003823980000000067239865258001010800001080000501143689511320019320038320038299812330001880010208000020240000320038320038111600211091010160000100103001002231161611116631988721191516000010320039320039320039320089320039
1600243200382397000000007322398652580010108000010800005011437165113200193200383200382998123300018800502080000202400003200383200381116002110910101600001000000010022311161611161631988721381516000010320039320039320039320039320039
160024320038239700000000672398652580010108000010800005011436895113200193200383200382998123300018800102080000202400003200383200381116002110910101600001000000010022311161614116631988721191516000010320039320039320039320039320039
1600243200382397000006007322398652580010108000010800005011436895113200193200383200382998123300018800102080000202400003200383200381116002110910101600001001700001002231161611117731988721191516000010320039320039320039320039320039
1600243200382397000000006723986525800101080000108000050114368951132001932003832003829981233000188001020800002024018032003832003811160021109101016000010000000100223111616111121531988721191516000010320039320039320039320087320039

Test 6: throughput

Count: 16

Code:

  sha1m q0, s16, v17.4s
  sha1m q1, s16, v17.4s
  sha1m q2, s16, v17.4s
  sha1m q3, s16, v17.4s
  sha1m q4, s16, v17.4s
  sha1m q5, s16, v17.4s
  sha1m q6, s16, v17.4s
  sha1m q7, s16, v17.4s
  sha1m q8, s16, v17.4s
  sha1m q9, s16, v17.4s
  sha1m q10, s16, v17.4s
  sha1m q11, s16, v17.4s
  sha1m q12, s16, v17.4s
  sha1m q13, s16, v17.4s
  sha1m q14, s16, v17.4s
  sha1m q15, s16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0002

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602046400384794000006147986525160129100160000100160000590228768951640019064003864003861979003619996160100200160000200480000640038640075111602011009910010016000010000001018900001011000916099639873000160000100640039640039640039640039640085
1602046400384864000120072647986543160100100160000100160000500228768951640019064003864003861979003619996160100200160000200480000640038640038111602011009910010016000010000006500000010110009160124639873000160000100640087640039640039640039640039
160204640038479400000726479847251601001001600001001600005002287689506400190640038640038619790036199961601002001600002004800006400386400381116020110099100100160000100000000201000010110001316097639873000160000100640039640039640039640039640039
160204640134479410000726479865251601061001600001001600005002287689506400190640038640038619790036199961601002001600002004800006400386400381116020110099100100160000100000000000001011000716045639873000160000100640039640039640135640039640039
160204640038479400000749479865251601001001600001001600005002287689506400190640038640038619790036199961601002001600002004800006400386400381116020110099100100160000100000000000001011000192020810639873000160000100640079640039640039640039640039
1602046400384794000352061479865251601001001600001001600005652287689506400190640086640038619790036199961601002001600002004802166400386400381116020110099100100160000100000010000001011000916049639873000160000100640039640039640039640039640039
160204640038479400000180047986537160100100160000100160000500228768950640019064003864003861979003619996160100200160000200480000640076640038111602011009910010016000010000001030000101100010160996398731400160000100640039640039640039640039640039
1602046400384795000001844479865251601001001600001001600005002287689506400190640086640038619790036199961601002001600002004800006400866400861116020110099100100160000100000000000001011001816094639873000160000100640039640039640039640039640039
160204640038479500000726479865251601001001600001001600005002287689506400190640038640038619790036199961601002001600002004800006400386400382116020110099100100160000100000000000001011020916049639873000160000100640039640039640039640039640039
160204640038479400000726479865251601001001600001001600005002287703916400190640038640038619790036199961601002001600002004800006400386400381116020110099100100160000100000000000101011020716049639873000160000100640039640039640039640039640039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0002

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600246400384794100000000076547986525160017111600001016000050228768951156400190640038640086619812036200181600102016000020480000640076640038111600211091010160000100000000001002285281611174639873021211516000010640039640039640039640039640039
16002464003847950000000000637479829251600101016000010160000502287689511564001906400386400386198120362001816001020160000204800006400386400381116002110910101600001000000000010022114141612164639873042423016000010640039640039640039640039640039
160024640038479400000001500738479865251600101016000010160000502287689501564001906400386400386198120362001816001020160000204800006400386400381116002110910101600001000000000110024115141622235639873042213016000010640039640039640039640039640039
16002464003847950000000000738479865251600101016000010160000502287689501564005606400386400386198120362001816001020160000204800006400386400381116002110910101600001000065900001002285141611155639873021211516000010640087640039640039640039640077
1600246400384794000000000073247986525160010101600001016000050228768951106400190640086640038619812036200181600102016000020480000640038640038111600211091010160000100000000001002285171611259639873021211516000010640080640039640039640039640039
1600246400384794000000000025747986525160010101600001116000050228768951156400190640038640038619812036200181600102016000020480000640038640038111600211091010160000100000000001002285141611177639873021211516000010640039640039640039640039640039
1600246400384795000000000073247986525160010101600001016000050228768951156400190640038640038619812036200181600102016000020480204640038640038311600211091010160000100000852620001002285141611135639873021211516000010640620640039640039640234640039
1600246400384794000000000073247986525160010101600001016000050228768951156400193640038640038619812036200181600102016000020480000640038640038111600221091010160000100000000001002285171651176639873021211516000010640039640039640039640039640039
1600246400384794000000000044747986525160010101600001016000050228768951156400190640038640038619812036200181600102016006820480000640038640038111600211091010160000100000600001002235161611187639873021213016000010640039640039640039640039640039
1600246400384794000000090075347986525160010101600001016000050228768951156400560640038640038619812036200531600102016000020480000640038640038111600211091010160000100000000001002285171611178639873021211516000010640039640039640039640039640083