Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA256H

Test 1: uops

Code:

  sha256h q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10044038300006128652510001000100013989540194038403836903389610001000300040384038111001100000073116213873100040394039403940394039
10044038300006128652510001000100013989540194038403836903393010001000300040384038111001100000073116113873100040394039403940394039
10044038300006128652510001000100013989540194038403836903389610001000300040384038111001100000073116113873100040394039403940394039
100440383000061286525100010001000139895401940384038369033896100010003000403840381110011000025073116113873100040394039403940394039
10044038300006128652510001000100013989540194087403836903389610001000300040384038111001100000073116113873100040394039403940394039
10044038300006128652510001000100013989540194038403836903389610001000300040384038111001100000373116113873100040394039403940394039
100440383000061286525100010001000139895401940384038369033896100010003000403840381110011000015073116113873100040394039403940394039
10044038300006128652510001000100013989540194038403836903389610001000300040384038111001100000073116113873100040394039403940394039
10044038300006128652510001000100013989540194038403836903389610001000300040384038111001100000073116113873100040394039403940394039
10044038310006128652510001000100013989540194038403836903389610001000300040384038111001100001073116113873100040394039403940394039

Test 2: Latency 1->1

Code:

  sha256h q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020440038300000000360529865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000000710121622398710100001004003940039400394003940039
10204400383000000006129865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000000710121623398710100001004003940039400394003940039
10204400383000000006129865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000000710121622398710100001004003940039400394003940039
10204400383000000016129865251010010010000100100005001426895140019400384003838540338746101002001000020030000400384003811102011009910010010000100000002710121622398710100001004003940039400394003940039
10204400383000000006129865251010010010000100100005001426895140019400384003838540338746101002001000020030000400384003811102011009910010010000100300300710142522398710100001004003940039400394003940039
10204400383000000006129865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000000712121623398710100001004003940039400394003940039
10204400383000000008229865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000100710121622398710100001004003940039400394003940039
10204400383000000006129865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000000710121622398710100001004003940039400394003940039
10204400383000000016129865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000001710121622398710100001004003940039400394003940039
10204400383000000006129865251010010010000100100005001426895040019400384003838540338746101002001000020030000400384003811102011009910010010000100000000710121622398710100001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006403162239872010000104003940039400394003940039
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
100244003829900305799298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
1002440038299000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
10024400383000000726298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000007294642140095010000104027940325402324018440039
1002440038300000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000000006402162239872010000104003940039400394003940039

Test 3: Latency 1->2

Code:

  sha256h q0, q0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020450037375009823982925101001001000010010000500178625915001805003750037484986487401010020010008200300245003750037111020110099100100100001000011171801600498340100001005003850038500385003850038
102045003737500094339829251010010010000100100005001786259150018050037500374849874874010100200100082003002450037500371110201100991001001000010001211171802400498350100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
10204500373750007263982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037374000613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375009613982925101001001000010010000500178625915001805003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498600100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024500373740000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038
10024500373750000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038
1002450037375000000018253982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402163249824010000105003850038500385003850038
1002450037374000000017443982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162149824010000105003850038500385003850038
10024500373750000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038
10024500373740000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037211002110910101000010000000006402162249824010000105003850038500385003850038
100245003737400000006139829431001010100001010000501786259150018500375003748513264876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038
10024500373750000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038
10024500373740000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038
10024500373740000000613982925100101010000101000050178625915001850037500374851334876710010201000020300005003750037111002110910101000010000000006402162249824010000105003850038500385003850038

Test 4: Latency 1->3

Code:

  sha256h q0, q1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020450037375000012061398292510100100100001001000050017867891500185003750037484913487451010020010000200300005003750037711020110099100100100001000000007101161149822100001005003850038502255003850038
102045003737500600061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000010007721161249822100001005003850038500385003850038
1020450037374000040835261398292510127106100001001000050017862591500185003750037484913487451010020010000200300005003750037511020110099100100100001000012007101161149822100001005003850038501795013350038
102045003737500000061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000050007101161149822100001005003850038500385003850038
102045003737500000061398292510100100100001001000050017862591500185022550037484913487451010020010000200300005003750037111020110099100100100001000000007101161349822100001005003850180502275008550038
102045003737500036961661398292510100100100051001000060617862591500185003750037485433487451010020210000200300005003750037111020110099100100100001000000307101161149822100001005003850038500385003850038
1020450037376004039635261398292510120100100001201000060517867891500185003750037484913487451010020010000200300005003750037111020110099100100100001002000307101161149822100001005003850038500385003850038
1020450037375000018061398292510100100100001001000050017862591500185003750037484913487451010020010000200301685003750037111020110099100100100001000000007101161149822100001005003850086500385003850038
10204500373750000120103398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000007101161149822100001005003850038500385003850038
1020450037375100000628398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000151617101161149822100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024500373750000613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100211091010100001000006403162249824010000105003850038500385003850038
10024500373740000613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100231091010100001000006402162249824010000105003850038500385003850038
100245003737500006139829251001010100001010000501786259050018050037500374851314487671001020100002030000500375003711100211091010100001000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100221091010100001000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001805003750037485133487671001020100002030204500375003711100211091010100001000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100211091010100001000006402162249824110000105003850038500385003850038
10024500373750003613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100211091010100001000006402162249824010000105003850038500385003850038
10024500373740000613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100211091010100001000006402162249824010000105003850038500385003850038
100245003737500007263982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100211091010100001000006402162249824010000105003850038500385003850038
10024500373750000613982925100101010000101000050178625905001805003750037485133487671001020100002030000500375003711100211091010100001000006402162249824010000105003850038500385003850038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha256h q0, q8, v9.4s
  movi v1.16b, 0
  sha256h q1, q8, v9.4s
  movi v2.16b, 0
  sha256h q2, q8, v9.4s
  movi v3.16b, 0
  sha256h q3, q8, v9.4s
  movi v4.16b, 0
  sha256h q4, q8, v9.4s
  movi v5.16b, 0
  sha256h q5, q8, v9.4s
  movi v6.16b, 0
  sha256h q6, q8, v9.4s
  movi v7.16b, 0
  sha256h q7, q8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204160039119900906179937258010010080000100800005005677952116002001600391600391398873139997801002008000020024000016003916003911160201100991001001600001000000101101016891599800160000100160040160040160040160040160040
1602041600391199002407267993725801001008000010080000526567795211600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000010110916991599800160000100160040160040160040160040160040
16020416003911980000617993725801001008000010080000500567795211600200160039160039139887313999780100200800002002400001600391600392116020110099100100160000100000110110816981599800160000100160040160040160040160040160040
160204160039119900007267993725801001008000010080000500567795211600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000010110916991599800160000100160040160040160040160040160040
16020416003911990000617993725801001008000010080000500567795211600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000010110816891599800160000100160040160040160040160040160040
160204160039119911906179937258010010080000117802705005677952116002001600391600391399673014001580351200803402002411941603901603393116020110099100100160000100000010110916891599800160000100160040160040160040160040160040
16020416003911990000617993725801001008000010080000500567795211600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000010110916991599800160000100160040160040160040160040160040
16020416003911990000827993725801001008000010080000500567795211600200160039160039139887313999780100200800002002401921600391600391116020110099100100160000100000010110916981599800160000100160040160040160040160040160040
16020416003911990000617993725801001008000010080000500567795211600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000010110916941599800160000100160040160040160040160040160040
16020416003911980000617993725801001008000010080000500567795211600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000010110816991599800160000100160040160040160040160040160040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002416003911990006779937258001010800001080000505677952115160020160039160039139909314001980010208000020240000160039160039111600211091010160000100000100228211116211531599821513716000010160040160040160040160040160040
16002416003911990072677993725800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000010022821316211931599821526716000010160040160040160040160101160040
16002416003911990078677993725800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010006010041821316211361599821513716000010160279160040160040160040160174
1600241601801198008976779937258001010800001080000505677952115160020160039160039139909314001980010208000020240000160039160039111600211091010160000100000100241121916211531599821513716000010160040160040160040160040160040
16002416003911990005427993725800101080000108000050567795211516002016003916003913990931400198005520800002024000016003916003911160021109101016000010000010022821316211531599821513716000010160040160040160040160040160040
1600241600391199000737993725800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000010022821916211531599821526716000010160040160101160040160040160040
160024160039119900547327993725800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000010022831916211931599821513716000010160040160101160040160040160040
16002416003911990012677993754800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000010022821916211941599821513716000010160040160040160040160040160040
16002416003911990007327993725800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000010022831316211551599821513716000010160040160040160040160040160040
160024160039119800714677993725800101080000108000050567795211516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000010022821916211391599821513716000010160040160040160040160040160040

Test 6: throughput

Count: 16

Code:

  sha256h q0, q16, v17.4s
  sha256h q1, q16, v17.4s
  sha256h q2, q16, v17.4s
  sha256h q3, q16, v17.4s
  sha256h q4, q16, v17.4s
  sha256h q5, q16, v17.4s
  sha256h q6, q16, v17.4s
  sha256h q7, q16, v17.4s
  sha256h q8, q16, v17.4s
  sha256h q9, q16, v17.4s
  sha256h q10, q16, v17.4s
  sha256h q11, q16, v17.4s
  sha256h q12, q16, v17.4s
  sha256h q13, q16, v17.4s
  sha256h q14, q16, v17.4s
  sha256h q15, q16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204320039239700000226815993725160100100160000100160000500113579521320020032003932003929988703299997160100200160000200480000320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320040320040320040
1602043200392397000008215993725160100100160000100160000500113579520320020032003932003929988703299997160100200160072200480000320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320091320040320040
16020432003923980000061159937251601001001600001001600005001135795203200200320039320039299887024299997160100200160000200480198320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320040320040320040
16020432003923970000056915993725160100100160000100160000500113579521320020032003932003929988703299997160100200160000200480000320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320040320040320040
160204320039239700000611599372516011410016001310416000050011357952032002003200883200392998870329999716010020016000020048000032003932003921160201100991001001600001000000099301010110116113199680160000100320040320040320040320040320040
1602043200392397000006115993725160100100160000100160000500113579520320020032003932003929988703299997160100200160000200480000320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320040320040320040
16020432003923970300072615993725160100100160013100160000500113579520320020032009032003929988703299997160100200160000200480000320039320039111602011009910010016000010000000600010110116113199680160000100320040320040320040320040320040
16020432003923980039006115993725160100100160000100160000500113579521320020032003932003929988703299997160100200160000200480000320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320040320040320040
16020432003923970000072615993725160100100160000100160000500113579520320020032003932003929988703299997160100200160000200480000320039320039111602011009910010016000010000000000010110116113199680160000100320040320040320040320040320040
1602043200912398000006115993725160100100160000100160000500113579521320020032003932003929988703299997160100200160000200480000320039320039111602011009910010016000010000000010010110116113199680160000100320040320040320040320040320040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002432003923970102000732159937421600101016000010160000501135783511032002003200393200392999090330001916001020160000204802043200393200391116002110910101600001000062000010024116211161115331996802120916000010320040320040320040320040320040
1600243200392397000000073815993725160010101600001016000050113579521153200200320039320039299909033000501600102016000020480000320039320039111600211091010160000100000000010022116231622246319968042201816000010320040320040320040320040320040
1600243200392397000000073159937251600101016000010160000501135795201532002033200393200392999090330005816001020160000204800003200393200391116002110910101600001000000390010024116261622256320008042401816000010320040320040320040320040320040
1600243200392397000004203407159937251600101016000010160000501135795211532002003200393200392999090330001916001020160000204800003200393200391116002110910101600001000000000100223514161113431996802120916000010320040320040320040320040320040
1600243200392398000000067159937251600101016001312160000501135795201532002003200393200392999090330001916001020160000204800003200893200391116002110910101600001000000000100228514161116431996802120916000010320040320040320091320040320040
16002432003923980001000162159937251600101016000010160000501135795211532002003200393200392999090330001916001020160000204800003200393200391116002110910101600001000000000100228114161116531996802120916000010320040320040320040320040320040
160024320039239800000006715993725160010101600001016000050113579528153200200320491320387299909017300019160010201600002048085832003932009111160021109101016000010240027415001002432615162214631996804240916000010320040320040320040320040320040
16002432003923970000000738159937251600101016000010160000501135795281532002003200393200392999090330001916001020160000204800003200903200901116002110910101600001000000000100228616161113631996802120916000010320091320040320040320040320040
1600243200392397000000067159937251600251016000010160045501135795211532002003200393200392999090330001916001020160000204800003200393200391116002110910101600001000000000100228625161116631996802120916000010320040320040320040320040320040
16002432003923970000000731599372516001010160000101600005011357952015320020032003932003929990903300019160010201600002048000032003932003911160021109101016000010000000001002411723171227631996804240916000010320040320040320240320091320040