Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA256H2

Test 1: uops

Code:

  sha256h2 q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100440383106128652510001000100013989514019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128652510001000100013989504019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
1004403830019728652510001000100013989504019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
100440383006128652510001000100013989504019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
1004403830061286525100010001000139895040194038403836903389610001000300040384038111001100022673216223873100040394039403940394039
100440383106128652510001000100013989504019403840383690338961000100030004038403811100110005073216223873100040394039403940394039
100440383006128652510001000100013989504019403840383690338961000100030004038403811100110000073216223873100040394039403940394039
1004403831061286525100010001000139895040194038403836903389610001000300040384038111001100013073216223873100040394039403940394039
1004403830061286525100010001000139895040194038403836903389610001000300040384038111001100016073216223873100040394039403940394039
100440383006128652510001000100013989504019403840383690338961000100030004038403811100110000073216223908100040394039403940394039

Test 2: Latency 1->1

Code:

  sha256h2 q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003830000000124298652510100100100001001000050014268950400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071003162239871100001004003940039400394003940039
10204400383000000061298652510100100100001001000050014268951400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071002162239871100001004003940039400394003940039
10204400383000000061298652510100100100001001000050014268951400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071002162239871100001004003940039400394003940039
102044003830000000233298652510100100100001001000050014268950400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071002162239871100001004003940039400394003940039
102044003830000000533298652510100100100001001000050014268951400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071012162239871100001004003940039400394003940039
10204400383000000061298652510100100100001001000050014268950400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071012162239871100001004003940039400394003940039
102044003830000054061298652510100100100001001000050014268951400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071012163239871100001004003940039400394003940039
10204400383000000061298652510100100100001001000050014268951400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071013162339871100001004003940039400394003940039
10204400383000000061298652510100100100001001000050014268950400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071012162339871100001004003940039400394003940039
10204400383000000061298652510100100100001001000050014268951400194003840038385403387461010020010000200300004003840038111020110099100100100001000000071012162239871100001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100244003830000000229298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640416333987210000104003940039400394003940039
100244003830000000251298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640316333987210000104003940039400394003940039
10024400383011132061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640316433987210000104003940039400394003940039
10024400383000000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640316433987210000104003940039400394003940039
10024400382990000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003821100211091010100001000640316333987210000104003940039400394003940039
100244003830000000726298652510010101000010100005014268951400194003840038385623387681001020100002030168400384003811100211091010100001000640316333987210000104003940039400394003940039
10024400383000000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640316333987210000104003940039400394003940039
10024400383000000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001010640316333987210000104003940039400394003940039
10024400383000000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640316433987210000104003940039400394003940039
10024400383000000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640316333987210000104003940039400394003940039

Test 3: Latency 1->2

Code:

  sha256h2 q0, q0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0f | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020450037375000200613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000000613982925101001001000010010000500178625915001850037500374851303487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037374000000613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000072711611498220100001005003850038500385003850038
1020450037375000000613982925101001001000010010000500178625905001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000000613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000001071011611498220100001005003850038500385003850038
1020450037374000000613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000000613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
10204500373750000007263982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000000613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000000613982925101001001000010010000500178625915001850037500374849103487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100245003737500061398292510010101000010100005017862595001850082500374851303487671001020100002030000500375003711100211091010100001000000640316734982410000105003850038500385003850038
100245003737412061398292510010101000010100005017862595001850037500374851303487671001020100002030000500375003711100211091010100001020000640316434982410000105003850038500385003850038
100245003737500061398292510010101000010100005017862595001850037500374851303487671001020100002030000500375003711100211091010100001000000640316434982410000105003850038500385003850038
1002450037375000103398292510010121001010100007117862595008850037500844851303487671001020100002030000500845008511100211091010100001000030640316534982410000105003850038500385003850038
1002450037374001261398292510010101000010100005017862595001850037500374851303487671001020100002030000500375003711100211091010100001000000640316634982410000105003850038500385003850038
100245003737500061398292510026101000010100006117862595001850037500374857803487671001020100002030000500375003711100211091010100001000000640316344982410000105003850038500385003850038
100245003737400061398292510010101000010100005017862595001850037500374851303487671001020100002030000500375003711100211091010100001000000640316334982410000105003850038500385003850038
1002450037375000536398292510010101000010100005017862595001850037500374851333487671001020100002030000500375003711100211091010100001000000640316534982410000105003850038500385003850038
1002450037375000107398292510010101000010100005017862595001850037500374851303487671001020100002030000500375003711100211091010100001000000640316334982410000105003850038500385003850038
100245003737500061398292510010101000010100005017862595001850037500374851303487671001020100002030000500375003711100211091010100001000000640316434982410000105003850038500385003850038

Test 4: Latency 1->3

Code:

  sha256h2 q0, q1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020450037375000061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000061398292510100100100001001000050017862590500185003750037484913487451010020010000200300005003750037211020110099100100100001000000071011611498220100001005003850086500385003850038
1020450037375000061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037374000061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037374000061398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450084374000061398292510100104100001001000050017862590500185003750037484913487771010020010000200300005003750037111020110099100100100001000004371011610498220100001005003850038500385003850038
10204500373750027688460398292510100100100001001000050017862591500185003750037484913487451010020010000200300005003750037111020110099100100100001000010071011611498220100001005003850038500385003850038
10204500373740000103398292510100100100001001000050017862590500185003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385008550038
102045003737511120580398292510100100100001001000050017862590500185003750037484913487451010020010000200300005003750037111020110099100100100001000000071011611498220100001005003850038500385003850038
1020450037375000061398292510100100100001001000050017862590500185003750037484913487451010020010000200300005003750037111020110099100100100001000000371011611498227100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002450037375006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037374006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037375006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037375006139829251001010100001010000501786399050018500375003748513348767100102010000203000050037500371110021109101010000100306402162249824010000105003850038500385003850038
1002450037375006139829251001010100001010000501786259150018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037375006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037374006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037374096139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037375006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038
1002450037375006139829251001010100001010000501786259050018500375003748513348767100102010000203000050037500371110021109101010000100006402162249824010000105003850038500385003850038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha256h2 q0, q8, v9.4s
  movi v1.16b, 0
  sha256h2 q1, q8, v9.4s
  movi v2.16b, 0
  sha256h2 q2, q8, v9.4s
  movi v3.16b, 0
  sha256h2 q3, q8, v9.4s
  movi v4.16b, 0
  sha256h2 q4, q8, v9.4s
  movi v5.16b, 0
  sha256h2 q5, q8, v9.4s
  movi v6.16b, 0
  sha256h2 q6, q8, v9.4s
  movi v7.16b, 0
  sha256h2 q7, q8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204160039119900000061799372580100100800001008000050056779521600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000001011031611159980160000100160040160040160040160040160040
160204160039119900000061799372580100100800001008000050056779521600200160039160039139887313999780100200800002002406751600391600391116020110099100100160000100000001011011611159980160000100160040160040160040160040160040
160204160039119900000061799372580100100800001008004550056779521600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000001011021611159980160000100160040160040160040160040160040
1602041600391199000000726799372580100100800001008000050056779521600200160039160100139887313999780100200800002002400001600391600391116020110099100100160000100000001011021611159980160000100160040160040160040160040160040
160204160039119900000061799372580100100800001008000050056779521600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000001011021611159980160000100160040160040160040160040160040
1602041600391199000130319261799372580100100800001008000050056779521600580160039160039139887313999780100200800002002408461602891600391116020110099100100160000100200001011021611159980160000100160040160040160147160040160040
1602041600871199011012061799372580100100800001008000050056779521600200160039160039139887313999780100200800002002406781600391600391116020110099100100160000100000001011031611159980160000100160040160040160040160040160040
1602041600391199000000726799372580100100800001008000050056779521600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000001011021611159980160000100160040160040160040160040160040
16020416003911990000001280799372580100100800001008000050056779521600203160039160039139887313999780100200800002002400001600391600391116020110099100100160000100000001011021611159980160000100160040160095160040160040160040
160204160039119800003061799372580100100800001008000050056779521600200160039160039139887313999780100200800002002400001600391600391116020110099100100160000100003001011021611159980160000100160040160040160040160040160040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024160039119800000900677993725800101080000108000050567795221016002016003916003913990931400198001020800002024000016003916003911160021109101016000010000000000010022622141621171015998202017916000010160040160040160040160040160040
160024160039119800000000677993725800101080000108000050567795211016002016003916003913990931400198001020800002024000016003916003911160021109101016000010000000000010022311161621171115998202017916000010160040160040160040160040160040
16002416003911990004000067799371178001010800131180180505678027115160020160039160039139909171401028005520801132024051016013716009021160021109101016000010000000000010022351151621161015998202017916000010160040160040160040160040160040
16002416014011991013200077479937258011710800001380225505677952115160020160039160189139909341400748001020800562024000016008916003911160021109101016000010000000000010022841181621112815998202017916000010160040160040160040160040160040
160024160039119900000000677993725800101080000118000050567795221516002016003916003913990931400198001020800002024000016003916003911160021109101016000010000000000010022841151621191015998202017916000010160040160040160040160040160040
160024160039119800000840073279937258001010800001080000505677952210160020160039160039139909314001980010208000020240000160039160039111600211091010160000100000000000100223411216211101915998202017916000010160040160040160040160040160040
16002416003911980000000067799202580010108000010800005056779521001600201600391600391399093140019800102080000202400001600391600391116002110910101600001000000000011002234181621161415998202017916000010160040160040160040160040160040
16002416003911980000000073279937258001010800131080000505677952215160020160039160039139909714001980010208000020240000160039160039111600211091010160000100000000000100228416162116815998202017916000010160040160040160040160040160040
1600241600391199000001200677993725800101080000108000050567795210016002016003916003913990931400198001020800002024000016003916003911160021109101016000010000000000010022315151621181015998202017916000010160040160040160040160040160040
1600241600391199000000007327993725800101080000108000050567795221016002016003916003913990931400198001020800002024000016003916003911160021109101016000010000000000010022341131621115715998202017916000010160040160040160040160040160040

Test 6: throughput

Count: 16

Code:

  sha256h2 q0, q16, v17.4s
  sha256h2 q1, q16, v17.4s
  sha256h2 q2, q16, v17.4s
  sha256h2 q3, q16, v17.4s
  sha256h2 q4, q16, v17.4s
  sha256h2 q5, q16, v17.4s
  sha256h2 q6, q16, v17.4s
  sha256h2 q7, q16, v17.4s
  sha256h2 q8, q16, v17.4s
  sha256h2 q9, q16, v17.4s
  sha256h2 q10, q16, v17.4s
  sha256h2 q11, q16, v17.4s
  sha256h2 q12, q16, v17.4s
  sha256h2 q13, q16, v17.4s
  sha256h2 q14, q16, v17.4s
  sha256h2 q15, q16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204320039239810611599372516010010016000010016000050011357952032002003200393200392998873299997160100200160000200480216320039320039111602011009910010016000010000000010110316223199680160000100320040320040320040320040320040
160204320039239700611599162516010010016000010016000050011357952032002003200393200392998873299997160100200160000200480000320039320039111602011009910010016000010000000010110216223199680160000100320040320085320040320040320040
1602043200392397007261599372516010010016000010016004758511357952032002003200393200392999143299997160100200160000200480000320039320039111602011009910010016000010000000010110216223199680160000100320040320040320040320040320040
160204320039239800611599372516010010016000010016000050011357952032002003200393200392998873299997160100200160000200480000320039320039111602011009910010016000010000000010110116223200400160000100320040320040320040320040320040
1602043200392397007261599372516010010016000010016000050011357952032002003200393200392998873299997160100200160000200480000320039320039111602011009910010016000010000000010110216223199680160000100320040320040320040320040320040
16020432003923970687726159937251601001001600001001600005001135818603207050320039320039299887329999716010020016000020048000032003932003911160201100991001001600001000303300010110216223199680160000100320040320040320040320040320040
160204320039239700611599372516010010016000010016000050011357952032002033200393200392998873299997160100200160000200480000320039320039111602011009910010016000010000000010110216223199680160000100320040320040320040320040320040
1602043200392397006115993725160100100160000100160000500113579520320020032003932003929988732999971601002001600002004800003200393200391116020110099100100160000100000360010110216223199680160000100320040320040320040320040320040
1602043200392397006311599372516010010016000010316000050011357952032002003200393200392998873300034160100200160000200480000320039320039111602011009910010016000010000000010110216223199680160000100320040320040320040320040320040
16020432003923970061159937251601001001600001091601805001135795203200200320039320039299887329999716010020016000020048000032003932003911160201100991001001600001000003540010110216223199680160000100320040320040320040320040320090

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002432003923970000007321599372516001010160000101600005011357952115320020032003932003929990903300019160010201600002048000032003932009011160021109101016000010000000001002282123161111663199681615816000010320089320040320040320040320040
16002432003923970000901311599372516001010160000101600475011357952015320020032003932003929990903300019160192201600002048000032003932003911160021109101016000010000030001002483116161111663199681615816000010320040320040320040320040320040
16002432003923970000002561599372516001010160000101600005011357952115320020032003932003929990903300019160010201600002048000032003932003911160021109101016000010000000001002284116161111663199681615816000010320040320090320040320040320040
16002432003923970000602571599372516001010160000101600005011357952115320020032003932003929990903300019160010201600002048000032003932003911160021109101016000010000000001002285116161117163199681615816000010320040320040320040320040320040
16002432003923970000001551599372516001010160000101600005011358111115320020032003932003929990903300019160010201600002048000032003932003911160021109101016000010000000001002285161611111163199683215816000010320040320040320040320040320040
16002432003923970000007321599372516001010160000101600005011357952115320020032003932003929990903300019160010201600002048000032003932003911160021109101016000010000000001002285116161116163199681615816000010320040320040320040320040320040
16002432003923970000006715993725160024101600651016000050113585611153202480320039320285299929016300019160010201600002048000032003932003911160021109101016000010000046300001002485113161111663199681615816000010320040320040320040320040320391
16002432078424000000007321599372516001010160000101600005011357952115320020032003932003929990903300019160056201600002048000032003932003911160021109101016000010000000001002285161611011363199681615816000010320040320040320040320040320040
160024320039239700006067159937251600101016000010160000501135795211532002003200393200392999090253000191600102016000020480000320039320039111600211091010160000100000000010022851161611116163199681615816000010320040320040320040320040320040
160024320039239800000067159918251600101016000010160000601135795201532002003200393200392999090330001916001020160000204800003200393200391116002110910101600001000000000100228515161117173199683215816000010320040320040320040320040320040