Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1C

Test 1: uops

Code:

  sha1c q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
100440383000822865251000100010001398950401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
100440383009612865251000100010001398951401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
1004403830012612865251000100010001398951401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
1004403830001242865251000100010001398951401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
100440383000612865251000100010001398950401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
100440383000612865251000100010001398950401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
100440383000612865251000100010001398951401940384038369033896100010003000403840381110011000073216223873100040394039403940394039
1004403830001242865251000100010001398950401940384038369033896100010003000403840381110011000073216223873100040394039403940394039

Test 2: Latency 1->1

Code:

  sha1c q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102044003830011001642986525101001001000010010000500142689504001904003840038385477387411010020010008200300244003840038111020110099100100100001000011171903164439883100001004003940039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385477387411010020010008200300244003840038111020110099100100100001000011172004163339883100001004008740039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216166639871100001004003940039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216166639871100001004003940039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216164639871100001004003940039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071214166539871100001004003940039400394003940039
102044003830011101642986525101001001000010010000511142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216164639871100001004003940039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216166639871100001004003940039400394003940039
102044003829911001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216166639871100001004003940039400394003940039
102044003830011001642986525101001001000010010000500142689504001904003840038385403387461010020010000200300004003840038111020110099100100100001000000071216166639871100001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0038

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024400383000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640216323987210000104003940039400394003940039
100244003830002461298652510016101000010100005014268950400194003840087385813387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268951400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400382990361298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
100244003830000726298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039
10024400383000061298652510010101000010100005014268950400194003840038385623387681001020100002030000400384003811100211091010100001000640216223987210000104003940039400394003940039

Test 3: Latency 1->2

Code:

  sha1c q0, s0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204500373750000006139829251010010010000100100005001786259150018500375003748491348745101002001000020030213500375003711102011009910010010000100000071021611498220100001005003850038500385003850038
10204500373750000006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100010071011611498220100001005003850038500385003850038
10204500373740000009439829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011611498220100001005003850038500385003850038
10204500373740000006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011611498220100001005003850085500855003850038
102045003737500000072639829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071011611498220100001005003850038500385003850038
10205500373750000006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100003071011611498220100001005003850038500385003850038
10204500373740000006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100000071221611498220100001005003850038500385003850038
10204500373751000006139829251010010010000100100005001786259050018500375003748491348745101562001000020030000500375003711102011009910010010000100000071011611498220100001005003850038500385003850038
10204500373750000006139829251010010010000132100005001786259150018500375003748491348745101002001000020030000500845003711102011009910010010000100010071011611498220100001005003850038500385003850038
10204500373750000006139829251010010010000100100005001786259150018500375003748491348745101002001000020030000500375003711102011009910010010000100400071011611498220100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024500373740061398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373750061398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373750061398292510010101000010100005017862591500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373750061398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373740061398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373740061398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
100245003737400726398292510010101000010100005017862590500185003750037485133487671001020100682030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373750061398292510010101000010100005017862591500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
100245003737400726398292510010101000010100005017862591500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105003850038500385003850038
10024500373740061398292510010101000010100005017862591500185003750037485133487671001020100002030000500375003711100211091010100001000000640216224982410000105007150038500385003850038

Test 4: Latency 1->3

Code:

  sha1c q0, s1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020450037375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450084375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450037374000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110202100991001001000010000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450037374000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450037375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020450037374000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038
1020550037375000613982925101001001000010010000500178625905001850037500374849134874510100200100002003000050037500371110201100991001001000010000071011611498220100001005003850038500385003850038

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0037

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100245003737400168398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001003000640416564982410000105003850038500385003850038
100245003737400581398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640616654982410000105003850038500385003850038
10024500373750084398292510010101000010100005017862590500185003750037485133487671001020100602030000500375003711100211091010100001010000640616564982410000105003850038500385003850038
1002450037374006553982925100101010000101000050178625905001850037500374851311487671001020100002030000500375003711100211091010100001000000640616644982410000105003850038500385003850038
100245003737500509398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640516664982410000105003850038500385003850038
100245003737500551398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640616564982410000105003850038500385003850038
100245003737500739398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640516664982410000105003850038500385003850038
100245003737500265398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640516664982410000105003850038500385003850038
100245003737500546398292510010101000010100005017862590500185003750037485133487671001020100002030000500375003711100211091010100001000000640616464982410000105003850038500385003850038
100245003737400509398292510010101000010100005017862590500185003750037485133487671001020100002030000500845003711100211091010100001000000640616564982410000105003850038500385003850038

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1c q0, s8, v9.4s
  movi v1.16b, 0
  sha1c q1, s8, v9.4s
  movi v2.16b, 0
  sha1c q2, s8, v9.4s
  movi v3.16b, 0
  sha1c q3, s8, v9.4s
  movi v4.16b, 0
  sha1c q4, s8, v9.4s
  movi v5.16b, 0
  sha1c q5, s8, v9.4s
  movi v6.16b, 0
  sha1c q6, s8, v9.4s
  movi v7.16b, 0
  sha1c q7, s8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0005

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020432003823980300001032398652580100100800001008000050011436895132001903200383200382997903299996801002028000020024062432003832003811160201100991001001600001000004000001018091609431988600160000100320039320039320039320039320039
1602043200382397000021072623986510480100100800001008000050011436895132001903200383200382997903300103801002008000020024063032003832003811160201100991001001600001000004230001011071607831988600160000100320039320039320039320039320039
1602043200382397000000111423986510180100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000001030001011041609931988600160000100320039320039320039320233320039
160204320038239710010017242398052580100100800001008000050011436895132001903200383202322997903299996801002008000020024000032003832003811160201100991001001600001002010056820001011084909931988600160000100320039320231320039320039320039
160204320231239700000072623986525801001038000010080000500114368951320019032003832003829983733001038010020080000200240000320038320038111602011009910010016000010000000000010110916016931988600160000100320039320039320137320185320039
160204320038239700004809782398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000001030001011081608432003400160000100320039320039320039320039320039
1602043200382397000010207262398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000001011091609931988600160000100320039320039320039320039320039
16020432003823980000007262398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832003811160201100991001001600001000000000001011091609431988600160000100320039320039320039320039320039
160204320038239700003006123986525801001008000010080000500114368951320019032003832003829979032999968010020080000200240000320038320038111602011009910010016000010000000000010110101609931988600160000100320039320039320039320039320039
1602043200382397000000612398652580100100800001008000050011436895132001903200383200382997903299996801002008000020024000032003832023111160201100991001001600001000000000001011091609431988600160000100320039320039320039320232320039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024320038239701000006723986525800351080000108000050114379711153200190320038320038299857330009980171208021020240000320038320038511600211091010160000100000000000010022113181611124319887021191516000010320039320039320039320234320039
160024320038239900000005422398652580010108000010800005011436895115320019032003832003829983533000188001020800002024000032003832008621160021109101016000010000200300001002283181611162319887021191516000010320039320039320039320039320039
16002432003824250001000672398652580035108000010800005011436895115320019032003832003829981233000188001020800002024000032003832003811160021109101016000010000000000001002283141611144319887021191516000010320039320234320092320039320039
160024320038239700000407322398652580010108000010800005011436895115320019032003832003829981233000188017020800002024000032003832003811160021109101016000010000010000001002283141611142319887021191516000010320039320039320039320039320039
160024320038239700000096723986525800101080000108000050114368951153200190320038320038299812330001880010208000020240174320038320038111600211091010160000100000000000010022113141611262319887021193016000010320039320039320039320039320039
16002432003823970000000672398652580010108000010800005011436895115320019032003832003829981233000188001020800002024000032003832003811160021109101016000010000000000001002283121611142319887021191516000010320039320039320039320039320039
1600243200382397000000022382398652580010108000010800005011436895115320019332003832003829981233000188001020800002024000032003832003811160021109101016000010000000000011002283141611124319887021191516000010320039320039320039320233320039
1600243200382397000000024712398652580036108000010800005011436895115320019032003832003829981233000188001020800002024000032023232003811160021109101016000010000010300001002283161611145319887021191516000010320039320039320039320039320039
1600243200382398000000126723986525800101080024118000050114368951153200190322120322063300241194301259818932082573202461593223043219234911600211091010160000100200122120820001166183111737111811322598121201516000010322943323424323074323515325215
16002432496324346121636519833272398502580010108000010800005011436895015320019032003832003829981233000188001020800002024000032003832003811160021109101016000010000000000001002283141611142319887021191516000010320039320039320039320039320039

Test 6: throughput

Count: 16

Code:

  sha1c q0, s16, v17.4s
  sha1c q1, s16, v17.4s
  sha1c q2, s16, v17.4s
  sha1c q3, s16, v17.4s
  sha1c q4, s16, v17.4s
  sha1c q5, s16, v17.4s
  sha1c q6, s16, v17.4s
  sha1c q7, s16, v17.4s
  sha1c q8, s16, v17.4s
  sha1c q9, s16, v17.4s
  sha1c q10, s16, v17.4s
  sha1c q11, s16, v17.4s
  sha1c q12, s16, v17.4s
  sha1c q13, s16, v17.4s
  sha1c q14, s16, v17.4s
  sha1c q15, s16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0002

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204640038479400007474798652516010010016000010016000050022876895164001906400386400386197970661999216010020016000820048002464003864003811160201100991001001600001000000011110117116006398850160000100640081640039640039640039640039
1602046400384795000337264798652516010010016000010016000050022876895164001906400386400386197970661999216010020016000820048002464008664003811160201100991001001600001000000011110117016006398850160000100640039640039640039640039640039
1602046400384794000063744798652516010010016000010016000050022876895064001906400386400386197970661999216010020016000820048002464008564003811160201100991001001600001000003011110117016006398850160000100640039640039640039640039640039
160204640038479500079817794798652516010010016000010016000050022876895164001906400386400386197973661999216010020016000820048002464003864003811160201100991001001600001004400011110117016106398850160000100640039640039640039640039640039
16020464003847940000614798652516010010016000010016000050022876895064001906400386400386197970661999216010020016000820048002464008664003811160201100991001001600001000000011110117116006398850160000100640039640039640039640087640039
1602046400384794000071914798652516010010016000010016000050022876895064001906400386400386198130661999216010020016000820048002464003864003811160201100991001001600001000000011110117016006398850160000100640039640039640039640039640039
1602046400384794000039347986525160100100160000100160000500228768951640019064003864003861979706619992160100200160008200480024640038640038111602011009910010016000010000012411110117016006398850160000100640039640076640615640184640039
1602046400384794000022994798652516012710016000010016000050022876895164001906400386400386197970661999216010020016000820048002464003864003811160201100991001001600001000000011110117016006398850160000100640039640086640039640039640039
1602046400384794000071954798653716010010016000010016000050022876895164001906400386400386197970661999216010020016000820048002464003864007211160201100991001001600001000000011110117016006398850160000100640039640039640039640039640039
16020464003847940000614798652516010010016000010016000050022876895164001906400386400386197970661999216010020016000820048002464003864003811160201100991001001600001000000011110117016106398850160000100640039640039640039640039640039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0002

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600246400384795000000072479865251600101016000010160000502287689511640019064003864003861981203620018160010201600002048000064003864003811160021109101016000010000000600001002231121161111818639873016161316000010640087640039640039640039640085
16002464003847940000000732479865251600101016000010160040502287689511640019064003864003861981203620018160010201600002048000064003864003811160021109101016000010203000000001002232114161121717639873016161316000010640039640039640039640039640039
160024640038479400000002680479865251600101016000010160000502287689511640019064003864003861981203620018160010201600002048020464003864003811160021109101016000010000000000001002431117161111817639873016161316000010640039640039640039640039640039
16002464003847940000000732479865251600101016000010160000502287689511640019064003864003861981203620018160010201600002048000064003864003811160021109101016000010000000000001002231116161112118639873016161316000010640039640039640039640039640039
16002464003847940000000765479865251600101016000010160000502287689501640019064003864003861981203620018160010201600002048000064008564003811160021109101016000010000080040001002231116161111515639873016162616000010640039640039640039640039640039
1600246400384794000000067479865251600101016000010160000502287689511640053064003864003861981203620018160010201600002048000064003864003811160022109101016000010000010000101002431219161111917639873016161316000010640039640039640039640039640078
16002464003847940000090732479865251600101016000010160000502287689511640019064003864003861981233620018160010201600002048000064003864003811160021109101016000010400000000001015831123161211716639873016162616000010640075640039640039640039640039
160024640038479500000420753479865251600101016000010160000502287689511640019064003864003861981203620018160010201600002048000064003864003811160021109101016000010000000000001002231116161111724639873016161316000010640039640039640039640039640039
160024640038479400000390732479865251600101016000012160000502287689511640019064003864003861981203620018160010201600002048000064007964003811160021109101016000010000000000001002231120161111818639873016161316000010640039640039640039640039640039
160024640038479400000006995479865251600101016000010160000502287689511640019064003864003861981203620018160010201600002048000064003864003811160021109101016000010000000000001002231117161111617639873016161316000010640039640039640039640039640039