Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA256H2

Test 1: uops

Code:

  sha256h2 q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000

Test 2: Latency 1->1

Code:

  sha256h2 q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
100244003410021211000002010000705096081002020100042030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100244003410021211000002010000705096081002020100002030000111000010
100254006810026211000502010017705096081002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha256h2 q0, q0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
102045003310101101100001001000030063952010100200100062000300181010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063968610116200100282000300121010000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010

Test 4: Latency 1->3

Code:

  sha256h2 q0, q1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030018110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020550066101051011000410010016300639520101002001000420030018110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1002450033100212110000201000070639520100202010006200300001101000010
1002450033100212110000201000070639686100362010032200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010
1002450033100212110000201000070639520100202010000200300001101000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha256h2 q0, q8, v9.4s
  movi v1.16b, 0
  sha256h2 q1, q8, v9.4s
  movi v2.16b, 0
  sha256h2 q2, q8, v9.4s
  movi v3.16b, 0
  sha256h2 q3, q8, v9.4s
  movi v4.16b, 0
  sha256h2 q4, q8, v9.4s
  movi v5.16b, 0
  sha256h2 q5, q8, v9.4s
  movi v6.16b, 0
  sha256h2 q6, q8, v9.4s
  movi v7.16b, 0
  sha256h2 q7, q8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602041600358010110180000010080000030001999757801002000800052002400121160000100
1602041600358010110180000010080000030001999758801002000800042002400121160000100
1602041600358010110180000010080000030001999930801222000800342002400151160000100
1602041600358010110180000010080000030001999758801002000800042002400121160000100
1602041600358010110180000010080000030001999758801002000800042002400121160000100
1602041600358010110180000010080000030001999758801002000800042002400121160000100
1602041600358010110180000010080000030001999758801002000800042022400992160000100
1602051600708011110180010010080022030001999758801002000800042002400121160000100
1602041600358010110180000010080000030001999758801002000800042002400961160000100
169642197455884715521799622988514579978030001999757801002000800052002400121160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002416003580011118000010800000300199975780010200800052002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010
160025160072800211180010108002203001999758800102008000031215631240290128226631600391898
16002416003580011118000010800000300199975880010200800002002400001016000010
16002416003580011118000010800000300199975880010200800002002400001016000010

Test 6: throughput

Count: 16

Code:

  sha256h2 q0, q16, v17.4s
  sha256h2 q1, q16, v17.4s
  sha256h2 q2, q16, v17.4s
  sha256h2 q3, q16, v17.4s
  sha256h2 q4, q16, v17.4s
  sha256h2 q5, q16, v17.4s
  sha256h2 q6, q16, v17.4s
  sha256h2 q7, q16, v17.4s
  sha256h2 q8, q16, v17.4s
  sha256h2 q9, q16, v17.4s
  sha256h2 q10, q16, v17.4s
  sha256h2 q11, q16, v17.4s
  sha256h2 q12, q16, v17.4s
  sha256h2 q13, q16, v17.4s
  sha256h2 q14, q16, v17.4s
  sha256h2 q15, q16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16020432003516010110116000010016000030039997571601002001600062004800181160000100
16020532007216011110116001010016002230039999301601222001600332004800121160000100
16020532007216011110116001010016002230039997581601002001600042004800121160000100
16020432003516010110116000010016000030039997581601002001600042004800961160000100
16020432003516010110116000010016000030039997581601002001600042004800121160000100
16020432003516010110116000010016000030039997581601002001600042004800121160000100
16020432003516010110116000010016000030039999161601222001600352004800121160000100
16020432003516010110116000010016000030039997581601002001600042004800121160000100
16020432003516010110116000010016000030039997581601002001600042004800121160000100
16020532007216011110116001010016002230039997581601002001600042004800121160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024320035160011111600000101600003039997571600102016000020480000116000010
160024320035160011111600000101600003039999161600322016003620480000116000010
160024320035160011111600000101600003039997581600102016000020480000116000010
160024320035160011111600000101600003040000881600542016006320480000116000010
160024320035160011111600000101600003039997581600102016000020480000116000010
160024320035160011111600000101600003039999161600322016003720480000116000010
160024320035160011111600000101600003039997581600102016000020480000116000010
160025320070160021111600100101600223039997581600102016000020480000116000010
160025320070160021111600100101600223039997581600102016000020480000116000010
160024320035160011111600000101600003039997571600102016000020480000116000010