Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA512H2

Test 1: uops

Code:

  sha512h2 q0, q1, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000

Test 2: Latency 1->1

Code:

  sha512h2 q0, q1, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
10024200341002121100000201000007002497691002020010004200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010
10024200341002121100000201000007002497691002020010000200300001101000010

Test 3: Latency 1->2

Code:

  sha512h2 q0, q0, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300379694101002001000620030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000307379850101202021003420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000703796941002020100042030018111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha512h2 q0, q1, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300379694101002001000620030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030105111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030096111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha512h2 q0, q8, v9.2d
  movi v1.16b, 0
  sha512h2 q1, q8, v9.2d
  movi v2.16b, 0
  sha512h2 q2, q8, v9.2d
  movi v3.16b, 0
  sha512h2 q3, q8, v9.2d
  movi v4.16b, 0
  sha512h2 q4, q8, v9.2d
  movi v5.16b, 0
  sha512h2 q5, q8, v9.2d
  movi v6.16b, 0
  sha512h2 q6, q8, v9.2d
  movi v7.16b, 0
  sha512h2 q7, q8, v9.2d
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020416003480101101800001008000030019997698010020080005200024001510160000100
16020516006880110101800091008002130019997698010020080004200024001510160000100
16020416003480101101800001008000030019997698010020080004200024009610160000100
16020416003480101101800001008000030019997698010020080004200024001210160000100
16020416003480101101800001008000030019997698010020080004200024009610160000100
16020416003480101101800001008000030019997698010020080004200024001210160000100
16020416003480101101800001008000030019997698010020080004200024001210160000100
16020416003480101101800001008000030019997698010020080004200024001210160000100
16020416003480101101800001008000030019997698010020080004200024001510160000100
16020416003480101101800001008000030019997698010020080004200024001210160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160024160034800111180000010800000300199976980010200800052002400001016000010
160025160068800201180009010800210300199976980010200800002002400001016000010
160024160034800111180000010800000300199976980010200800002002400001016000010
160024160034800111180000010800000300199976980010200800002002400001016000010
160024160034800111180000010800000300199976980010200800002002400001016000010
160024160034800111180000010800000300199991980031200800322002400931016000010
160024160034800111180000010800000300199976980010200800002002400001016000010
16002516006880020118000901080021030020000698005220080060366622531321201773838879501719
160025160068800201180009010800210300199976980010200800002002400001016000010
160024160034800111180000010800000300199976980010200800002002400001016000010

Test 6: throughput

Count: 16

Code:

  sha512h2 q0, q16, v17.2d
  sha512h2 q1, q16, v17.2d
  sha512h2 q2, q16, v17.2d
  sha512h2 q3, q16, v17.2d
  sha512h2 q4, q16, v17.2d
  sha512h2 q5, q16, v17.2d
  sha512h2 q6, q16, v17.2d
  sha512h2 q7, q16, v17.2d
  sha512h2 q8, q16, v17.2d
  sha512h2 q9, q16, v17.2d
  sha512h2 q10, q16, v17.2d
  sha512h2 q11, q16, v17.2d
  sha512h2 q12, q16, v17.2d
  sha512h2 q13, q16, v17.2d
  sha512h2 q14, q16, v17.2d
  sha512h2 q15, q16, v17.2d
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020432003416010110116000001001600003003999769160100200160004200048001810160000100
16020532006816011010116000901001600213003999769160100200160004200048010810160000100
16020432003416010110116000001001600003003999769160100200160004200048001210160000100
16020432003416010110116000001001600003003999769160100200160004200048009610160000100
16020432003416010110116000001001600003003999769160100200160004200048001210160000100
16020432003416010110116000001001600003003999769160100200160004200048001210160000100
16020432003416010110116000001001600003003999919160121200160032200048001210160000100
16020432003416010110116000001001600003003999769160100200160004200048001210160000100
16020432003416010110116000001001600003003999769160100200160004200048001210160000100
16020532006816011010116000901001600213003999769160100200160004200048001210160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1600243200341600111116000010160000030039999191600312001600362004800181016000010
1600243200341600111116000010160000030039997691600102001600002004800001016000010
1600243200341600111116000010160000030039997691600102001600002004800001016000010
1600243200341600111116000010160000030039997691600102001600002004800961016000010
1600243200341600111116000010160000030039997691600102001600002004800001016000010
1600243200341600111116000010160000030039997691600102001600002004800001016000010
1600243200341600111116000010160000030039999191600312001600322004800001016000010
1600243200341600111116000010160000030039997691600102001600002004800001016000010
1600243200341600111116000010160000030039999191600312001600362004801021016000010
1600243200341600111116000010160000030039997691600102001600002004800961016000010