Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA512H

Test 1: uops

Code:

  sha512h q0, q1, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000
1004203410011100010002476910001000300011000

Test 2: Latency 1->1

Code:

  sha512h q0, q1, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100
1020420034101011011000010010000300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200341002121100002010000702497671002020100042030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010
10024200341002121100002010000702497691002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha512h q0, q0, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300379694101002001000620030012110000100
1020430033101011011000010010000300379850101182001003220030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030096110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379850101182001003220030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha512h q0, q1, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000030003796941010020001000620030018110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100
1020430033101011011000010010000030003796941010020001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10025300661002721100062010018703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha512h q0, q8, v9.2d
  movi v1.16b, 0
  sha512h q1, q8, v9.2d
  movi v2.16b, 0
  sha512h q2, q8, v9.2d
  movi v3.16b, 0
  sha512h q3, q8, v9.2d
  movi v4.16b, 0
  sha512h q4, q8, v9.2d
  movi v5.16b, 0
  sha512h q5, q8, v9.2d
  movi v6.16b, 0
  sha512h q6, q8, v9.2d
  movi v7.16b, 0
  sha512h q7, q8, v9.2d
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160204160034801011018000001008000030019997698010020080005200024001510160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100
160204160034801011018000001008000030019997698010020080004200024009610160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100
160204160034801011018000001008000030019999198012120080032200024001210160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100
160204160034801011018000001008000030019997698010020080004200024001210160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024160034800111180000108000003001999768800102008000020240000116000010
160024160034800111180000108000003001999769800102008000020240000116000010
160024160034800111180000108000003001999919800312008003220240000116000010
160024160034800111180000108000003001999769800102008000020240096116000010
160024160049800111180000108000003001999769800102008000020240000116000010
16002416003480011118000010800006426162895607462002136978601497775928026620240177116000010
160024160034800111180000108000003001999919800312008003220240000116000010
160025160068800201180009108002103001998761800102008000020240000116000010
160024160034800111180000108000003001999919800312008003220240000116000010
160024160034800111180000108000003001999769800102008000020240000116000010

Test 6: throughput

Count: 16

Code:

  sha512h q0, q16, v17.2d
  sha512h q1, q16, v17.2d
  sha512h q2, q16, v17.2d
  sha512h q3, q16, v17.2d
  sha512h q4, q16, v17.2d
  sha512h q5, q16, v17.2d
  sha512h q6, q16, v17.2d
  sha512h q7, q16, v17.2d
  sha512h q8, q16, v17.2d
  sha512h q9, q16, v17.2d
  sha512h q10, q16, v17.2d
  sha512h q11, q16, v17.2d
  sha512h q12, q16, v17.2d
  sha512h q13, q16, v17.2d
  sha512h q14, q16, v17.2d
  sha512h q15, q16, v17.2d
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160204320034160101101160000010016000030039997691601002001600062004801081160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100
160204320034160101101160000010016000030039999191601212001600362004800121160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100
160205320068160110101160009010016002130039999191601212001600362004800121160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100
160204320034160101101160000010016000030039997691601002001600042004800121160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024320034160011111600000101600003039997691600102016000020480000116000010
160024320034160011111600000101600003039997691600102016000020480000116000010
160024320034160011111600000101600003039997691600102016000020480108116000010
160024320034160011111600000101600003039997691600102016000020480000116000010
160025320068160020111600090101600213039997691600102016000020480000116000010
160024320034160011111600000101600003039997691600102016000020480096116000010
160025320068160020111600090101600213039997691600102016000020480000116000010
160024320034160011111600000101600003039997691600102016000020480000116000010
160024320034160011111600000101600003039997691600102016000020480000116000010
160024320034160011111600000101600003039999191600312016003320480000116000010