Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA256H

Test 1: uops

Code:

  sha256h q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004403410011100010005060810001000300011000
1004403410011100010005060810001000307211000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000

Test 2: Latency 1->1

Code:

  sha256h q0, q1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024400341002121100002010000705096081002020100042030012111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705098821003820100242031065111000010
10024410051016121101402010360695142661032620104302030687111000010
10024411021017925101542410396705158111043320105872431674131000010
10024411541018221101612010414765156511041922105602031524111000010
10024407991012621101052010270675142661032620104312231359121000010
10024409071014721101262010324675145401034420104602231602121000010

Test 3: Latency 1->2

Code:

  sha256h q0, q0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030018110000100
1020450033101011011000010010000300639520101002001000620030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030084110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000706395201002020100062030018111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha256h q0, q1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450128101111011001010010034300639520101002001000620030012110000100
1020450365101381031003510210119316641088102072061015220030168110000100
1020450177101201051001510410051300639686101162001003220030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000706395201002020100062030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100042030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha256h q0, q8, v9.4s
  movi v1.16b, 0
  sha256h q1, q8, v9.4s
  movi v2.16b, 0
  sha256h q2, q8, v9.4s
  movi v3.16b, 0
  sha256h q3, q8, v9.4s
  movi v4.16b, 0
  sha256h q4, q8, v9.4s
  movi v5.16b, 0
  sha256h q5, q8, v9.4s
  movi v6.16b, 0
  sha256h q6, q8, v9.4s
  movi v7.16b, 0
  sha256h q7, q8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602041600358010110180000010080000300199975780100200800052002400121160000100
1602041600358010110180000010080000300199975880100200800042002400151160000100
1602041600358010110180000010080000300199975880100200800042002401021160000100
1602041600358010110180000010080000300200100280266200802082022403662160000100
1602041600358010110180000010080000300199975880100200800042002401021160000100
1602041600358010110180000010080000300199975880100200800042002400121160000100
1602041600358010110180000010080000300199975880100200800042002400121160000100
1602041600358010110180000010080000300199975880100200800042002400121160000100
1602041600358010110180000010080000300199975880100200800042002400121160000100
1602041600358010110180000010080000300199975880100200800042002401021160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1600241600358001111800000108000003001999915800322008003320240015116000010
1600241600358001111800000108000003001999758800102008000020240000116000010
1600241600358001111800000108000003001999916800322008003420240000116000010
160024160192800111180000010800001253645344426482113811438469622853141515542920240015116000010
1600241600358001111800000108000003001999758800102008000020240102116000010
15256314952282703848468977524280186863703001999769800342008002820240000116000010
1600241600358001111800000108000003001999758800102008000020240000116000010
1600241600358001111800000108000003001999758800102008000020240000116000010
1600241600358001111800000108000003001999915800322008003320240096116000010
1600241600358001111800000108000003001999758800102008000020240000116000010

Test 6: throughput

Count: 16

Code:

  sha256h q0, q16, v17.4s
  sha256h q1, q16, v17.4s
  sha256h q2, q16, v17.4s
  sha256h q3, q16, v17.4s
  sha256h q4, q16, v17.4s
  sha256h q5, q16, v17.4s
  sha256h q6, q16, v17.4s
  sha256h q7, q16, v17.4s
  sha256h q8, q16, v17.4s
  sha256h q9, q16, v17.4s
  sha256h q10, q16, v17.4s
  sha256h q11, q16, v17.4s
  sha256h q12, q16, v17.4s
  sha256h q13, q16, v17.4s
  sha256h q14, q16, v17.4s
  sha256h q15, q16, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16020432003516010110116000010016000003000399975716010020001600062004800121160000100
16020432003516010110116000010016000003000399991616012220001600362004800121160000100
16020432003516010110116000010016000003000399975816010020001600042004800121160000100
16020432003516010110116000010016000003000399991516012220001600342004800121160000100
16020432003516010110116000010016000003000399975816010020001600042004801081160000100
16020432003516010110116000010016000003000399991616012220001600322004800121160000100
16020432003516010110116000010016000003000399975816010020001600042004800121160000100
16020432003516010110116000010016000003000399975816010020001600042004800121160000100
16020432003516010110116000010016000003000399975816010020001600042004800121160000100
16020432003516010110116000010016000003000399975816010020001600042004800121160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002432003516001111160000010160000303999757160010201600062004800001016000010
16002432003516001111160000010160000303999930160032201600362004800001016000010
16002432003516001111160000010160000303999930160032201600362004800001016000010
16002532007216002111160010010160022303999758160010201600042004800001016000010
16002432003516001111160000010160000303999758160010201600002004801051016000010
16002432003516001111160000010160000303999758160010201600002004800001016000010
16002432003516001111160000010160000303999758160010201600002004801081016000010
16002432003516001111160000010160000303999916160032201600362004800001016000010
16002432003516001111160000010160000303999758160010201600002004800001016000010
16002432003516001111160000010160000303999916160032201600362004800001016000010