Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA256SU1

Test 1: uops

Code:

  sha256su1 v0.4s, v1.4s, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

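retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 3033 | 1001 | 1 | 1000 | 1000 | 37694 | 1000 | 1000 | 3000 | 1 | 1000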

Test 2: Latency 1->1

Code:

  sha256su1 v0.4s, v1.4s, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000673798501003820100322030012111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030096111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha256su1 v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300379694101002001000620030018110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020530066101091031000610210018300379694101002001000620030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha256su1 v0.4s, v1.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020430033101011011000010010000300379694101002001000620030018110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430129101191031001610210040300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100
1020430033101011011000010010000300379694101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000693798501003820100322030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10025300661002721100062010018703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010
10024300331002121100002010000703796941002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha256su1 v0.4s, v8.4s, v9.4s
  movi v1.16b, 0
  sha256su1 v1.4s, v8.4s, v9.4s
  movi v2.16b, 0
  sha256su1 v2.4s, v8.4s, v9.4s
  movi v3.16b, 0
  sha256su1 v3.4s, v8.4s, v9.4s
  movi v4.16b, 0
  sha256su1 v4.4s, v8.4s, v9.4s
  movi v5.16b, 0
  sha256su1 v5.4s, v8.4s, v9.4s
  movi v6.16b, 0
  sha256su1 v6.4s, v8.4s, v9.4s
  movi v7.16b, 0
  sha256su1 v7.4s, v8.4s, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16020480096801011018000001008000130032001080102200800072002400211160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100
16020480035801011018000001008000130032000580101200800062002400181160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1600248018980011118000010800023032000080010208000020240000116000010
1600248005480011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032008780033208003320240000116000010
1600248003580011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032000080010208000020240000116000010
1600248003580011118000010800003032000080010208000020240000116000010

Test 6: throughput

Count: 16

Code:

  sha256su1 v0.4s, v16.4s, v17.4s
  sha256su1 v1.4s, v16.4s, v17.4s
  sha256su1 v2.4s, v16.4s, v17.4s
  sha256su1 v3.4s, v16.4s, v17.4s
  sha256su1 v4.4s, v16.4s, v17.4s
  sha256su1 v5.4s, v16.4s, v17.4s
  sha256su1 v6.4s, v16.4s, v17.4s
  sha256su1 v7.4s, v16.4s, v17.4s
  sha256su1 v8.4s, v16.4s, v17.4s
  sha256su1 v9.4s, v16.4s, v17.4s
  sha256su1 v10.4s, v16.4s, v17.4s
  sha256su1 v11.4s, v16.4s, v17.4s
  sha256su1 v12.4s, v16.4s, v17.4s
  sha256su1 v13.4s, v16.4s, v17.4s
  sha256su1 v14.4s, v16.4s, v17.4s
  sha256su1 v15.4s, v16.4s, v17.4s
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160205160091160120101160019100160025300640087160123200160038200048011710160000100
160204160035160101101160000100160001300640005160101200160008200048002410160000100
160204160035160101101160000100160001300640005160101200160008200048002410160000100
160204160035160101101160000100160001300640005160101200160008200048012310160000100
160204160035160101101160000100160001300640005160101200160008200048002410160000100
160205160081160119101160018100160024300640014160103200160010200048002410160000100
160205160070160119101160018100160024300640005160101200160008200048010810160000100
160204160035160101101160000100160001300640100160126200160040200048002410160000100
160206160112160137101160036100160046300640005160101200160008200048002410160000100
160204160035160101101160000100160001307640096160127202160038200048011410160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16002416003516001111160000010160001030064000016001020016000020480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010
16002416003516001111160000010160000030064010016003620016004020480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010
16002416003516001111160000010160000030064008716003320016003620480000116000010
16002416003516001111160000010160000030064000016001020016000020480000116000010