Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1SU0

Test 1: uops

Code:

  sha1su0 v0.4s, v1.4s, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000
1004203310011100010002476910001000300011000

Test 2: Latency 1->1

Code:

  sha1su0 v0.4s, v1.4s, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497671002020100042030012111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha1su0 v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300249769101002001000620030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000620030018110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497681002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha1su0 v0.4s, v1.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300249769101002001000420030018110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497681002020100062030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1su0 v0.4s, v8.4s, v9.4s
  movi v1.16b, 0
  sha1su0 v1.4s, v8.4s, v9.4s
  movi v2.16b, 0
  sha1su0 v2.4s, v8.4s, v9.4s
  movi v3.16b, 0
  sha1su0 v3.4s, v8.4s, v9.4s
  movi v4.16b, 0
  sha1su0 v4.4s, v8.4s, v9.4s
  movi v5.16b, 0
  sha1su0 v5.4s, v8.4s, v9.4s
  movi v6.16b, 0
  sha1su0 v6.4s, v8.4s, v9.4s
  movi v7.16b, 0
  sha1su0 v7.4s, v8.4s, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602048007780101101800001008000130032008880123200800342002400211160000100
1602048003480101101800001008000130032001080102200800072002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1600248012580011118000010800013032001080012208000720240000116000010
1600248003880011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240096116000010
1600248003480011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240000116000010
1600248003480011118000010800003032000080010208000020240000116000010

Test 6: throughput

Count: 4

Code:

  sha1su0 v0.4s, v4.4s, v5.4s
  sha1su0 v1.4s, v4.4s, v5.4s
  sha1su0 v2.4s, v4.4s, v5.4s
  sha1su0 v3.4s, v4.4s, v5.4s
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
402044003440101101400000100400013001600144010320040010200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120024140000100
402044003440101101400000100400013001600054010120040008200120120140000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024400484001111400001040001301600004001020400002012000014000010
40025400684002911400181040024301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010
40024400344001111400001040000301600004001020400002012000014000010