Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA512SU1

Test 1: uops

Code:

  sha512su1 v0.2d, v1.2d, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

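retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1 | 1000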

Test 2: Latency 1->1

Code:

  sha512su1 v0.2d, v1.2d, v2.2d
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

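retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100
10204 | 20033 | 10101 | 101 | 10000 | 100 | 10000 | 300 | 249769 | 10100 | 200 | 10004 | 200 | 30012 | 1 | 10000 | 100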

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497681002020100062030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702499071004120100352030000111000010
10024200331002121100002010000702497691002020100002030012111000010
10024200331002121100002010000702497691002020100002030000111000010
10025200661002921100082010021702497691002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha512su1 v0.2d, v0.2d, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10204200331010110110000010010000300249769101002001000620030012110000100
10205200661010910110008010010021300249769101002001000420030012110000100
10205200661010910110008010010021300249769101002001000620030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100
10204200331010110110000010010000300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497691002020100042030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha512su1 v0.2d, v1.2d, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300249769101002001000620030018110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020420033101011011000010010000300249769101002001000420030012110000100
1020520066101091011000810010021300249769101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497691002020100042030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010
10024200331002121100002010000702497691002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha512su1 v0.2d, v8.2d, v9.2d
  movi v1.16b, 0
  sha512su1 v1.2d, v8.2d, v9.2d
  movi v2.16b, 0
  sha512su1 v2.2d, v8.2d, v9.2d
  movi v3.16b, 0
  sha512su1 v3.2d, v8.2d, v9.2d
  movi v4.16b, 0
  sha512su1 v4.2d, v8.2d, v9.2d
  movi v5.16b, 0
  sha512su1 v5.2d, v8.2d, v9.2d
  movi v6.16b, 0
  sha512su1 v6.2d, v8.2d, v9.2d
  movi v7.16b, 0
  sha512su1 v7.2d, v8.2d, v9.2d
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602048010880101101800001008000130032001080102200800072002400211160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002401021160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002400181160000100
1602048003480101101800001008000130032000580101200800062002401021160000100
1602048011680101101800001008000230032001080102200800072002400181160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16002480068800111180000010800023032001080012208000720240000116000010
16002480038800111180000010800003032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010
16002580068800281180017010800233032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010
16002480034800111180000010800003032000080010208000020240000116000010

Test 6: throughput

Count: 16

Code:

  sha512su1 v0.2d, v16.2d, v17.2d
  sha512su1 v1.2d, v16.2d, v17.2d
  sha512su1 v2.2d, v16.2d, v17.2d
  sha512su1 v3.2d, v16.2d, v17.2d
  sha512su1 v4.2d, v16.2d, v17.2d
  sha512su1 v5.2d, v16.2d, v17.2d
  sha512su1 v6.2d, v16.2d, v17.2d
  sha512su1 v7.2d, v16.2d, v17.2d
  sha512su1 v8.2d, v16.2d, v17.2d
  sha512su1 v9.2d, v16.2d, v17.2d
  sha512su1 v10.2d, v16.2d, v17.2d
  sha512su1 v11.2d, v16.2d, v17.2d
  sha512su1 v12.2d, v16.2d, v17.2d
  sha512su1 v13.2d, v16.2d, v17.2d
  sha512su1 v14.2d, v16.2d, v17.2d
  sha512su1 v15.2d, v16.2d, v17.2d
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160205160074160120101160019100160025030006400141601032000160010200048003010160000100
160204160034160101101160000100160001030006400051601012000160008200048002410160000100
160204160034160101101160000100160001030006400051601012000160008200048010810160000100
160204160034160101101160000100160001030006400051601012000160008200048003010160000100
160204160034160101101160000100160001030006400051601012000160008200048002410160000100
160204160034160101101160000100160001030006400051601012000160008200048002410160000100
160204160034160101101160000100160001030006400961601252000160040200048002410160000100
160204160034160101101160000100160001030006400051601012000160008200048002410160000100
160204160034160101101160000100160001030006400051601012000160008200048010810160000100
160205160068160119101160018100160024030006400051601012000160008200048002410160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16002516006816002811160017010160023306400051600112016000820480000116000010
16002416003416001111160000010160000306400001600102016000020480000116000010
16002416003416001111160000010160000306400001600102016000020480000116000010
16002416003416001111160000010160000306400001600102016000020480159116000010
16002416003416001111160000010160001306400001600102016000020480000116000010
16002416003416001111160000010160000306400001600102016000020480000116000010
16002416003416001111160000010160000306400001600102016000020480000116000010
16002416027516013111160120010160125306400001600102016000020480000116000010
16317317889116307520561600249951872160029306400001600102016000020480000116000010
16002416004116001211160001010160003306400141600132016001020480000116000010