Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1SU1

Test 1: uops

Code:

  sha1su1 v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000
1004203310011100010002476910001000200011000

Test 2: Latency 1->1

Code:

  sha1su1 v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300249769101002001000620020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249907101212001003620020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100
1020420033101011011000010010000300249769101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010

Test 3: Latency 1->2

Code:

  sha1su1 v0.4s, v0.4s
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10205200661010910110008010010021300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100
10204200331010110110000010010000300249769101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000702497691002020100042020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000682499071004120100322020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010
10024200331002121100002010000702497691002020100002020000111000010

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1su1 v0.4s, v8.4s
  movi v1.16b, 0
  sha1su1 v1.4s, v8.4s
  movi v2.16b, 0
  sha1su1 v2.4s, v8.4s
  movi v3.16b, 0
  sha1su1 v3.4s, v8.4s
  movi v4.16b, 0
  sha1su1 v4.4s, v8.4s
  movi v5.16b, 0
  sha1su1 v5.4s, v8.4s
  movi v6.16b, 0
  sha1su1 v6.4s, v8.4s
  movi v7.16b, 0
  sha1su1 v7.4s, v8.4s
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602048006680101101800001008000103000320005801012000800062001600121160000100
1602048003480101101800001008000203000320005801012000800062001600121160000100
1602048003480101101800001008000203000320409802012000801112001600121160000100
1602048003480101101800001008000103000320005801012000800062001600121160000100
1602048003480101101800001008000103000320005801012000800062001600121160000100
1602048003480101101800001008000103000320005801012000800062001600121160000100
1602048003480101101800001008000103000320005801012000800062001600121160000100
1602048003480101101800001008000103000320005801012000800062001600121160000100
1602048031880197101800961008010203000320088801232000800332001600121160000100
1602048003480101101800001008000203000320005801012000800062001600121160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1600248014280011118000010800023032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600258006880028118001710800233032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010
1600248003480011118000010800003032000080010208000020160000116000010

Test 5: throughput

Count: 16

Code:

  sha1su1 v0.4s, v16.4s
  sha1su1 v1.4s, v16.4s
  sha1su1 v2.4s, v16.4s
  sha1su1 v3.4s, v16.4s
  sha1su1 v4.4s, v16.4s
  sha1su1 v5.4s, v16.4s
  sha1su1 v6.4s, v16.4s
  sha1su1 v7.4s, v16.4s
  sha1su1 v8.4s, v16.4s
  sha1su1 v9.4s, v16.4s
  sha1su1 v10.4s, v16.4s
  sha1su1 v11.4s, v16.4s
  sha1su1 v12.4s, v16.4s
  sha1su1 v13.4s, v16.4s
  sha1su1 v14.4s, v16.4s
  sha1su1 v15.4s, v16.4s
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160205160082160118101160017100160022300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100
160205160068160118101160017100160022300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100
160204160034160101101160000100160001300640005160101200160008200032007210160000100
160204160034160101101160000100160001300640005160101200160008200032001610160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1600241600401600121116000110160003306400001600102016000020320000116000010
1600241600341600111116000010160000306400921600342016003720320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010
1600241600341600111116000010160000306400001600102016000020320000116000010