Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1C

Test 1: uops

Code:

  sha1c q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000

Test 2: Latency 1->1

Code:

  sha1c q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020440034101011011000010010000307509783101192021003220030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024400341002121100002010000705096081002020100062030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha1c q0, s0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030018110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020550066101051011000410010016300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000070063952010020200100062030000111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063952010020200100002030096111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063952010020200100002030000111000010
10024500331002121100002010000070063968610036200100282030018111000010

Test 4: Latency 1->3

Code:

  sha1c q0, s1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
102045003310101101100001001000030063952010100200100062000300181010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042020300902010000100
102045003310101101100001001000030063952010100200100042000300121010000100
102045003310101101100001001000030063952010100200100042000300121010000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030096111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1c q0, s8, v9.4s
  movi v1.16b, 0
  sha1c q1, s8, v9.4s
  movi v2.16b, 0
  sha1c q2, s8, v9.4s
  movi v3.16b, 0
  sha1c q3, s8, v9.4s
  movi v4.16b, 0
  sha1c q4, s8, v9.4s
  movi v5.16b, 0
  sha1c q5, s8, v9.4s
  movi v6.16b, 0
  sha1c q6, s8, v9.4s
  movi v7.16b, 0
  sha1c q7, s8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602043200348010110180000010080000030004079608801002000800032002400091160000100
1602043200348010110180000010080000030004079608801002000800022002400061160000100
1602043200348010110180000010080000030004079783801172000800282002400061160000100
1602043200348010110180000010080000030004079608801002000800022002400061160000100
1602043200348010110180000010080000030004079608801002000800022002400061160000100
1602043200348010110180000010080000030004079608801002000800022002400841160000100
1602043200348010110180000010080000030004079608801002000800022002400061160000100
1602043200348010110180000010080000030004079608801002000800022002400061160000100
1602043200348010110180000010080000030004079608801002000800022002400061160000100
1602053200698010610180005010080017030004079608801002000800022002400061160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002432003480011118000010800000300407978380027200800272002400841016000010
16002432003480011118000010800000300407960880010200800002002400001016000010
16002432003480011118000010800000300407960880010200800002002400001016000010
16002632010280021118001010800340300407960880010200800002002400091016000010
16002432003480011118000010800000300407960880010200800002002400001016000010
16002432003480011118000010800000300407960880010200800002202400782016000010
16002432003480011118000010800000300407960880010200800002002400001016000010
16002432003480011118000010800000300407960880010200800002002400001016000010
16002432003480011118000010800000300407978380027200800282002400001016000010
16002432003480011118000010800000300407960880010200800002002400001016000010

Test 6: throughput

Count: 4

Code:

  sha1c q0, s4, v5.4s
  sha1c q1, s4, v5.4s
  sha1c q2, s4, v5.4s
  sha1c q3, s4, v5.4s
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
4020416003440101101400001004000003000203960840100200040006200120012140000100
4020416008340108101400071004001803000203960840100200040004200120012140000100
4020416003440101101400001004000003000203960840100200040004200120093140000100
4020516006840106101400051004001703000203960840100200040004200120012140000100
4020416003440101101400001004000003000204005740135200040056200120012140000100
4020516006840106101400051004001703000203960840100200040004200120012140000100
4020416003440101101400001004000003000203960840100200040004200120012140000100
4020416008340108101400071004001803000203978340117200040030200120018140000100
4020416008240108101400071004001803000203960840100200040006200120093140000100
4020416003440101101400001004000003000203960840100200040004200120012140000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
40024160034400111140000010400003020396084001020400062012000014000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012016814000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012000014000010
40024160034400111140000010400003020396084001020400002012000014000010