Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1M

Test 1: uops

Code:

  sha1m q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000

Test 2: Latency 1->1

Code:

  sha1m q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010

Test 3: Latency 1->2

Code:

  sha1m q0, s0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030018110000100
1020450033101011011000010010000300639520101002001000620030099110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10025500661002521100042010016706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha1m q0, s1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030018110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020550066101051011000410010016300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000666396861003620100322030018111000010
10024500331002121100002010000706395201002020100042030012111000010
10024500331002121100002010000706395201002020100042030012111000010
10024500331002121100002010000706395201002020100042030012111000010
10024500331002121100002010000706395201002020100042030087111000010
10024500331002121100002010000706395201002020100042030012111000010
10024500331002121100002010000686396861003620100292030012111000010
10025500661002521100042010016706395201002020100042030012111000010
10025500661002521100042010016696396861003620100322030012111000010
10024500331002121100002010000706395201002020100042030012111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1m q0, s8, v9.4s
  movi v1.16b, 0
  sha1m q1, s8, v9.4s
  movi v2.16b, 0
  sha1m q2, s8, v9.4s
  movi v3.16b, 0
  sha1m q3, s8, v9.4s
  movi v4.16b, 0
  sha1m q4, s8, v9.4s
  movi v5.16b, 0
  sha1m q5, s8, v9.4s
  movi v6.16b, 0
  sha1m q6, s8, v9.4s
  movi v7.16b, 0
  sha1m q7, s8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020432003480101101800001008000030040796088010020080003200024008410160000100
16020532006880106101800051008001730040796088010020080002200024000610160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100
16020432003480101101800001008000030040797838011720080028200024000610160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100
16020532006880106101800051008001730040796088010020080002200024000610160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100
16020532006880106101800051008001730040796088010020080003200024000610160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002432003480011118000010800003040796088001020800022002400001016000010
16002532006880016118000510800173040803318006320800772002400001016000010
16002432003480011118000010800003040797838002720800262002400001016000010
16002532006880016118000510800173040796088001020800002002400001016000010
16002432003480011118000010800003040796088001020800002002400781016000010
16002432003480011118000010800003040796088001020800002002400001016000010
16002432003480011118000010800003040796088001020800002002400841016000010
16002432003480011118000010800003040796088001020800002002400001016000010
16002432003480011118000010800003040797838002720800262002400001016000010
16002432003480011118000010800003040796088001020800002002400781016000010

Test 6: throughput

Count: 4

Code:

  sha1m q0, s4, v5.4s
  sha1m q1, s4, v5.4s
  sha1m q2, s4, v5.4s
  sha1m q3, s4, v5.4s
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
402041600344010110140000100400003002039608401002004000620001200181040000100
402041600344010110140000100400003002039608401002004000420001200121040000100
402041600344010110140000100400003002039608401002004000420001200121040000100
402051600684010610140005100400173002039608401002004000420001200121040000100
402041600344010110140000100400003002039608401002004000420001200121040000100
402041600344010110140000100400003002039608401002004000420001200961040000100
402041600344010110140000100400003002039608401002004000420001200961040000100
402051600684010610140005100400173002039608401002004000620001200121040000100
402041600344010110140000100400003002039608401002004000620001200121040000100
402041600344010110140000100400003002039608401002004000420001200121040000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
4002416003440011114000001040000302039608400102040006200120000104000010
44684177291442722598400491625249340050302039608400102040000200120000104000010
4002416003440011114000001040000302039608400102040000200120000104000010
4002416003440011114000001040000302039608400102040000200120000104000010
4002416003440011114000001040000302039608400102040000200120000104000010
4002516006840016114000501040017302039608400102040000200120000104000010
4002416003440011114000001040000302039608400102040000200120000104000010
4002416003440011114000001040000302039608400102040000200120000104000010
4002416003440011114000001040000302039608400102040000200120084104000010
4002416003440011114000001040000302039608400102040000200120000104000010