Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SHA1P

Test 1: uops

Code:

  sha1p q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000300011000
1004403410011100010005060810001000307511000

Test 2: Latency 1->1

Code:

  sha1p q0, s1, v2.4s
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100
1020440034101011011000010010000300509608101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0034

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024400341002121100002010000705096081002020100002030000111000010
10025400681002621100052010017705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030000111000010
10024400341002121100002010000705096081002020100002030078111000010

Test 3: Latency 1->2

Code:

  sha1p q0, s0, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020550066101051011000410010016300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010
806647015830687869314979936943706395201002020100042030000111000010
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010
100245003310021211000002010000706395201002020100002030000111000010

Test 4: Latency 1->3

Code:

  sha1p q0, s1, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020450033101011011000010010000300639520101002001000620030018110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000307639686101182021002820030018110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420230090110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100
1020450033101011011000010010000300639520101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 5.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024500331002121100002010000706395201002020100042030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030090111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010
10025500661002521100042010016706395201002020100002030000111000010
10024500331002121100002010000706395201002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  sha1p q0, s8, v9.4s
  movi v1.16b, 0
  sha1p q1, s8, v9.4s
  movi v2.16b, 0
  sha1p q2, s8, v9.4s
  movi v3.16b, 0
  sha1p q3, s8, v9.4s
  movi v4.16b, 0
  sha1p q4, s8, v9.4s
  movi v5.16b, 0
  sha1p q5, s8, v9.4s
  movi v6.16b, 0
  sha1p q6, s8, v9.4s
  movi v7.16b, 0
  sha1p q7, s8, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020432003480101101800001008000030040796088010020080002200024000910160000100
16020432003480101101800001008000030040796088010020080002200024008410160000100
16020432003480101101800001008000030040796088010020080002200024008110160000100
16020532006880106101800051008001730040796088010020080002200024000610160000100
16020732013780116101800151008005130040796088010020080002200024000610160000100
16020732013680116101800151008005130040799588013420080050200024000610160000100
16020432003480101101800001008000030040796088010020080002200024008410160000100
16020432003480101101800001008000030040796088010020080002200024029410160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100
16020432003480101101800001008000030040796088010020080002200024000610160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024321396801861180175108045033408577880425228057220241914116000010
160024321449801931180182108046837408700680498228065822241716216000010
160024321397801861180175108045030408399280298208039420240444116000010
160024320142800251180014108003630407960880010208000220240000116000010
160024320034800111180000108000030407960880010208000020240000116000010
160025320069800161180005108001730407960880010208000020240000116000010
160024320034800111180000108000030407960880010208000020240000116000010
160024320034800111180000108000030407960880010208000020240084116000010
160024320034800111180000108000030407978380027208002820240000116000010
160024320034800111180000108000030407960880010208000020240000116000010

Test 6: throughput

Count: 4

Code:

  sha1p q0, s4, v5.4s
  sha1p q1, s4, v5.4s
  sha1p q2, s4, v5.4s
  sha1p q3, s4, v5.4s
  movi v4.16b, 5
  movi v5.16b, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 4.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
4020416003440101101400001004000030020396084010020040006200120012140000100
4020416003440101101400001004000030020396084010020040004200120012140000100
4020416003440101101400001004000030020396084010020040004200120012140000100
4020516006840106101400051004001730020396084010020040004200120012140000100
4020416003440101101400001004000030020396084010020040004200120012140000100
4020416003440101101400001004000030020396084010020040004200120096140000100
4020416003440101101400001004000030020396084010020040004200120084140000100
4020516005840101101400001004000030020396084010020040004200120012140000100
4020416003440101101400001004000030020396084010020040004200120096140000100
4020416003440101101400001004000030020396084010020040004200120012140000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 4.0008

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
400241600344001111400001040000302039608400102040006200120000104000010
400241600344001111400001040000302039608400102040000200120000104000010
400241600824001811400071040018302039608400102040000200120000104000010
400241600344001111400001040000302040156400462040049200120000104000010
400241600344001111400001040000302039608400102040000200120000104000010
400241600344001111400001040000302039608400102040000200120000104000010
400241600344001111400001040000302039608400102040000200120000104000010
400241600344001111400001040000302039783400272040029200120000104000010
400241600344001111400001040000302039608400102040000200120000104000010
400241601314002511400141040036302039608400102040000200120000104000010