Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (vector, 2S)

Test 1: uops

Code:

  fsqrt v0.2s, v0.2s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000
10041003310011100010000001280691000001000100011000

Test 2: Latency 1->2

Code:

  fsqrt v0.2s, v0.2s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
102041000331010110110000100100000300012890691010020001000620010030110000100
102041000331010110110000100100000300012890691010020001000420010004110000100
102041000331010110110000100100000300012890691010020001000420010004110000100
102041000331010110110000100100000300012890691010020001000420010004110000100
102041000331010110110000100100000300012890691010020001000420010004110000100
102041000331010110110000100100000300012890691010020001000420010004110000100
102041000331010110110000100100000300012892411011520001003220010004110000100
102041000331010110110000100100000300012890691010020001000420010004110000100
102041000331010110110000100100000300012892411011520001002820010004110000100
102041000331010110110000100100000300012890691010020001000420010004110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1002410003310021211000020100007012890691002020100062010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002510006610023211000220100157012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010000111000010
1002410003310021211000020100007012890691002020100002010028111000010

Test 3: throughput

Count: 8

Code:

  fsqrt v0.2s, v8.2s
  fsqrt v1.2s, v8.2s
  fsqrt v2.2s, v8.2s
  fsqrt v3.2s, v8.2s
  fsqrt v4.2s, v8.2s
  fsqrt v5.2s, v8.2s
  fsqrt v6.2s, v8.2s
  fsqrt v7.2s, v8.2s
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
8020516008280113101800120100800253001999769801002008000420080004180000100
8020416004180101101800000100800003001999769801002008000420080004180000100
8020416032880181101800800100801353001999769801002008000420080004180000100
8020416004180101101800000100800003001999769801002008000420080036180000100
8020416004180101101800000100800003001999769801002008000420080040180000100
8020416004180101101800000100800003001999769801002008000420080004180000100
8020416004180101101800000100800003002000007801252008004020080004180000100
8020416004180101101800000100800003001999769801002008000420080004180000100
8020416004180101101800000100800003001999769801002008000420080004180000100
8020416004180101101800000100800003002000007801252008004020080004180000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
8002416004180021218000020800000700199976980020200800002080004118000010
8002516008280033218001220800250700199976980020200800002080000118000010
8002416004180021218000020800000700199976980020200800002080000118000010
8002416004180021218000020800000700199976980020200800002080037118000010
8002416004180021218000020800000700199976980020200800002080000118000010
8002416004180021218000020800000700199976980020200800002080000118000010
8002416004180021218000020800000700199976980020200800002080038118000010
8002416004180021218000020800000700200000780045200800402080000118000010
8002416004180021218000020800000700199976980020200800002080000118000010
8002416004180021218000020800000700199976980020200800002080000118000010