Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (scalar, D)

Test 1: uops

Code:

  fdiv d0, d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000
100410033100111000100012806910001000200011000

Test 2: Latency 1->2

Code:

  fdiv d0, d0, d1
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
102041000331010110110000100100003001289069101002001000620020008110000100
102041000331010110110000100100003001289241101152001003220020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100
102051000661010310110002100100153001289069101002001000420020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100
102041000331010110110000100100003001289069101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
100251000661002321100022010015701289069100202010004200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200561101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010
100241000331002121100002010000701289069100202010000200200001101000010

Test 3: Latency 1->3

Code:

  fdiv d0, d1, d0
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
102041000331010110110000100100000300012890691010020001000620020012110000100
102041000331010110110000100100000300012894131013020001005320020008110000100
102051000661010310110002100100150300012890691010020001000420020012110000100
102041000331010110110000100100000300012890691010020001000420020008110000100
102041000331010110110000100100000300012890691010020001000420020008110000100
102041000331010110110000100100000300012890691010020001000420020008110000100
102041000331010110110000100100000300012890691010020001000420020056110000100
102041000331010110110000100100000300012890691010020001000420020008110000100
102041000331010110110000100100000300012890691010020001000420020008110000100
102041000331010110110000100100000300012890691010020001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100006712892411003520100292020000111000010
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100007012890691002020100042020000111000010
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100007012890691002020100002020000111000010
1002410003310021211000020100007012890691002020100002020000111000010

Test 4: throughput

Count: 8

Code:

  fdiv d0, d8, d9
  fdiv d1, d8, d9
  fdiv d2, d8, d9
  fdiv d3, d8, d9
  fdiv d4, d8, d9
  fdiv d5, d8, d9
  fdiv d6, d8, d9
  fdiv d7, d8, d9
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
80204800568010110180000100800013003200148010320080010200160020180000100
80204800428010110180000100800013003200058010120080008200160016180000100
80204800428010110180000100800013003200058010120080008200160016180000100
80204800428010110180000100800013003201208013120080046200160088180000100
80204800428010110180000100800013003200058010120080008200160016180000100
80204800428010110180000100800013003200148010320080010200160016180000100
80204800498010210180001100800033003200058010120080008200160016180000100
80204800498010210180001100800033003200058010120080008200160016180000100
80204800428010110180000100800013003200058010120080008200160016180000100
80204800428010110180000100800013003200058010120080008200160016180000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
800248005780021218000020800027032001080022208000820160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010
800248004280021218000020800007032000080020208000020160000118000010