Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FDIV (vector, 2D)

Test 1: uops

Code:

  fdiv v0.2d, v0.2d, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000
100411033100111000100014097810001000200011000

Test 2: Latency 1->2

Code:

  fdiv v0.2d, v0.2d, v1.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 11.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
102041100331010110110000010010000030001418978101002000100062020200522010000100
102041100331010110110000010010000030001419252101152000100272000200081010000100
102041100331010110110000010010000030001418978101002000100042000200081010000100
102041100331010110110000010010000030001418978101002000100042000200081010000100
102041100801010410110003010010015030001418978101002000100042000200081010000100
102041100331010110110000010010000030001419424101302000100522000200081010000100
102051100661010510310002010210015030001418978101002000100042000200081010000100
102041100331010110110000010010000030001418978101002000100042000200081010000100
102041100331010110110000010010000030001418978101002000100042000200081010000100
102041100331010110110000010010000030001418978101002000100042000200081010000100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1002411003310021211000020100007014189781002020100042020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100006914191501003520100322020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002511006610023211000220100157014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010

Test 3: Latency 1->3

Code:

  fdiv v0.2d, v1.2d, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 11.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10205110066101051031000210210015722303551040114191281192917388511002620020008110000100
102041100331010110110000100100000300014189781010020001000620020008110000100
102041100331010110110000100100000300014189781010020001000420020008110000100
102041100331010110110000100100000300014189781010020001000420020008110000100
102041100331010110110000100100000300014189781010020001000420020008110000100
102041100331010110110000100100000300014189781010020001000420020008110000100
102041100331010110110000100100000300014189781010020001000420020008110000100
102041100331010110110000100100000300014189781010020001000420020064110000100
102041100331010110110000100100000300014189781010020001000420020008110000100
102041100331010110110000100100000300014189781010020001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1002411003310021211000020100007014189781002020100062020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002511006610023211000220100157014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002511006610023211000220100157014189781002020100002020000111000010
1002411003310021211000020100007014189781002020100002020000111000010
1002411003310021211000020100007014191501003520100282020000111000010

Test 4: throughput

Count: 8

Code:

  fdiv v0.2d, v8.2d, v9.2d
  fdiv v1.2d, v8.2d, v9.2d
  fdiv v2.2d, v8.2d, v9.2d
  fdiv v3.2d, v8.2d, v9.2d
  fdiv v4.2d, v8.2d, v9.2d
  fdiv v5.2d, v8.2d, v9.2d
  fdiv v6.2d, v8.2d, v9.2d
  fdiv v7.2d, v8.2d, v9.2d
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
8020416004280101101800001008000003000199976980100200080006200160008180000100
8020416004280101101800001008000003000199976980100200080004200160008180000100
8020416004280101101800001008000003000200001980125200080040200160008180000100
8020416004280101101800001008000003000200001980125200080040200160008180000100
8020416004280101101800001008000003000199976980100200080004200160008180000100
8020416004280101101800001008000003000199976980100200080004200160008180000100
8020416004280101101800001008000003000199976980100200080004200160008180000100
8020416004280101101800001008000003000200001980125200080036200160008180000100
8020416004280101101800001008000003000199976980100200080004200160008180000100
8020416004280101101800001008000003000199976980100200080004200160008180000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
80024160042800212180000208000070199976880020208000620160000118000010
80024160042800212180000208000070199976980020208000020160074118000010
80024160042800212180000208000070199976980020208000020160000118000010
80024160042800212180000208000070199976980020208000020160000118000010
80024160042800212180000208000070199976980020208000020160000118000010
80024160042800212180000208000070199976980020208000020160136118000010
80024160042800212180000208000070199976980020208000020160000118000010
80024160042800212180000208000070199976980020208000020160000118000010
80024160042800212180000208000070199976980020208000020160000118000010
80024160042800212180000208000066200001980045208003720160000118000010