Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FMLS (vector, 4H)

Test 1: uops

Code:

  fmls v0.4h, v1.4h, v2.4h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

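retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 4033 | 1001 | 1 | 1000 | 1000 | 101557 | 1000 | 1000 | 3000 | 1 | 1000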

Test 2: Latency 1->1

Code:

  fmls v0.4h, v1.4h, v2.4h
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030132110000100
10204400331010110110000100100003001028557101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
100244003310021211000020100007010288901005020100412030012111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100006610289041005020100442030012111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010

Test 3: Latency 1->2

Code:

  fmls v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1020440033101011011000010010000300102855710100200100062000300121010000100
1020440033101011011000010010000300102855710100200100042000300121010000100
1020440033101011011000010010000300102855710100200100042000300121010000100
1020440033101011011000010010000300102855710100200100042000300121010000100
1020440033101011011000010010000300102855710100200100042000300121010000100
1020440033101011011000010010000300102855710100200100042020301382010000100
1020440033101011011000010010000300102855710100200100042000300121010000100
102044003310101101100001001000011438525738197637962000300121010000100
1020440033101011011000010010000300102855710100200100042000300121010000100
1020440033101011011000010010000300102855710100200100042000300121010000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030132111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010

Test 4: Latency 1->3

Code:

  fmls v0.4h, v1.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10204400331010110110000100100003001028557101002001000620030018110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030012110000100
10204400331010110110000100100003001028557101002001000420030132110000100
10204400331010110110000100100003001028557101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 4.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100254006610025211000420100307010285571002020100002030000111000010
100254006610025211000420100307010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010289041005020100462030000111000010
100244003310021211000020100007010285571002020100002030129111000010
100244003310021211000020100007010285571002020100002030000111000010
100244003310021211000020100007010285571002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  fmls v0.4h, v8.4h, v9.4h
  movi v1.16b, 0
  fmls v1.4h, v8.4h, v9.4h
  movi v2.16b, 0
  fmls v2.4h, v8.4h, v9.4h
  movi v3.16b, 0
  fmls v3.4h, v8.4h, v9.4h
  movi v4.16b, 0
  fmls v4.4h, v8.4h, v9.4h
  movi v5.16b, 0
  fmls v5.4h, v8.4h, v9.4h
  movi v6.16b, 0
  fmls v6.4h, v8.4h, v9.4h
  movi v7.16b, 0
  fmls v7.4h, v8.4h, v9.4h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160204404918011010180009100800133003200568011320080013200024003910160000100
160204401108011010180009100800133003200528011220080012200024003910160000100
160204400868010910180008100800123003200528011220080012200024003610160000100
160204400868010910180008100800123003200528011220080012200024003610160000100
160204400868010910180008100800123003200528011220080012200024003610160000100
160204400868010910180008100800123003200528011220080012200024003610160000100
160204400868010910180008100800123003200528011220080012200024014410160000100
160204400868010910180008100800123003200528011220080012200024003610160000100
160204400968010910180008100800123003200528011220080012200024003610160000100
160204400868010910180008100800123003200528011220080012200024003610160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
160024437638002011800091080013303200008001020800002002401501016000010
160024413518001111800001080000303200008001020800002002400001016000010
160024403928001111800001080000303200008001020800002002400001016000010
160024404068001111800001080000303200008001020800002002400001016000010
160024404428001111800001080000303200008001020800002002400001016000010
160024404428001111800001080000303200008001020800002002400001016000010
160024405288001111800001080000303200008001020800002002400001016000010
160024404158001111800001080000303200008001020800002002400001016000010
160024404088001111800001080000303200008001020800002002401471016000010
160024405498001111800001080000303200008001020800002002400001016000010

Test 6: throughput

Count: 16

Code:

  fmls v0.4h, v16.4h, v17.4h
  fmls v1.4h, v16.4h, v17.4h
  fmls v2.4h, v16.4h, v17.4h
  fmls v3.4h, v16.4h, v17.4h
  fmls v4.4h, v16.4h, v17.4h
  fmls v5.4h, v16.4h, v17.4h
  fmls v6.4h, v16.4h, v17.4h
  fmls v7.4h, v16.4h, v17.4h
  fmls v8.4h, v16.4h, v17.4h
  fmls v9.4h, v16.4h, v17.4h
  fmls v10.4h, v16.4h, v17.4h
  fmls v11.4h, v16.4h, v17.4h
  fmls v12.4h, v16.4h, v17.4h
  fmls v13.4h, v16.4h, v17.4h
  fmls v14.4h, v16.4h, v17.4h
  fmls v15.4h, v16.4h, v17.4h
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020480082160105101160004100160008030006400441601102000160014200048004210160000100
16020480036160107101160006100160010030006400361601082000160012200048004210160000100
16020480036160107101160006100160010030006402241601582000160066200048003910160000100
160204800361601051011600041001600080300064004416011020001600131112755480192413285160001502
16020480036160107101160006100160010030006400441601102000160013200048004210160000100
16020480036160107101160006100160010030006400441601102000160013200048003610160000100
16020480036160105101160004100160008030006400361601082000160012200048003610160000100
16020480036160105101160004100160008030006400361601082000160012200048003610160000100
16020480036160105101160004100160008030006400361601082000160012200048003610160000100
16020480036160105101160004100160008030006400361601082000160012200048003610160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5003

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024801991600171116000610160010306400001600102016000020480000116000010
160024800711600111116000010160000306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010
160025800861600561116004510160057306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010
160024800431600111116000010160000306400001600102016000020480000116000010