Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCSEL (scalar, S)

Test 1: Latency 1->2

Code:

  fcsel s0, s0, s1, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000030005092481010020001000620030018110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000420030012110000100
1020420033101011011000010010000030005092481010020001000620030261110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092471002020100062030012111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10025200661002921100082010034705092481002020100002030000111000010

Test 2: uops

Code:

  fcsel s0, s1, s0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

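retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 50248 | 1000 | 1000 | 3000 | 1 | 1000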

Test 3: Latency 1->3

Code:

  fcsel s0, s1, s0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300509248101002001000620030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420188101411051003610410108300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 4: Latency 1->4

Chain cycles: 2

Code:

  fcsel s0, s1, s2, lt
  fcmp d0, d3
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
2020440033202012012000020020000700101924720200200200062005001510110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020540066202092012000820020034700101924820200200200042005001010110000100
2020440033202012012000020020000650101924820200200200072005001510110000100
2020440033202012012000020020000700101924820200200200042005001010110000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
200244003320021212000020200007010192472002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200254006620029212000820200347010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010

Test 5: throughput

Count: 8

Code:

  fcsel s0, s8, s9, lt
  fcsel s1, s8, s9, lt
  fcsel s2, s8, s9, lt
  fcsel s3, s8, s9, lt
  fcsel s4, s8, s9, lt
  fcsel s5, s8, s9, lt
  fcsel s6, s8, s9, lt
  fcsel s7, s8, s9, lt
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
80204400438010710180006100800103003992528011020080014200240042180000100
80204400458010710180006100800103003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204401088015610180055100800593003992428010820080012200240189180000100
80204401128015510180054100800593003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
800244013280028218000720800116736806280032208001820240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006535621380074208006520240000118000010