Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCSEL (scalar, D)

Test 1: Latency 1->2

Code:

  fcsel d0, d0, d1, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300509248101002001000620030018110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030144110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092471002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10025200661002921100082010034705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 2: uops

Code:

  fcsel d0, d1, d0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000

Test 3: Latency 1->3

Code:

  fcsel d0, d1, d0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300509248101002001000620030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092471002020100002030000111000010
10024200331002121100002010000705097801005620100402030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 4: Latency 1->4

Chain cycles: 2

Code:

  fcsel d0, d1, d2, lt
  fcmp d0, d3
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
2020440033202012012000020020000700101924720200200200062005001510110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
200244003320021212000020200007010192472002020200062050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050110111000010

Test 5: throughput

Count: 8

Code:

  fcsel d0, d8, d9, lt
  fcsel d1, d8, d9, lt
  fcsel d2, d8, d9, lt
  fcsel d3, d8, d9, lt
  fcsel d4, d8, d9, lt
  fcsel d5, d8, d9, lt
  fcsel d6, d8, d9, lt
  fcsel d7, d8, d9, lt
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
80204400378010510180004100800093003992528011020080014200240039180000100
80204400458010710180006100800103003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240180180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083004040728015420080064200240036180000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
800244059380027218000620800106837600080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006536983080076208006620240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240138118000010
800244003580021218000020800006738156180069208004920240000118000010