Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCSEL (scalar, H)

Test 1: Latency 1->2

Code:

  fcsel h0, h0, h1, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300509248101002001000620030018110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000620030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
100242003310021211000020100002559121685465435096831698861362956100552030018111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010
10024200331002121100002010000070050924810020200100002030000111000010

Test 2: uops

Code:

  fcsel h0, h1, h0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000

Test 3: Latency 1->3

Code:

  fcsel h0, h1, h0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300509248101002001000620030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092471002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705095801005420100442030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 4: Latency 1->4

Chain cycles: 2

Code:

  fcsel h0, h1, h2, lt
  fcmp d0, d3
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3
  movi v3.16b, 4

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924720200200200062005001510110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042005001010110000100
2020440033202012012000020020000700101924820200200200042025012010210000100
2020440033202012012000020020000700101924820200200200042005001010110000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
200244003320021212000020200007010192462002020200042050015111000010
200244003320021212000020200007010192482002020200002050000111000010
200254006620029212000820200347010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010
200244003320021212000020200007010192482002020200002050000111000010

Test 5: throughput

Count: 8

Code:

  fcsel h0, h8, h9, lt
  fcsel h1, h8, h9, lt
  fcsel h2, h8, h9, lt
  fcsel h3, h8, h9, lt
  fcsel h4, h8, h9, lt
  fcsel h5, h8, h9, lt
  fcsel h6, h8, h9, lt
  fcsel h7, h8, h9, lt
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
80204400758010710180006100800103004000528011020080014200240042180000100
80204400458010710180006100800103004221138020020080112200240039180000100
80204400378010510180004100800093003976528011020080014200240036180000100
80204400358010510180004100800083003984528011020080014200240036180000100
80205400718014210180041100800563003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083004244898015620080064200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100
80204400358010510180004100800083003992428010820080012200240036180000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
800254015280062218004120800536837600080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010
800244003580021218000020800006538400080020208000020240000118000010