Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCCMP (scalar, S)

Test 1: uops

Code:

  fccmp s0, s1, #0, lt
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

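retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9)
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1
1004 | 2033 | 1001 | 1 | 1000 | 1000 | 24769 | 1000 | 1000 | 3000 | 1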

Test 2: Latency 3->1

Chain cycles: 2

Code:

  fccmp s0, s1, #0, lt
  fcsel d0, d2, d3, eq
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20204400332010110120000100200003001019248201002002000620060018110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003071019580201362022004520060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100
20204400332010110120000100200003001019248201002002000420060012110000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20024400332001111200001020000301019248200102020006206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010
20024400332001111200001020000301019248200102020000206000011000010

Test 3: Latency 3->2

Chain cycles: 2

Code:

  fccmp s0, s1, #0, lt
  fcsel d1, d2, d3, eq
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20204400332010110120000100200000300010192482010020002000620060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100
20204400332010110120000100200000300010192482010020002000420060012110000100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20024400332001111200001020000030010192482001020020004206000011000010
20024400332001111200001020000030010195802004420020048206000011000010
20024400332001111200001020000030010192482001020020000206000011000010
20024400332001111200001020000030010192482001020020000206000011000010
20024400332001111200001020000030010192482001020020000206000011000010
20024400332001111200001020000030010192482001020020000206000011000010
20024400332001111200001020000030010192482001020020000206000011000010
20024400332001111200001020000030010192482001020020000206000011000010
20026400992002711200161020068030010195802004420020048206001811000010
20024400332001111200001020000030010192482001020020000206000011000010

Test 4: Latency 3->3

Code:

  fccmp s0, s1, #0, lt
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
1020420033102012011000020010000700249768102002001000620030018101100
1020420033102012011000020010000700249769102002001000420030012101100
1020520066102092011000820010021700249769102002001000420030012101100
1020420033102012011000020010000700249769102002001000420030012101100
1020520066102112031000820210021700249769102002001000420030012101100
1020420033102012011000020010000700249768102002001000620030018101100
1020420033102012011000020010000700249769102002001000420030012101100
1020420033102012011000020010000700249769102002001000420030012101100
1020420033102012011000020010000700249769102002001000420030012101100
1020420033102012011000020010000700249769102002001000420030012101100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
100242003310021211000020100007024976810020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110
100242003310021211000020100007024976910020201000020300001110

Test 5: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  ands xzr, xzr, xzr
  fccmp s0, s1, #0, lt
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0736

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
1602048594516010780105800028010680005024032105583651601148020708000720024002180008100
1602048472516011180108800038010780007024032104769711601138020708000620024001880005100
1602048588616010780105800028010680005024031805600311601118020608000620024001880008100
1602048588616010780105800028010680005024031805600311601118020608000620024001880005100
1602048588616010780105800028010680005024031805600311601118020608000620024001880005100
1602048588616010780105800028010680005024031805600311601118020608000620024001880005100
1602048588616010780105800028010680005024037805601721601528022608002620024001880008100
1602048588616010780105800028010680005024031805600311601118020608000620024007580026100
1602048588616010780105800028010680005024031805600311601118020608000620024001880005100
1602048588616010780105800028010680005024031805600311601118020608000620024001880005100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0482

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002485713160017800158000280016800052400305516941600108002080000200240000800010010
16002485583160011800118000080010800002400304764191600108002080000200240000800010010
16002483856160011800118000080010800002400304764191600108002080000200240000800010010
16002483858160011800118000080010800002400304764191600108002080000200240000800010010
16002483856160011800118000080010800002400304764191600108002080000200240000800010010
16002483856160011800118000080010800002401084765551600608004680025200240075800250010
16002483856160011800118000080010800002400304762081600108002080000200240000800010010
16002483856160011800118000080010800002400304764191600108002080000200240000800010010
16002483856160011800118000080010800002400304764191600108002080000200240000800010010
16002483856160011800118000080010800002400304764191600108002080000200240000800010010