Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CCMN (immediate, 32-bit)

Test 1: uops

Code:

  ccmn w1, #3, #0, hi
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9)
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001

Test 2: Latency 2->1

Chain cycles: 1

Code:

  ccmn w1, #3, #0, hi
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20204 | 20030 | 20101 | 20101 | 20107 | 519112 | 20108 | 20216 | 30218 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20107 | 519413 | 20107 | 20214 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20205 | 20061 | 20117 | 20117 | 20149 | 519548 | 20108 | 20216 | 30221 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20107 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20024 | 20030 | 20011 | 20011 | 20018 | 519507 | 20017 | 20032 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010

Test 3: Latency 2->2

Code:

  ccmn w0, #3, #0, hi
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10204 | 10030 | 10201 | 10201 | 10209 | 254770 | 10208 | 10208 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254734 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 256291 | 10323 | 10323 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254357 | 10210 | 10212 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20228 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10024 | 10030 | 10021 | 10021 | 10029 | 255236 | 10029 | 10032 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7889

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160204 | 63249 | 160116 | 160116 | 160120 | 687346 | 160155 | 160257 | 160222 | 160015 | 100
160204 | 63164 | 160117 | 160117 | 160121 | 688456 | 160124 | 160226 | 160220 | 160012 | 100
160204 | 63134 | 160115 | 160115 | 160120 | 689052 | 160124 | 160226 | 160220 | 160015 | 100
160204 | 63093 | 160111 | 160111 | 160118 | 689181 | 160120 | 160220 | 160216 | 160010 | 100
160204 | 63103 | 160111 | 160111 | 160115 | 689796 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63117 | 160112 | 160112 | 160118 | 689226 | 160118 | 160220 | 160220 | 160014 | 100
160204 | 63174 | 160113 | 160113 | 160118 | 689288 | 160118 | 160220 | 160224 | 160014 | 100
160204 | 63119 | 160112 | 160112 | 160118 | 686358 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63081 | 160114 | 160114 | 160120 | 687369 | 160120 | 160220 | 160260 | 160048 | 100
160204 | 63117 | 160112 | 160112 | 160118 | 688910 | 160118 | 160220 | 160216 | 160012 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7882

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160024 | 64562 | 160026 | 160026 | 0 | 160030 | 698028 | 160030 | 160040 | 160020 | 160001 | 10
160024 | 63343 | 160011 | 160011 | 0 | 160010 | 700311 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63086 | 160011 | 160011 | 0 | 160010 | 701725 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63084 | 160011 | 160011 | 0 | 160010 | 700971 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63030 | 160011 | 160011 | 0 | 160010 | 696956 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63031 | 160011 | 160011 | 0 | 160010 | 700137 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63059 | 160011 | 160011 | 0 | 160010 | 698966 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63018 | 160011 | 160011 | 0 | 160010 | 697650 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63089 | 160011 | 160011 | 0 | 160010 | 701841 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63046 | 160011 | 160011 | 0 | 160010 | 694602 | 160010 | 160020 | 160020 | 160001 | 10

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50204 | 24017 | 50103 | 40101 | 10002 | 40109 | 10003 | 315226 | 40012 | 50112 | 40209 | 10003 | 80224 | 20008 | 40002 | 100
50204 | 24006 | 50104 | 40101 | 10003 | 40112 | 10004 | 315063 | 40013 | 50112 | 40209 | 10003 | 80232 | 20008 | 40005 | 100
50204 | 23991 | 50103 | 40101 | 10002 | 40112 | 10004 | 314875 | 40013 | 50112 | 40209 | 10003 | 80218 | 20006 | 40003 | 100
50204 | 23999 | 50105 | 40102 | 10003 | 40112 | 10004 | 315129 | 40018 | 50116 | 40212 | 10004 | 80232 | 20008 | 40005 | 100
50204 | 23991 | 50104 | 40101 | 10003 | 40112 | 10004 | 315987 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23988 | 50103 | 40101 | 10002 | 40109 | 10003 | 315395 | 40018 | 50116 | 40212 | 10004 | 80224 | 20008 | 40001 | 100
50204 | 23992 | 50106 | 40103 | 10003 | 40109 | 10003 | 315411 | 40017 | 50119 | 40216 | 10004 | 80224 | 20008 | 40001 | 100
50204 | 23999 | 50105 | 40102 | 10003 | 40112 | 10004 | 315011 | 40012 | 50112 | 40209 | 10003 | 80218 | 20006 | 40001 | 100
50204 | 23990 | 50104 | 40101 | 10003 | 40112 | 10004 | 315407 | 40012 | 50112 | 40209 | 10003 | 80218 | 20006 | 40001 | 100
50204 | 24001 | 50105 | 40102 | 10003 | 40109 | 10003 | 315283 | 40012 | 50112 | 40209 | 10003 | 80218 | 20006 | 40001 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50024 | 24098 | 50018 | 40015 | 10003 | 40021 | 10003 | 0 | 315750 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23989 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 317808 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24012 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 315510 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23956 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 316523 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24021 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 315542 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24042 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 315547 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23975 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 315510 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23977 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 316748 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23985 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 316653 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23962 | 50011 | 40011 | 10000 | 40010 | 10000 | 0 | 315455 | 0 | 40000 | 50010 | 40020 | 0 | 10000 | 80048 | 20008 | 40006 | 10

Test 6: throughput

Count: 7

Code:

  ands xzr, xzr, xzr
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  ccmn w0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5568

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80204 | 39010 | 80109 | 80109 | 80114 | 551096 | 80116 | 80216 | 140228 | 80005 | 100
80204 | 38986 | 80107 | 80107 | 80114 | 548919 | 80111 | 80212 | 140230 | 80007 | 100
80204 | 38930 | 80109 | 80109 | 80117 | 549785 | 80116 | 80216 | 140228 | 80005 | 100
80204 | 38952 | 80108 | 80108 | 80116 | 547003 | 80108 | 80208 | 140228 | 80003 | 100
80204 | 38944 | 80106 | 80106 | 80116 | 549899 | 80111 | 80212 | 140228 | 80005 | 100
80204 | 39012 | 80106 | 80106 | 80116 | 550182 | 80111 | 80212 | 140220 | 80004 | 100
80204 | 38969 | 80103 | 80103 | 80114 | 549972 | 80116 | 80216 | 140228 | 80005 | 100
80204 | 39005 | 80105 | 80105 | 80114 | 550481 | 80111 | 80212 | 140228 | 80005 | 100
80204 | 38944 | 80107 | 80107 | 80116 | 549785 | 80116 | 80216 | 140220 | 80003 | 100
80204 | 38969 | 80103 | 80103 | 80114 | 550183 | 80116 | 80216 | 140228 | 80007 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5561

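retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80024 | 39163 | 80034 | 80034 | 80046 | 0 | 549052 | 0 | 80042 | 80044 | 0 | 140020 | 80011 | 10
80024 | 38912 | 80021 | 80021 | 80020 | 0 | 549076 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38917 | 80021 | 80021 | 80020 | 0 | 550581 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38946 | 80021 | 80021 | 80020 | 0 | 549916 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80025 | 38918 | 80053 | 80053 | 80076 | 0 | 550642 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38859 | 80021 | 80021 | 80020 | 0 | 550042 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38970 | 80021 | 80021 | 80020 | 0 | 550642 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38913 | 80021 | 80021 | 80020 | 0 | 548666 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38969 | 80021 | 80021 | 80020 | 0 | 550341 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10
80024 | 38894 | 80021 | 80021 | 80020 | 0 | 550341 | 0 | 80020 | 80020 | 0 | 140020 | 80011 | 10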