Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CCMP (immediate, 64-bit)

Test 1: uops

Code:

  ccmp x1, #3, #0, hi
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9)
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001
1004 | 1030 | 1001 | 1001 | 1000 | 25192 | 1000 | 1000 | 2000 | 1001

Test 2: Latency 2->1

Chain cycles: 1

Code:

  ccmp x1, #3, #0, hi
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20204 | 20030 | 20101 | 20101 | 20107 | 519435 | 20107 | 20212 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100
20204 | 20030 | 20101 | 20101 | 20108 | 519548 | 20108 | 20216 | 30224 | 20001 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
20024 | 20030 | 20011 | 20011 | 20018 | 519454 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010
20024 | 20030 | 20011 | 20011 | 20010 | 519598 | 20010 | 20020 | 30020 | 20001 | 10010

Test 3: Latency 2->2

Code:

  ccmp x0, #3, #0, hi
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10204 | 10030 | 10201 | 10201 | 10211 | 254770 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100
10204 | 10030 | 10201 | 10201 | 10208 | 254709 | 10208 | 10208 | 20216 | 10101 | 100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10024 | 10030 | 10021 | 10021 | 10029 | 255557 | 10068 | 10072 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10
10024 | 10030 | 10021 | 10021 | 10020 | 255193 | 10020 | 10020 | 20020 | 10011 | 10

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.7889

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160204 | 63313 | 160112 | 160112 | 160118 | 690240 | 160118 | 160220 | 160220 | 160013 | 100
160204 | 63111 | 160115 | 160115 | 160120 | 688477 | 160118 | 160220 | 160216 | 160011 | 100
160204 | 63098 | 160111 | 160111 | 160118 | 689652 | 160118 | 160220 | 160216 | 160010 | 100
160204 | 63112 | 160112 | 160112 | 160118 | 689546 | 160157 | 160259 | 160220 | 160013 | 100
160204 | 63175 | 160110 | 160110 | 160116 | 688591 | 160120 | 160220 | 160224 | 160014 | 100
160204 | 63091 | 160112 | 160112 | 160118 | 690894 | 160120 | 160220 | 160220 | 160012 | 100
160204 | 63115 | 160112 | 160112 | 160118 | 689536 | 160118 | 160220 | 160224 | 160014 | 100
160204 | 63134 | 160115 | 160115 | 160120 | 689652 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63127 | 160112 | 160112 | 160118 | 687037 | 160118 | 160220 | 160220 | 160012 | 100
160204 | 63091 | 160112 | 160112 | 160118 | 691935 | 160118 | 160220 | 160220 | 160014 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.7881

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
160958 | 71727 | 160913 | 160582 | 331 | 160549 | 697198 | 160033 | 160044 | 160020 | 160001 | 10
160024 | 64567 | 160023 | 160023 | 0 | 160030 | 697503 | 160026 | 160038 | 160038 | 160011 | 10
160024 | 63076 | 160021 | 160021 | 0 | 160026 | 699355 | 160033 | 160044 | 160038 | 160011 | 10
160024 | 63049 | 160011 | 160011 | 0 | 160010 | 700928 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63045 | 160011 | 160011 | 0 | 160010 | 702064 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63054 | 160011 | 160011 | 0 | 160010 | 696147 | 160010 | 160020 | 160020 | 160001 | 10
160025 | 63198 | 160053 | 160053 | 0 | 160071 | 696046 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63032 | 160011 | 160011 | 0 | 160010 | 694168 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63077 | 160011 | 160011 | 0 | 160010 | 700568 | 160010 | 160020 | 160020 | 160001 | 10
160024 | 63040 | 160011 | 160011 | 0 | 160010 | 704042 | 160010 | 160020 | 160020 | 160001 | 10

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50204 | 24010 | 50107 | 40104 | 10003 | 40114 | 10004 | 315209 | 40012 | 50112 | 40209 | 10003 | 80224 | 20008 | 40001 | 100
50204 | 24001 | 50109 | 40105 | 10004 | 40116 | 10004 | 315522 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 23997 | 50103 | 40101 | 10002 | 40112 | 10004 | 315522 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40003 | 100
50204 | 23979 | 50106 | 40103 | 10003 | 40109 | 10003 | 314924 | 40017 | 50119 | 40216 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 23997 | 50103 | 40101 | 10002 | 40112 | 10004 | 315522 | 40017 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23990 | 50104 | 40101 | 10003 | 40112 | 10004 | 315029 | 40018 | 50116 | 40212 | 10004 | 80218 | 20006 | 40001 | 100
50204 | 23982 | 50104 | 40101 | 10003 | 40112 | 10004 | 315393 | 40013 | 50112 | 40209 | 10003 | 80224 | 20008 | 40002 | 100
50204 | 23981 | 50106 | 40103 | 10003 | 40109 | 10003 | 315122 | 40016 | 50120 | 40216 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 23993 | 50103 | 40101 | 10002 | 40109 | 10003 | 315617 | 40017 | 50116 | 40212 | 10004 | 80224 | 20008 | 40002 | 100
50204 | 23976 | 50105 | 40102 | 10003 | 40112 | 10004 | 315393 | 40013 | 50112 | 40209 | 10003 | 80218 | 20006 | 40001 | 100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5998

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? int retires (ef)
50024 | 24109 | 50018 | 40015 | 10003 | 40021 | 10003 | 315893 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24018 | 50011 | 40011 | 10000 | 40010 | 10000 | 316461 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24001 | 50011 | 40011 | 10000 | 40010 | 10000 | 316578 | 40000 | 50010 | 40020 | 10000 | 80054 | 20010 | 40008 | 10
50024 | 23986 | 50013 | 40011 | 10002 | 40019 | 10003 | 316131 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24019 | 50011 | 40011 | 10000 | 40010 | 10000 | 316079 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24277 | 50168 | 40137 | 10031 | 40136 | 10031 | 315621 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23974 | 50011 | 40011 | 10000 | 40010 | 10000 | 316459 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23993 | 50011 | 40011 | 10000 | 40010 | 10000 | 316681 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 23986 | 50011 | 40011 | 10000 | 40010 | 10000 | 315542 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10
50024 | 24049 | 50011 | 40011 | 10000 | 40010 | 10000 | 315621 | 40000 | 50010 | 40020 | 10000 | 80020 | 20000 | 40001 | 10

Test 6: throughput

Count: 7

Code:

  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5568

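retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80204 | 39296 | 80260 | 80260 | 80269 | 550575 | 80114 | 80216 | 140220 | 0 | 0 | 80004 | 0 | 0 | 100
80204 | 38984 | 80105 | 80105 | 80112 | 549185 | 80108 | 80208 | 140220 | 0 | 0 | 80004 | 0 | 0 | 100
80204 | 38986 | 80106 | 80106 | 80114 | 550632 | 80116 | 80216 | 140228 | 0 | 0 | 80007 | 0 | 0 | 100
80204 | 38981 | 80105 | 80105 | 80116 | 550985 | 80108 | 80208 | 140220 | 0 | 0 | 80006 | 0 | 0 | 100
80204 | 38974 | 80109 | 80109 | 80116 | 546380 | 80152 | 80252 | 140228 | 0 | 0 | 80004 | 0 | 0 | 100
80205 | 39003 | 80134 | 80134 | 80148 | 550640 | 80154 | 80256 | 140224 | 0 | 0 | 80007 | 0 | 0 | 100
80205 | 39003 | 80142 | 80142 | 80152 | 549608 | 80111 | 80212 | 140214 | 0 | 0 | 80005 | 0 | 0 | 100
80204 | 38977 | 80109 | 80109 | 80114 | 549807 | 80185 | 80285 | 140228 | 0 | 0 | 80004 | 0 | 0 | 100
80204 | 39009 | 80106 | 80106 | 80116 | 549968 | 80111 | 80212 | 140228 | 0 | 0 | 80006 | 0 | 0 | 100
80204 | 38975 | 80105 | 80105 | 80114 | 549062 | 80114 | 80216 | 140214 | 0 | 0 | 80004 | 0 | 0 | 100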

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5561

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
80024 | 39134 | 80033 | 80033 | 80044 | 547739 | 80020 | 80020 | 140020 | 80011 | 10
80025 | 38947 | 80060 | 80060 | 80076 | 547480 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38948 | 80021 | 80021 | 80020 | 547971 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38896 | 80021 | 80021 | 80020 | 550014 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38912 | 80021 | 80021 | 80020 | 548495 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38893 | 80021 | 80021 | 80020 | 551453 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38917 | 80021 | 80021 | 80020 | 549262 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38881 | 80021 | 80021 | 80020 | 545395 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38998 | 80021 | 80021 | 80020 | 548495 | 80020 | 80020 | 140020 | 80011 | 10
80024 | 38897 | 80021 | 80021 | 80020 | 552102 | 80020 | 80020 | 140020 | 80011 | 10