Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CCMP (immediate, 64-bit)

Test 1: uops

Code:

  ccmp x1, #3, #0, hi
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | f5 | f6 | f7 | f8 | fd
100410358006191725100010001000622500103510358053882100010002000103510411100110001000037312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
1004103570043991725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622500103510358053882100010002000103510411100110001000007312711990100010361036103610361036
100410358006191725100010001000622501103510358053882100010002000103510411100110001000007312711990100010361036103610361036

Test 2: Latency 2->1

Chain cycles: 1

Code:

  ccmp x1, #3, #0, hi
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003515010061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010020100101111320316332001120000101002003620036200362003620036
2020420035150100611993025201002010020112129723349169552003520035174256174872011220224302362003510411202011009910020100201002731111320316132001120000101002003620036200362003620036
202042003515010061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010020100001111320216312001120000101002003620036200362003620036
202042003515010061199302520100201002011212972334917045200352003517425617487201122022430236200351041120201100991002010020100001111320116332001120000101002003620036200362003620036
202042003515010061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010020100001111320316132001120000101002003620036200362003620036
2020420035150100611993025201002010020112129723349169552003520035174256174872011220224302362003510411202011009910020100201002891111320316312001120000101002003620036200362003620036
2020420035150100611993025201002010020112129723349169552003520035174256174872011220224302362003510411202011009910020100201003431111320316302001120000101002003620036200362003620036
2020420035150100611993025201002010020112129723349169552003520035174256174872011220224302362003510411202011009910020100201002731111318316332001120000101002003620036200362003620036
202042003515000061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010020100001111320116332001120000101002003620036200362003620036
202042003515010061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010020100001111320216332001120000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024200351500006119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010200100001270427441999520000100102003620036200362003620036
20024200351500006919918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010200100001270427541999520000100102003620036200362003620036
2002420035150000611991825200102001020010129724704916955200352003517428317504200102002030020200351041120021109102001020010001021270427441999520000100102003620036200362003620036
20024200351500006119918252001020010200101297247049169552003520035174283175042001020020300202003510411200211091020010200100001270427441999520000100102003620036200362006720036
20024200351500006119918252001020010200101297247049169552003520081174283175042001020020300202003510411200211091020010200100001270427441999520000100102003620036200362003620036
200242003515000072619918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010200100001270427441999520000100102003620036200362003620036
20024200351500006119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010200100001270427441999520000100102003620036200362003620036
20024200351500008219918252001020010200101297247049169552003520035174283175042001020020300202003510411200211091020010200100001270327541999520000100102003620036200362003620036
20024200351500006119918252001020010200101297247049169552003520035174283175042001020020300202003510411200211091020010200100001270427441999520000100102003620036200362003620036
200242003515000010319918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010200100001270427451999520000100102003620036200362003620036

Test 3: Latency 2->2

Code:

  ccmp x0, #3, #0, hi
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750013199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000200011171901610010101001001003610036100361003610036
102041003575006199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
102041003575006199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000010311171901610010101001001003610036100361003610036
102041003575006199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
102041003575106199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
102041003575006199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
102041003575006199272510200102001021064771214969551003510035867378737102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
102041003575106199272510200102001021064771214969551003510035867378737102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
1020410035750012499272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036
102041003575006199272510200102001021064771214969551003510035867378736102101022420248100351101110201100991010010000000011171901610010101001001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 09 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100357500006199182510020100201002064729614969551003510035867838754100201002020020100351041110021109100101000000000064042744999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969561003510035867838754100201002020020100351041110021109100101000000000064042743999310010101003610036100361003610036
10024100357500006199182510020100201002064729604969561003510035867838754100201002020268100351041110021109100101000000000964052734999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969551003510124867838754100201002020020100351041110021109100101000000000064032734999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969551003510035867838754100201002020020100351041110021109100101000000000064032734999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969551003510035867838754100201002020020100351041110021109100101000000000064032744999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969551003510035867838754100201002020020100351041110021109100101000000000064042745999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969561003510035867838754100201002020020100351041110021109100101000000000064032735999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969551003510035867838754100201002020020100351041110021109100101000000000064032735999310010101003610036100361003610036
10024100357500006199182510020100201002064729614969551003510035867838754100201002020020100351041110021109100101000000000064042754999310010101003610036100361003610036

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  ands xzr, xzr, xzr
  ccmp x0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6676

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602045343440000000000028271601201601201601281059833000495032853408534083334763335716012816024016031253408661116020110099100160100801000005000111101195101600534051600201005340953409534095340953409
16020453408401000000000282716012016012016012810637380154950328534085340833347633357160128160240160240534086611160201100991001601008010000054000111101195101600534051600201005340953409534095340953409
1602045340840000000000028271601201601201601281063738115495032853408534083334763335716012816024016024053408661116020110099100160100801000001000111101195012400534051600201005340953409534095340953409
1602045340840000000000028271601201601201601281063738015495032853408534083334763335716012816024016024053408661116020110099100160100801000001090111101195001600534051600201005340953409534095340953409
16020453408400000000120028271603911601201601281063738015495032853408534083334763335716012816024016024053654661116020110099100160100801000003000111101195001600534051600201005340953409534095340953599
16020453408400000000483520693271601201601201601281063738015495032853408534083334763335716012816024016024053408661116020110099100160100801000000000111101195101600534051600201005340953409534095340953409
1602045340840000010000028271601201601201601281063738015495032853408534083334763335716012816024016024053408661116020110099100160100801000001000111101195101600534051600201005340953409534095340953409
16020453408400010000360028271601201601201601281063738015495032853408534083334763335716041916024016024053408661116020110099100160100801000000030111101195011600534051600201005340953409534095345553409
1602045340840000000000028271601201601201601281063738015495032853408534083334763335716012816024016024053408661116020110099100160100801000002000111101195001600534051600201005340953409534095340953409
1602045340840000000300028481601201601201601281063738015495032853408534083334763335716012816024016024053408661116020110099100160100801000001000111101195101600534051600201005340953409534095340953409

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002453379400004325160010160010160010102938811549502945337453374333310333351160010160020160020533746611160021109101600108001000000001003283113192118653370160000157105337553375533755337553375
16002453374399004325160010160010160010102938811549502945337453374333310333351160010160020160020533746611160021109101600108001000000001002684181921151053370160000157105337553375533755337553375
1600245337440000432516001016001016001010293881154950294533745337433331033335116001016002016002053374661116002110910160010800100000000100268414192119653370160000157105337553375533755337553375
16002453374400004325160077160010160010102938811549502945337453374333310333351160010160020160020533746611160021109101600108001000000001003284151921141053370160000157105337553375533755337553375
1600245337440000432516001016001016001010293881154950294533745337433331033335116001016002016002053374661116002110910160010800100000000100328418192118553370160000157105337553375533755337553375
160024533743990043251600101600101600101029388115495029453374533743333103333511600101600201600205337466111600211091016001080010000000010026841819211101153370160000157105337553375533755337553375
160024533743990043251600101600101600101029388100495029453374533743333103333511600101600201600205337466111600211091016001080010000000010026841111921110653370160000157105337553375533755337553375
160024533743990043251600101600101600101029388100495029453374534143333103333511600101600201600205337466111600211091016001080010000000010026841101921171153370160000157105337553375533755337553375
160024533744006450432516001016001016001010293881154950294533745337433331033335116001016002016002053374661116002110910160010800100000000100268414192118653370160000157105337553375533755337553375
16002453374400007082516001016001016001010293881154950294533745337433331033335116001016002016002053374661116002110910160010800100000000100268429192118553370160000157105337553375533755337553375

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  ccmp x0, #3, #0, hi
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3354

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502041345810100000114255012240112100104014310013575127800971339513416134156137246777110501564025110013803022002613416134161150201100991004010010000401000011132200160013413400121001341713417134161341713417
502041341610100000217245012240112100104014310013575127800971339513416134166137245677110501564025110013803022002613416134161150201100991004010010000401000011132210160013413400121001341713417134171341713417
502041341610100000204255012240112100104014310013575127800971339513416134166139246777110501564025110013803022002613416134151150201100991004010010000401000011132210160013413400121001341713417134171341713417
50204134161000000093255012240112100104014310013575127800971339513416134166139246777110501564025110013803022002613416134161150201100991004010010000401000311132210160013413400121001341713417134171341713417
50204134161000000028255012240112100104014310013575127800971339513416134166139246777110501564025110013803022002613415134161150201100991004010010000401001311132210160013413400121001341713417134171341713416
502041341610100000201255012240112100104014310013575127800971339513416134166139246767110501564025110013803022002613416134161150201100991004010010000401000011132210160013413400121001341613417134171341713417
50204134161010000028255012240112100104014310013575127800971339513416134166139245677110501564025110013803022002613416134161150201100991004010010000401000011132210160013413400121001341713417134171341713417
50204134161000000028255012240112100104014310013575127800971339513416134166139245677110501564025110013803022002613416134161150201100991004010010000401000011132210160013413400121001341713417134171341713417
502041341610000000133255012240112100104014310013575127800971339513416134166139246777110501564025110013803022002613416134161150201100991004010010000401000311132210160013413400121001341713417134171341713416
502041341610000000112255012240112100104014310013575127800971339513416134166139245677110501564025110013803022002613416134161150201100991004010010000401000011132210160013413400121001341713417134171341713417

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002413404100026225500104001010000400101000057345680000113353133821338255753784371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338255773795371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338255753784371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383
5002413382100011025500104001010000400101000057345680000013353133821338255753795371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383134401338313383
500241338210006625500104001010000400101000057345680000013353133821338255773795371095013540020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383
500241338210004525500104001010000400101000057345680000113353133821338255773795371095001040020100008002020000133821338211500211091040010100004001000003140119311337940000101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338255773795371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383
500241338210108725500104001010000400101000057345680000013353133821338255773795371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383
500241338210004525500104001010000400101000057345680000113353133821338255753795371095001040020100008002020000133821338211500211091040010100004001000003140119311337940000101338313383133831338313383
5002413382100030325500104001010000400101000057345680000113353133821338255773795371095001040020100008002020000133821338211500211091040010100004001000003140119321337940000101338313383133831338313383