Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CMN (sxtw, 32-bit)

Test 1: uops

Code:

  cmn w0, w1, sxtw
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | f5 | f6 | f7 | f8 | fd
1004369200362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369300362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369200362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
10043693027362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369200362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369306362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369300362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369300362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369300362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370
1004369300362510001000100050000369369206322510001000200036966111001100073118113661000370370370370370

Test 2: Latency 3->1

Chain cycles: 1

Code:

  cmn w0, w1, sxtw
  cset x0, cc
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020420035150061199262520100201002010012971500491695520035200351740631748120100202003020020035104112020110099100201001010000013101628221999220000101002003620036200362003620036
2020420035150061199262520100201002010012971501491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036
2020420035150061199262520100201002010012971500491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036
2020420035150061199262520100201002010012971501491695520035200351740631748120100202003020020035104112020110099100201001010016013101228321999220000101002003620036200362003620036
2020420035150061199262520100201002010012971501491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036
20204200351500191199262520100201002010012971500491695520035200351740631748120100202003020020035104112020110099100201001010000013101228321999220000101002003620036200362003620036
2020420035150061199262520100201002010012971500491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036
20204200351500711199262520100201002010012971500491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036
20204200351500801199262520100201002010012971501491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036
20204200351500679199262520100201002010012971501491695520035200351740631748120100202003020020035104112020110099100201001010000013101228221999220000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200242003515000935199182520010200102001012972471491695520035200351742831750420010200203002020035104112002110910200101001000127013271361999520000100102003620036200362003620036
2002420035150002101991825200102001020010129724714916955200352003517428317504200102002030020200351041120021109102001010010001270112712101999520000100102003620036200362003620036
2002420035150002921991832200102001020010129724714916955200352003517428317504200102002030020200351041120021109102001010010001270142713131999520000100102003620036200362003620036
20024200351500061199182520010200102001012972470491695520035200351742831750420010200203002020035104112002110910200101001000127010271361999520000100102003620036200362003620036
200242003515000611991825200102001020010129724714916955200352003517428317504200102002030156200351041120021109102001010010101270122713131999520000100102003620036200362003620036
2002420035150002311991825200102001020010129724714916955200352003517428317504200102002030020200351041120021109102001010010001270122713131999520000100102003620036200362003620036
200242003515000611991825200102001020010129724714916955200352003517428317504200102002030020200351041120021109102001010010001270122713131999520000100102003620036200362003620036
20024200351500010631991825200102001020010129724704916955200352003517428317504200102002030020200351041120021109102001010010001270132713121999520000100102003620036200362003620036
2002420035150002141991825200102001020010129724704916955200352003517428317504200102002030020200351041120021109102001010010001270132714141999520000100102003620036200362003620036
20024200351500061199182520010200102001012972471491695520035200351742831750420010200203002020035104112002110910200101001000127052711132004220000100102003620036200362003620036

Test 3: Latency 3->2

Chain cycles: 1

Code:

  cmn w0, w1, sxtw
  cset x1, cc
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20204200351500030611993025201002010020112129723304916955200352003517425617487201122022430236200351041120201100991002010010100011113180116112001120000101002003620036200362003620036
2020420035150000611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
20204200351500012611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
2020420035150000611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
202042003515000321611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
202042003515000411611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228231999220000101002003620036200362003620036
20204200351500007261992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
2020420035150003611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
2020420035150030611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036
20204200351500024611992625201002010020100129715004916955200352003517406317481201002020030200200351041120201100991002010010100000013101228221999220000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420035150000000170199182520010200102001012978751491700002008120079174373175282001020020300202003510411200211091020010100100084012870127111999520022100102008020081200822003620036
2002420035150001101321321991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001001312700127111999520000100102003620036200362003620036
2002420035150000000611991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001000012700127111999520000100102003620036200362003620036
20024200351500000002521991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001001312700127111999520000100102003620036200362003620036
20024200351500000002111991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001000012700127111999520000100102003620036200362003620036
2002420035150000000611991825200102001020010129724714916955020035200351742871750420093200203002020035104112002110910200101001000012700127111999520000100102003620036200362003620036
20024200351490000002571991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001001312700127111999520000100102003620036200362003620036
2002420035150010000611991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001010012700127111999520000100102003620036200362003620036
20024200351500000006119918252001020010200101297247149169550200352003517428131750420010200203002020035104112002110910200101001000012700127111999520000100102003620036200362006720036
2002420035150000000611991825200102001020010129724714916955020035200351742831750420010200203002020035104112002110910200101001001012700127111999520000100102003620036200362003620036

Test 4: throughput

Count: 8

Code:

  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  cmn w0, w1, sxtw
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | 0f | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020426770200000003525801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051102191126731800001002673626736267362673626736
80204267352000005403525801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
8020426735200000003525801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
802042673520000012070025801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
8020426735200020390070025801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
80204267352000000070025801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
8020426735200000003525801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
80204267352000000010025801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
8020426735201000003525801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736
8020426735201010003525801008010080100400500492365526735267351667231669080100802001602002673566118020110099100801001000000051101191126731800001002673626736267362673626736

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800242672220000000000352580010800108001040005004923625026705267051666531668380010800201600202670566118002110910800101000000000502211340108267018000000102670626706267062670626706
8002426705200000000007002580010800108001040005004923625026705267051666531668380010800201600202670566118002110910800101000000000502271801010267018000000102670626706267062670626706
800242670520000000000562580010800108001040005004923625026705267051666531668380010800201600202670566118002110910800101000000000503010180710267018039100102693426983269782702627028
80024267972024000679246161179177803998045980080405554149239450270292679616744271688780412804931609862697766818002110910800101000214318000513411590816269108045100102702527026270272698327027
8002427030201210067804704656170803968045980213408899149239480268422698016738351688880145804221608502702266818002110910800101042010347120512120670108269438045400102698027027269812702927027
800242698420200116792466612371738046280464804164028830492394402711926979167674016918802128063416110427116669180021109108001010202024120005156198301014270138058200102706926841271152707426891
8002426982202011062411616146011080395802078034740557804923945026983268451673428168288007880426160296269786611800211091080010100230427502050309182129267018000000102670626706267062670626706
800242670520000000000352580010800108001040005004923625026705267051666531668380010800201600202670566118002110910800101000000000503091801212267018000000102670626706267062670626706
800242670520000000000352580010800108001040005004923625026705267051666531668380010800201600202670566118002110910800101000000000503012180129267018000000102670626706267062670626706
800242670520000000000352580010800108001040005004923625026705267051666531668380010800201600202670566118002110910800101000000120050309180912267018000000102670626706267062670626706