Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SBCS (64-bit)

Test 1: uops

Code:

  sbcs x0, x0, x1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100410357000000006191725100010001000622501103510358053882100010003000103540111001100000000073227339931000100010361036103610361036
100410358000000006191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
1004103570000002406191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
100410358000000009591725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
100410358000000006191725100010001000622501103510358053882100010003000103540111001100006000073227339931000100010361036103610361036
100410357000000006191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
100410358000000006191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
100410358000000006191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
100410358000000306191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036
100410358000000006191725100010001000622501103510358053882100010003000103540111001100000000073327339931000100010361036103610361036

Test 2: Latency 1->2

Code:

  sbcs x0, x0, x1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035751185699920251010010100101006471521496955100351008286563873210100102003020010035401110201100991001010010009071012711999510000101001003610036100361003610084
102041003575103629920251010010100101006471521496955100971008286563873210100102003020010035401110201100991001010010000373512711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010010071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000371012711999510000101001003610036100361003610036
10204100357500619920911010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010010071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000071022721999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575006199182510010100101001064724604969551003510035867838754100101002030020100354011100211091010010106064032722999710000100101003610036100361003610036
100241003575006199182510010100101001064724604969551003510035867838754100101002030020100354011100211091010010104364022722999710000100101003610036100361003610036
1002410035750061991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101001564022722999710000100101003610036100361003610036
100241003576006199182510010100101001064724604969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003575006199182510010100101001064724604969551003510035867838754100101002030020100354011100211091010010100964022722999710000100101003610036100361003610036
100241003575006199182510010100101001064724604969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003576930619918251001010010100106472460496955100351003586783875410010100203002010035401110021109101001010010264022722999710000100101003610036100361003610036
100241003575006199182510010100101001064724604969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750061991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101013064022722999710000100101003610036100361003610036
1002410035750061991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101020064022722999710000100101003610036100361003610036

Test 3: Latency 1->3

Code:

  sbcs x0, x1, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204100357502829920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
10204100357604769920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471521496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035760619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955100351003586563873210100102003020010035401110201100991001010010000000071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100357561991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101020064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101000064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246149695510035100358678387541001010020300201003540111002110910100101010064022722999710000100101003610036100361003610036
10024100357582991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101000064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101000064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101009064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246149695510035100358678387541001010020300201003540111002110910100101000064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101000064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246049695510035100358678387541001010020300201003540111002110910100101000064022722999710000100101003610036100361003610036
10024100357561991825100101001010010647246049695510035100358678387541010610020300201003540111002110910100101000064022722999710000100101003610036100361003610036

Test 4: Latency 1->4

Chain cycles: 1

Code:

  sbcs x0, x1, x2
  tst x0, 1
  mov x0, 1
  mov x1, 2
  mov x2, 3

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020420035150000611993025202002020020212129773349169552003520035174257174862021220224402482003564112020110099201000001111319216112001520100101002003620036200362003620036
2020420035150110611993025202002020020212129773349169552003520035174257174862021220224402482003564112020110099201000001111319116112001520100101002003620036200362003620036
20204200351501108411992625202002020020200129765049169552003520035174063174812020020200402002003564112020110099201000430001310228211999220100101002003620036200362003620036
2020420035149000611992625202002020020200129765049169552003520035174063174812020020200402002003564112020110099201000100001310228221999220100101002003620036200362003620036
202042003515000015619926252020020200202001297650491695520035200351740631748120200202004020020035126112020110099201000000001310228221999220100101002003620036200362003620036
20204200351500006119926252020020200202001297650491695520035200351740631748120200202004020020035641120201100992010002960001310228221999220100101002003620036200362003620036
2020420035150000611992625202002020020200129765049169552003520035174063174812020020200402002003564112020110099201000431770001310228221999220100101002003620036200362003620036
20204200351490004221992625202002020020200129765049169552003520035174063174812020020200402002003564112020110099201000000001310228221999220100101002003620036200362003620036
20204200351500001451992625202002020020200129765049169552003520035174063174812020020200402002003564112020110099201000000001310228221999220100101002003620036200362003620036
2020420035150000611992625202002020020200129765049169552003520035174063174812020020200402002003564112020110099201000000001310228221999220100101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024200351500061199182520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000001270127111999520032100102003620036200362003620036
20024200351500061199182520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000001270127111999520010100102003620036200362003620036
20024200351500061199172520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000001270227111999520010100102003620036200362003620036
200242003515036061199182520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000001270127111999520010100102003620036200362003620036
2002420035150006119918252002020020200201297297049169550200352003517428317504200202002040020200356411200211092001000012001270127111999520010100102003620036200362003620036
200242003515000283199182520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000001270127111999520010100102003620036200362003620036
20024200351500061199182520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000011270127111999520010100102003620036200362003620036
20024200351500061199182520020200202002012972970491695502003520035174283175042002020020400202003564112002110920010000301270127111999520010100102003620036200362003620036
2002420035150006119918252002020020200201297297049169550200352003517428317504200202002040020200356411200211092001000010201270127121999520010100102003620036200362003620036
200242003515000611991825200202002020020129729704916955020035200351742831750420020200204002020035641120021109200100006601270127111999520010100102003620036200362003620036

Test 5: Latency 4->2

Chain cycles: 1

Code:

  sbcs x0, x1, x2
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003515015147199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100001111320162001220000201002003620036200362003620036
2020420035150061199302520100201002011212972331491695520035200351742581748620112202244024820035641120201100991002010010100001111320162001220000201002003620036200362003620036
2020420035150061199302520100201002011212972331491695520035200351742571748620112202244024820035641120201100991002010010100001111320162001220000201002003620036200362003620036
20204200351500166199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100001111319162001220000201002003620036200362003620036
20204200351490726199304520100201002011212972331491695520035200351742571748520112202244024820035641120201100991002010010100001111320162001220000201002003620036200362003620036
20204200351500124199302520100201002011212972331491695520035200351742571748620112202244024820035641120201100991002010010100001111319162001220000201002003620036200362003620036
2020420035150061199302520100201002011212972331491695520035200351742571748620112202244024820035641120201100991002010010100001111319162001220000201002003620036200362003620036
202042003515044484199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100001111319162001220000201002003620036200362003620036
20204200351500103199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100501111320162001220000201002003620036200362003620036
2020420035150061199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100001111319162001220000201002003620082200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024200351500000170199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010001001270327461999520000200102003620036200362003620036
2002420035150000061199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010000001270427471999520000200102003620036200362003620036
20024200351500000170199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010000001270427341999520000200102003620036200362003620036
200242003515000001051991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100037801270427471999520000200102003620036200362003620036
2002420035150000061199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010000001270427471999520000200102003620036200362003620036
2002420035150000061199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010000001270427471999520000200102003620036200362003620036
20024200351500000145199182520010200102001012972470491695520035200351742831750420010200204002020035641120021109102001010010000001270427541999520000200102003620036200362003620036
20024200351500000105199182520010200102001012972470491695520035200351742831750420010200204002020035641120021109102001010010000001270527571999520000200102003620036200362003620036
2002420035150000061199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010000001270427481999520000200102003620036200362003620036
20024200351500000145199182520010200102001012972471491695520035200351742831750420010200204002020035641120021109102001010010000001270427581999520000200102003620036200362003620036

Test 6: Latency 4->3

Chain cycles: 1

Code:

  sbcs x0, x1, x2
  cset x2, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003515000001001993025201002010020112129723314916955200352003517425817485201122022440248200356411202011009910020100101000103111132016012001220000201002003620036200362003620036
202042003515000006119930252010020100201121297233149169552003520035174258174852011220224404382003564212020110099100201001010000012111131916102001220000201002003620036200362003620036
20205200351500000611993025201002010020112129723314916955200352003517425717486201122022440248200356411202011009910020100101000000111131916102001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425717486201122022440248200356411202011009910020100101000000111132016102001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425717485201122022440248200356411202011009910020100101000000111131916102001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425817485201122022440248200356411202011009910020100101000000111131916102001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723314916955200352003517425717486201122022440248200356411202011009910020100101000000111131916002001220000201002003620036200362003620036
202042003515000002491993025201002010020112129723304916955200352003517425717486201122022440248200356411202011009910020100101000000111131916102001220000201002003620036200362003620036
202042003515000002541993025201002010020112129723304916955200352003517425717486201122022440248200356411202011009910020100101000000111132016002001220000201002003620036200362003620036
202042003515000004961993025201002010020112129723314916955200352003517425717485201122022440248200356411202011009910020100101000000111132016102001220000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420035149000001031991825200102001020010129724714916955200352003517428717504200102002040020200356411200211091020010100101001270227121999520000200102003620036200362003620036
200242003515000000821991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
200242003515000000841991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
200242003515000030611991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
200242003515000000611991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100601270127111999520000200102003620036200362003620036
200242003515000000841991825200102001020010129724714916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
2002420035149000210611991825200102001020010129724714916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
2002420035150000001261991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
2002420035150100001031991825200102001020010129724704916955200352003517428317504200102002040020200356411200211091020010100100001270127111999520000200102003620036200362003620036
2002420035150000001621991825200102001020010129724714916955200352003517428317504200102002040020200356411200211091020010100100001270127121999520000200102003620036200362003620036

Test 7: Latency 4->4

Code:

  sbcs x0, x1, x2
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575000251992725102001020010210647712049695510035100358673887351021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
10204100357500061992725102001020010210647712049695510035100358673887351021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
10204100357500061992725102001020010210647712049695510035100358673887361021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
10204100357500061992725102001020010210647712049695510035100358673887351021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
102041003575000520992725102001020010210647712049695610035100358673887351021010224302721003540111020110099101000023111720161001310100101001003610036100361003610036
10204100357500061992725102001020010210647712049695510035100358673887351021010224302721003540111020110099101000010111720161001310100101001003610036100361003610036
102041003575000344992725102001020010210647712049695510035100358673887351021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
10204100357500061992725102001020010210647712049695610035100358673887351021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
1020410035750001123992725102001020010210647712049695510035100358673887361021010224302721003540111020110099101000000111720161001310100101001003610036100361003610036
10204100357500061992725102001020010210647712049695510035100358673887351021010224302721003540111020110099101002200111720161001310100101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575063299182510020100201002064729649695610035100358678387541002010020300201003540111002110910010000064022743999710010100101003610036100361003610036
10024100357506199182510020100201002064729649695510035100358678387541002010020300201003540111002110910010000064032743999710010100101003610036100361003610036
10024100357506199182510020100201002064729649695510035100358678387541002010020300201003540111002110910010000064032733999710010100101003610036100361003610036
10024100357606199182510020100201002064729649695510035100358678387541002010020300201003540111002110910010001364022743999710010100101003610036100361003610036
10024100357506199182510020100201002064729649695510035100358678387541002010020303081003540111002110910010000064032733999710010100101003610036100361003610036
10024100357506199182510020100201002064729649695510035100358678387541002010020300201003540111002110910010000064032722999710010100101003610036100361003610036
10024100357506199182510020100201002064729649695610035100358678387541002010020300201003540111002110910010000364042743999710010100101003610036100361003610036
10024100357506199182510020100201002064729649695510035100358678387541002010020300201003540111002110910010000964022733999710010100101003610036100361003610036
10024100357606199182510020100201002064729649695510035100358678387541002010020300201003540111002110910010000064032733999710010100101003610036100361003610036
100241003575010399182510020100201002064729649695510035100358678387541002010020304161003540111002110910010000064032723999710010100101003610036100361003610036

Test 8: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  sbcs x0, x8, x9
  ands xzr, xzr, xzr
  sbcs x1, x8, x9
  ands xzr, xzr, xzr
  sbcs x2, x8, x9
  ands xzr, xzr, xzr
  sbcs x3, x8, x9
  ands xzr, xzr, xzr
  sbcs x4, x8, x9
  ands xzr, xzr, xzr
  sbcs x5, x8, x9
  ands xzr, xzr, xzr
  sbcs x6, x8, x9
  ands xzr, xzr, xzr
  sbcs x7, x8, x9
  mov x8, 9
  mov x9, 10
  mov x10, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020453408399101001001706251601001601001601001063588049503245340453404333396333591601001602002402005340452111602011009910016010010000000101141019131453401160000801005340553405534055340553405
1602045345240010100000141251601001601001601001063588049503245340453404333393333591601001602002402005340452111602011009910016010010000000101121219121253401160000801005340553405534055340553405
160204534044001010000014125160100160100160100106358814950324534045340433339333359160100160200240200534045211160201100991001601001000000010112121911853401160000801005340553405534055340553405
160204534043991010003901412516010016010016010010635880495032453404534043333933335916010016020024020053404521116020110099100160100100000001011281911853401160000801005340553405534055340553405
1602045340440010100000141251601001601001601001063588049503245340453404333393333591601001602002402005340452111602011009910016010010000460101121219121353401160000801005340553405534055340553405
16020453404400101000001706251601001601001601001063588049503725340453404333393333591601001602002402005340452111602011009910016010010000000101121319131353401160000801005340553405534055340553405
1602045340440010100000141251602351601001601001063588149503245340453404333393333591601001602002402005340452111602011009910016010010000000101141419151353401160000801005340553405534055340553405
160204534044001010001650241251601001601001601001063588049503245340453404333393333591601001602002402005340452111602011009910016010010000000101141519131153401160000801005340553405534055340553405
160204534044001010000011692516010016010016010010635881495032453404534043333933335916010016020024020053404521116020110099100160100100000001011271971053401160000801005340553405534055340553405
16020453452400101000720018325160100160100160100106358814950324534535340433339333359160100160200240200534045211160201100991001601001000000110112719121353401160000801005340553405534055340553405

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3a3f51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)5f6061696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch mispred nonspec (cb)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaeb? int retires (ef)f5f6f7f8fd
1600245338039910100000146251600101600101600101029388115495029453374533743332433334416001016002024002053374521116002110910160010100000000001002481124192381172053371160000159800105337553375533755337553375
1600245337440010100000146251600101600101600101029388115495029453374533743332433334416001016002024002053374521116002110910160010100000000001002481119192321161853371160000159800105337553375533755337553375
16002453374400101000001742516001016001016001010293881154950294533745337433324333344160010160020240020533745211160021109101600101000000000010024811141923121718533711600003017800105337553375533755337553375
16002453374400101000001462516001016001016001010293881154947259533745337433324333344160010160020240020533745211160021109101600101000000000010028811211943111618533711600001517800105337553375533755337553375
1600245337439910100000146251600101600101600101029388110495029453374533743332433334416001016002024002053374521116002110910160010100000000001002481116192331141753371160000159800105337553375533755337553375
1600245337440010100000146251600101600101600101029388115495029453374533743332433334416001016002024002053374521116002110910160010100000000001002881117192341181653371160000159800105337553375533755337553375
16002453374400101000001462516001016001016001010293881154950294533745337433324333344160010160020240020533745211160021109101600101000000000010024111116192311141553371160000309800105337553375533755337553375
160024533744001010000017112516001016001016001010293881154950294533745337433324333344160010160020240020533745211160021109101600101000000000010028111118192301112253371160000159800105337553375533755337553375
1600245337440010100000146251600101600101600101029388115495029453374533743332433334416001016002024002053374521116002110910160010100000000001002681118192331171253371160000159800105337553375533755337553375
160024533743991010000014322516001016001016001010293881154950294533745337433324333344160010160020240020533745211160021109101600101000000000010026112112172291211953371160000309800105337553375533755337553375

Test 9: throughput

Count: 4

Code:

  fcmp s0, s0
  sbcs x0, x4, x5
  sbcs x1, x4, x5
  sbcs x2, x4, x5
  sbcs x3, x4, x5
  mov x4, 5
  mov x5, 6
  mov x6, 7

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01)cycle (02)03l2 tlb miss data (0b)181e3f4d51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)flags prf full (73)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
502041343310000045025501004010010000401001000058744280000133831341213412602633533711750100402001000012020020000134121341211502011009910040100100001000003210119111340840000401001341313413134131341313413
5020413412100000450255010040100100004010010000587442800001338313412134126026308137117501004020010000120200200001341213412115020110099100401001000010001333210119111340840000401001341313413134131341313413
502041341210000045025501004010010000401001000058744280000133831341213412555333423711750100402001000012020020000134121341211502011009910040100100001000003210119111340940000401001341313413134131341313413
5020413412100002445025501004010010000401001000057474680000133831341213412602630813711750100402001000012020020000134121341211502011009910040100100001000003210119111340940000401001341313413134131341313413
5020413412100000450255010040100100004010010000574746800001338313412134125553334237117501004020010000120200200001341213412115020110099100401001000010000123210119111340840000401001341313413134131341313413
502041341210000045025501004010010000401001000057474680000133831341213412602633533711750100402001000012020020000134121341211502011009910040100100001000033210119111340940000401001341313413134131341313413
502041341210000045025501004010010000401001000057474680000133831341213412602630813711750100402001000012020020000134121341211502011009910040100100001000003210119111340940000401001341313413134131341313413
502041341210000045025501004010010000401001000057474680000133831341213412555333533711750100402001000012020020000134121341211502011009910040100100001000003210119111340940000401001341313413134131341313413
5020413412100000450255010040100100004010010000574746800001338313412134125553335387117501004020010000120200200001341213412115020110099100401001000010000153210119111340840000401001341313413134131341313413
50204134121010009270255010040100100004010010000586432800001338313412134126026334237117501004020010000120200200001341213412115020110099100401001000010000153210119111340940000401001341313413134131341313413

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)191e1f3a3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)flags prf full (73)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)acc2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
5002413383100000000045255001040010100004001010000573456800000133531338213382595139133710950010400201000012002020000133821338211500211091040010100001000003142819971337940072400101338313383133831338313383
50024133821000000000452550010400101000040010100005734568000011335313382133825580391337109500104002010000120020200001338213382115002110910400101000010000031401019981337940000400101338313383133831338313383
5002413382100000000045255001040010100004001010000573456800001133531338213382558039133710950010400201000012002020000133821338211500211091040010100001000003140819971337940000400101338313383133831338313383
5002413382100000000048255001040010100004001010000573456800001133531338213382558039133710950010400201000012002020000133821338211500211091040010100001000003140919871337940000400101338313383133831338313383
50024133821000000088045255001040010100004001010000573456800001133531338213382558039133710950010400201000012002020000133821338211500211091040010100001000003140819781337940000400101338313383133831338313383
50024133821000000000452550010400101000040010100005734568000001335313382133825951391337109500104002010000120020200001338213382115002110910400101000010005703140719781337940000400101338313383133831338313383
50024133821010010000872550010400101000040010100005734568000001335313382133825580391337109500104002010000120020200001338213382115002110910400101000010000031408291091337940000400101338313383133831338313428
50024133821000000000512550010400101000040010100005734568000001335313382133825580325437109500104002010000120020200001338213382115002110910400101000010000031408191081337940000400101338313383133831338313383
5002413382100000300045255001040010100004020810000573456800001133531338213382595139133710950010400201000012002020000133821338211500211091040010100001000003140719881337940000400101338313383133831338313383
50024133821000000000452550010400101000040010100005734568000001335313382133825951325437109500104002010000120020200001338213382115002110910400101000010000031407191091337940000400101338313383133831338313383