Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SBCS (32-bit)

Test 1: uops

Code:

  sbcs w0, w0, w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100410357061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
100410358061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
100410358061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
10041035830061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
1004103583619172510001000100062250103510358053882100010003000103540111001100001573127119931000100010361036103610361036
100410358061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
100410358061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
100410358061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
10041035829161917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036
100410357061917251000100010006225010351035805388210001000300010354011100110000073127119931000100010361036103610361036

Test 2: Latency 1->2

Code:

  sbcs w0, w0, w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204100357606199202510100101001010064715204969550100351003586563873210100102003020010035401110201100991001010010007571012711999510000101001003610036100361003610036
1020410035750719920251010010100101006471520496955010035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955010035100358656387321010010200302001003540111020110099100101001000371012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955010035100358656387321010010200302001003540111020110099100101001008371012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955010035100358656387321010010200302001003540111020110099100101001001071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955010035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471520496955010035100358656387321010010200302001003540111020110099100101001001671012711999510000101001003610036100361003610036
102041003575020239920251010010100101006471521496955010035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
10204100357506199202510100101001010064715204969550100351003586563873210100102003020010035401110201100991001010010047971012711999510000101001003610036100361003610036
1020410035750619920251010010100101006471521496955010035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575276199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003575126199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003576456199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100664022722999710000100101003610036100361003610036
100241003575666199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
10024100357506199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003575012499182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
10024100357506199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
10024100357536199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003575516199182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003575010399182510010100101001064724614969551003510035867838754100101002030020100354011100211091010010100064022722999710000100101003610036100361003610036

Test 3: Latency 1->3

Code:

  sbcs w0, w1, w0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003576910399202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
10204100357506199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102003020010035401110201100991001010010018071012721999510000101001003610036100361003610036
102041003576216199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
10204100357506199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001000371012711999510000101001003610036100361003610036
10204100357506199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001001073712711999510000101001003610036100361003610036
10204100357506199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001000071012711999510000101001003610036100361003610036
10204100357506199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001005071012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102003020010035401110201100991001010010008471012711999510000101001003610036100361003610036
10204100357506199202510100101001010064715249695510035100358656387321010010200302001003540111020110099100101001001071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss instruction (0a) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575051619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052745999710000100101003610036100361003610036
10024100357500619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052754999710000100101003610036100361003610036
10024100357500619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052754999710000100101003610036100361003610036
100241003575002109918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052745999710000100101003610036100361003610036
100241003575030619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010464042745999710000100101003610036100361003610036
100241003575001249918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064042755999710000100101003610036100361003610036
10024100357600619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052754999710000100101003610036100361003610036
100241003575001249918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052755999710000100101003610036100361003610036
10024100357500619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064042755999710000100101003610036100361003610036
100241003575045619918251001010010100106472461496955100351003586783875410010100203002010035401110021109101001010064052754999710000100101003610036100361003610036

Test 4: Latency 1->4

Chain cycles: 1

Code:

  sbcs w0, w1, w2
  tst x0, 1
  mov x0, 1
  mov x1, 2
  mov x2, 3

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20204200351500000180061199262520200202002020012976501049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000061199262520200202002020012976501049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000061199262520200202002020012976501049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000061199262520200202002020012976501049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
20204200351500000300061199262520200202002020012976500049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000082199262520200202002020012976500049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000061199262520200202002020012976500049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
20204200351500000000251199262520200202002020012976501049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000061199262520200202002020012976501049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036
2020420035150000000061199262520200202002020012976500049169552003520035174063174812020020200402002003564112020110099201000000000131000128111999220100101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20025200351580006119918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000211999520010100102003620036200362003620036
200242003515000018919918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000111999520010100102003620036200362003620036
200242003514900456119918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000111999520010100102003620036200362003620036
20024200351500036119918252002020020201021297297149169552003520035174283175042002020020400202003564112002110920010000001270236000111999520010100102003620036200362003620036
200242003515000015619918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000111999520010100102003620036200362003620036
20024200351500006119918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000121999520010100102003620036200362003620036
20024200351500006119918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000111999520010100102003620036200362003620036
20024200351500006119918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000111999520010100102003620036200362003620036
2002420035150001476119918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000111999520010100102003620036200362003620036
200242003515000093519918252002020020200201297297149169552003520035174283175042002020020400202003564112002110920010000001270127000211999520010100102003620036200362003620036

Test 5: Latency 4->2

Chain cycles: 1

Code:

  sbcs w0, w1, w2
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020420035150116119930252010020100201121297233049169552003520035174257174862011220224402482003564112020110099100201001010001351111320116112001520000201002003620036200362003620036
20204200351501161199302520124201002011212972330491695520035200351742581748520112202244024820035641120201100991002010010100001111319116112001520000201002003620036200362003620036
2020420035150114540199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100001111320116112001520000201002003620036200362003620036
20204200351501161199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100201111319116112001520000201002003620036200362003620036
20204200351501161199302520100201002011212972331491695520035200351742581748520112202244024820035641120201100991002010010100001111320116112001520000201002003620036200362003620036
2020420035150116119930252010020100201121297233149169552003520035174257174852011220224402482003564112020110099100201001010001111111319116112001520000201002003620036200362003620036
20204200351501161199302520122201002011212972331491695520035200351742571748620112202244024820035641120201100991002010010100101111319116112001520000201002003620036200362003620036
202042003515011821993025201002010020112129723304916955200352003517425817486201122022440248200351271120201100991002010010100001111319116112001520000201002003620036200362003620036
20204200351501161199302520100201002011212972330491695520035200351742581748520112202244024820035641120201100991002010010100101111320116112001520000201002003620036200362003620036
20204200351501161199302520100201002011212972331491695520035200351742581748620112202244024820035641120201100991002010010100001111320116112001520000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024200351500611991825200102001020010129724701491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724711491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724701491695520035200351742831750420010200204002020035641120021109102001010010101270127111999520000200102003620036200362003620036
200242003515006541991825200102001020010129724711491695520035200351742831750420010200204002020035641120021109102001010010031270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724701491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724701491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724711491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724711491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500821991725200102001020010129724701491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500611991825200102001020010129724701491695520035200351742831750420010200204002020035641120021109102001010010001270127111999520000200102003620036200362003620036

Test 6: Latency 4->3

Chain cycles: 1

Code:

  sbcs w0, w1, w2
  cset x2, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003515000000006119930252010020100201121297233049169552003520035174258174862011220224402482003564112020110099100201001010000000001111320016002001220000201002003620036200362003620036
202042003515000000001561993025201002010020112129723304916955200352003517425717486201122022440248200356411202011009910020100101000000011401111320016002001220000201002003620036200362003620036
202042003515000000006119930452014620100201121297233049169552003520035174257174862011220224402482003564112020110099100201001010000000001111319016002001220000201002003620036200362003620036
202042003515000000006119930252010020100201121297233149169552003520035174257174862011220224402482003564112020110099100201001010000000001111319016002001220000201002003620036200362003620036
202042003515000000006119930252010020100201121297233049169552003520035174258174852011220224402482003564112020110099100201001010000000001111319016002001220000201002003620036200362003620036
202042003515000000006119930252010020100201121297233049169552003520035174257174862011220224402482003564112020110099100201001010000000001111320016002001220000201002003620036200362003620036
202042003515000000006119930252010020100201121297233049169552003520035174257174862011220224402482003564112020110099100201001010000015010801111319016002001220000201002003620036200362003620036
202042003515000000006119930252010020100201121297233049169552003520035174257174862011220224402482003564112020110099100201001010000070001111319016002001220000201002003620036200362003620036
2020420035150000000061199302520100201002011212972331491695520035200351742571748620112202244024820035641120201100991002010010100000008101111319016002001220000201002003620036200362003620036
202042003515000000006119930252010020100201121297233049169552003520035174258174852011220224402482003564112020110099100201001010000030163001111319016002001220000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420035150000000910806119918252001020010200101297247014916955200352003517428317504200102002040020200356411200211091020010100100000000001270127121999520000200102003620036200362003620036
20024200351500000006006119918252001020010200101297247114916955200352003517428317504200102002040020200356411200211091020010100100000000001270227111999520000200102003620036200362003620036
20024200351500000000006119918452001020032200101297247114916955200352003517428317504200102002040020200356411200211091020010100100000000001270127111999520000200102003620036200362003620036
200242003515000000000010319918252001020010200101297247114916955200352003517428317504200102002040020200356411200211091020010100100000000001270127111999520000200102003620036200362003620036
20024200351490000006006119918252001020010200101297247114916955200352003517428317504200102002040020200356411200211091020010100100000000001270127111999520000200102003620036200362003620036
20024200351500000000006119918252001020010200101297247014916955200352003517428317504200102002040020200356411200211091020010100100000000001270127111999520000200102003620036200362003620036
20024200351500000000006119918252001020010200101297247114916955200352003517428317504200102002040020200356411200211091020010100100000000001270127111999520000200102003620036200362003620036
20024200351500000000006119918252001020010200101297247114916955200352003517428317504200102002040020200356411200211091020010100100000000001270127111999520000200102003620036200362003620036
20024200351500000000006119918252001020010200101297247104916955200352003517428317504200102002040020200356411200211091020010100100000100001270127111999520000200102003620036200362003620036
20024200351500001000006119918252001020010200101297247014916955200352003517428317504200102002040020200356411200211091020010100104000000001270127111999520000200102003620036200362003620036

Test 7: Latency 4->4

Code:

  sbcs w0, w1, w2
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750000061992725102001020010210647712104969551003510035867388735102101022430272100354011102011009910100001117200161001310100101001003610036100361003610036
10204100357600024061992725102001020010210647712104969561003510035867388735102101022430272100354011102011009910100001117200161001210100101001003610036100361008610036
1020410035750000061992725102001020010210647712104969561003510035867388735102101022430272100354011102011009910100001117200161001210100101001003610036100361003610036
10204100357500018061992725102001020010210647712104969551003510035867388735102101022430272100354011102011009910100001117200161001310100101001003610036100361003610036
10204100357500012061992725102001020010210647712104969551003510035867388735102101022430272100354011102011009910100001117200161001210100101001003610036100361003610036
1020410035750000061992725102001020010210647712104969551003510035867388735102101022430272100354011102011009910100001117200161001310100101001003610036100361003610036
1020410035750000061992725102001020010210648361104969551003510035867388735102101022430272100354011102011009910100001117190161001310100101001003610036100361003610036
10204100357500030536992725102001020010210647712104969551003510035867378735102101022430272100354011102011009910100001117200161001210100101001003610036100361003610036
10204100357500051061992725102001020010210647712104969551003510035867378736102101022430272100354011102011009910100001117200161001310100101001003610036100361003610036
1020410035750000061992725102001020010210647712104969551003510035867378735102101022430272100354011102011009910100001117200161001310100101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002410035750061991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101008410036100361003610036
100241003575012061991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101003610036100361003610036
10024100357502161991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101003610036100361003610036
100241003575029461991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101003610036100361003610036
100241003575029161991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064042733999710010100101003610036100361003610036
1002410035750061991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101003610036100361003610036
10024100357500103991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101003610036100361003610036
1002410035751661991872100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032736999710010100101003610036100361008310036
1002410035750061991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010064032733999710010100101003610036100361003610036
10024100357503361991825100201002010020647296149695510035100358678387541002010020300201003540111002110910010164032733999710010100101003610036100361003610036

Test 8: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  sbcs w0, w8, w9
  ands xzr, xzr, xzr
  sbcs w1, w8, w9
  ands xzr, xzr, xzr
  sbcs w2, w8, w9
  ands xzr, xzr, xzr
  sbcs w3, w8, w9
  ands xzr, xzr, xzr
  sbcs w4, w8, w9
  ands xzr, xzr, xzr
  sbcs w5, w8, w9
  ands xzr, xzr, xzr
  sbcs w6, w8, w9
  ands xzr, xzr, xzr
  sbcs w7, w8, w9
  mov x8, 9
  mov x9, 10
  mov x10, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602045343240000016925160100160100160100106358814950324053404534523333933335916010016020024020053453521116020110099100160100100010101101191153401160000801005340553405534055340553405
1602045340440000027625160100160100160100106358814950324053404534043333933335916010016020024020053404521116020110099100160100100010101101191153401160000801005340553405534055340553405
160204534044001012183025160100160100160100106358814950324053404534043333933335916010016020024020053404521116020110099100160100100000101101191153401160000801005340553405534055340553405
160204534044000002027251601001601001601001063588149503240534045340433339333359160100160200240200534045211160201100991001601001000015101101191153401160000801005340553405534055340553405
16020453404400000189025160100160100160100106358814950324053404534043333933335916010016020024031153404521116020110099100160100100000101101191153401160000801005340553405534055340553405
16020453404400000175925160100160100160215106358814950324053404534043333933335916010016020024020053404521116020110099100160100100000101101191153401160000801005340553405534055340553405
160204534044000003725160100160100160100106358814950324053404534043333933335916010016020024020053404521116020110099100160100100000101101191153401160000801005340553405534055340553405
16020453404400009114225160100160100160100106358814950324053404534043333933335916010016020024020053404521116020110099100160100100000101101191153401160000801005340553405534055340553405
16020453404400000193025160100160100160100106358814950324053404534043333933335916010016020024020053404521116020110099100160100100000101101191153401160000801005340553405534055340553405
1602045340440000014225160100160100160100106358814950324053404534043333933335916010016027324020053404521116020110099100160100100411755101101193253715160140801005340553405534055340553405

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024533804000000000000049251600101600101600101029388110495029453374533743333133335116001016002024002053374521116002110910160010100004900001002384113919111162653370160000168800105337553375533755337553375
160024533744001001000000092251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100002600001002385112219111282553370160000168800105337553375533755337553375
160024533744001001000000092251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100004000001002385112619111212553370160000168800105337553375533755337553375
1600245337440010010000000922516001016001016001010293881104950294533745337433331333351160010160020240020533745211160021109101600101000042471001002385112619111262653370160068168800105337553375533755337553375
160024533744001001000000092251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100005403001002385112719111252653370160000168800105337553375533755337553375
160024533743991001000000092251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100002600001002385111819111242053370160000168800105337553375533755337553375
160024533743991001000000092251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100002300001002385112519111262653370160000168800105337553375533755337553375
1600245337440010010000000115251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100005200001002385111619111261653370160000168800105337553375533755337553375
160024533744001001000000047225160010160010160010102938811549502945337453374333313333511600101600202400205337452111600211091016001010000423001002385112119111282053370160000168800105337553375533755337553375
160024533743991001000000092251600101600101600101029388115495029453374533743333133335116001016002024002053374521116002110910160010100003500001002385112719111272653370160000168800105337553375533755337553375

Test 9: throughput

Count: 4

Code:

  fcmp s0, s0
  sbcs w0, w4, w5
  sbcs w1, w4, w5
  sbcs w2, w4, w5
  sbcs w3, w4, w5
  mov x4, 5
  mov x5, 6
  mov x6, 7

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4d | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50204134551000000045013396255010040100100004010010000587442800001133831341213412602633423711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412101000304500255010040100100004010010000574746800000133831341213412602630813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412101000004500255010040100100004010010000574746800000133831341213412602633423711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412100001004500255010040100100004010010000574746800000133831341213412602630813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412100000004500255010040100100004010010000574746800000133831341213412553830813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340840000401001341313413134131341313413
5020413412100000004500255010040100100004010010000574746800000133831341213412602630813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340840000401001341313413134131341313413
5020413412101000004500255010040100100004010010000587442800000133831341213412555330813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412101000004500255010040100100004010010000574746800001133831341213412592933533711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412100000004500255010040100100004010010000574746800000133831341213412602630813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340940000401001341313413134131341313413
5020413412100000004500255010040100100004010010000587442800000133831341213412555330813711750100402001000012020020000134121341211502011009910040100100001000000003210119111340840000401001341313413134131341313413

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500241340410012450255001040010100004001010000573456800001133531338213382558032543710950010400201000012002020000133821338211500211091040010100001000314015190881337940000400101338313383133831338313383
50024133821000106025500104001010000400101000057345680000113353133821338255803913371405001040020100001200202000013382133821150021109104001010000100031408190881337940000400101338313383133831338313383
5002413382100045025500104001010000400101000057345680000113353133821338255803254371095001040020100001200202000013382133821150021109104001010000100031408190881337940000400101338313383133831338313383
50024133821002145025500104001010000400101000057345680000113353133821338255803913371095001040020100001200202000013382133821150021109104001010000100031408190881337940000400101338313383133831338313383
5002413382100345025500104001010000400101000057345680000113353133821338259513913371095001040020100001200202000013382133821150021109104001010000100031407190771337940000400101338313383133831338313383
5002413382100045025500104001010000400101000057345680000113353133821338255803254371095001040020100001200202000013382133821150021109104001010000100031406190881337940000400101338313383133831338313383
5002413382101045025500104001010000400101000057345680000113353133821338259513254371095001040020100001200202000013382133822150021109104001010000100031408190881337940000400101338313383133831338313383
5002413382100045025500104001010000400101000057345680000113353133821338259513254371095001040020100001200202000013382133821150021109104001010000100031407190781337940000400101338313383133831338313383
5002413382100045025500104001010000400101000057345680000113353133821338255803913371095001040020100001200202000013382133821150021109104001010000100031408190781337940000400101338313383133831338313383
5002413382100345025500104001010000400101000057345680000113353133821338259513913371095001040020100001200202000013382133821150021109104001010000100031408190981337940000400101338313383133831338313383