Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SUBS (register, 32-bit)

Test 1: uops

Code:

  subs w0, w0, w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | ld unit uop (a6) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100410357216191725100010001000622501035103580538821000100020001035401110011000073327239931000100010361036103610361036
10041035806191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036
10041035806191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036
10041035706191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610841036
10041035706191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036
10041035806191725102210001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036
10041035806191725100010001000622501035103580538821000100020001035401110011000073227339931000100010361036103610361036
100410358126191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036
10041035706191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036
10041035866191725100010001000622501035103580538821000100020001035401110011000073327339931000100010361036103610361036

Test 2: Latency 1->2

Code:

  subs w0, w0, w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000371012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000971012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575084992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
1020410035750233992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575061992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
1020410035750189992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575000619918251001010010100106472460496955100351003586783875410010100202002010035401110021109101001010064022722999710000100101003610036100361003610036
100241003575000619918251001010010100106472460496955100351003586783875410010100202002010035401110021109101001010064022722999710000100101003610036100361003610036
100241003575000619918251001010010100106472460496955100351003586783875410010100202002010035401110021109101001010064022722999710000100101003610036100361003610036
100241003575000619918251001010010100106472460496955100351003586783875410010100202002010035401110021109101001010364022722999710000100101003610036100361003610036
100241003575000619918251001010010100106472460496955100351003586783875410010100202002010035401110021109101001010064022722999710000100101003610036100361003610036
1002410035760006199182510010100101001064724604969551003510035867838754100101002020020100354011100211091010010101864022722999710000100101003610036100361003610036
10024100357500061991825100101001010010647246049695510035100358678387541001010020200201003540111002110910100101011764022722999710000100101003610036100361003610036
10024100357500061991825100101001010010647246049695510035100358678387541001010020200201003540111002110910100101012964022722999710000100101003610036100361003610036
100241003575000619918251001010010100106472461496955100351003586783875410010100202002010035401110021109101001010064022722999710000100101003610036100361003610036
100241003575010619918251001010010100106472460496955100351003586783875410010100202002010035401110021109101001010064022723999710000100101003610036100361003610036

Test 3: Latency 1->3

Code:

  subs w0, w1, w0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204100357500849920251010010100101006471521496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471521496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
10204100357600619920251010010100101006471521496955100351003586563873210100102002020010035401110201100991001010010006071012711999510000101001003610036100361003610036
102041003575001059920251010010100101006471521496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575003479920251010010100101006471520496955100351003586563873210100102002020010035401110201100991001010010013071012711999510000101001003610036100361003610036
102041003575002129920251010010100101006471520496955100351003586563876110100102002020010035401110201100991001010010009071012721999510000101001003610036100361003610036
1020410035750033499202510100101001010064715204969551003510035865638732101001020020200100354011102011009910010100100012071012711999510000101001003610036100361003610036
10204100357500619920251010010100101006471520496955100351003586563873210100102002020010035401110201100991001010010016071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715204969551003510035865638732101001020020200100354011102011009910010100100012071012711999510000101001003610036100361003610036
102041003575001059920251010010100101006471520496955100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002410035750000619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100364022722999710000100101003610036100361003610036
1002410035750000619918251001010010100106472464969551003510035867898754100101002020020100834011100211091010010100064022722999710000100101003610036100361003610036
1002410035750000619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100364032722999710000100101003610036100361003610036
1002410035750000619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
10024100357500001039918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750100619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
10024100357500002519918251001010010100106472464969551003510035867838754100101002020020100354041100211091010010100064022722999710000100101003610036100361003610036
1002410035750000619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750000619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
10024100357500008299182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101006364022722999710000100101003610036100361003610036

Test 4: Latency 4->2

Chain cycles: 1

Code:

  subs w0, w1, w2
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 09 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003515000008219930252010020100201121297233049169552003520035174257174852011220224302362003564112020110099100201001010000000111132016102001220000201002003620036200362003620036
202042003515000006119930252010020100201121297233049169552003520035174258174862011220224302362003564112020110099100201001010000000111132016002001220000201002003620036200362003620036
202042003515000008419930252010020115201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000111131916002001220000201002003620036200362003620036
2020420035150000043219930252010020100201121297233049169552003520035174258174852011220224302362003564112020110099100201001010000000111132016002001220000201002003620036200362003620036
20204200351500002114519930252010020100201121297233049169552003520035174257174852011220224302362003564112020110099100201001010000000111131916102001220000201002003620036200362003620036
202042003515000006119930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000111131916102001220000201002003620036200362003620036
202042003515001006119930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000111131916002001220000201002003620036200362003620036
202042003515000006119930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000111132016102001220000201002003620036200362003620036
202042003515000006119930252010020100201121297233049169552003520035174258174862011220224302362003564112020110099100201001010000000111132016012001220000201002003620036200362003620036
202042003515000006119930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000111131916102001220000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003564112002110910200101001000301270227111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003564112002110910200101001000001270227111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003564112002110910200101001001301270127111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003564112002110910200101001000001270127111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174563175042001020020300202003564112002110910200101001000001270127121999520000200102003620036200362003620036
2002420035150061199182520010200102001012972471491695520035200351742831750420010200203002020035641120021109102001010010001801270127111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003564112002110910200101001002301270127111999520000200102003620036200362003620036
2002420035150156119918252001020010200101297247149169552003520035174283175042001020020300202003564112002110910200101001000001270127111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247049169552003520035174283175042001020020300202003564112002110910200101001010001270127111999520000200102003620036200362003620036
200242003515006119918252001020010200101297247049169552003520035174283175042001020020300202003564112002110910200101001000001270127111999520000200102003620036200362003620036

Test 5: Latency 4->3

Chain cycles: 1

Code:

  subs w0, w1, w2
  cset x2, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20204200351500000028119930252010020100201121297233049169552003520035174258174852011220224302362003564112020110099100201001010000000001111319016102001220000201002003620036200362003620036
202042003515000000611993025201002010020112129723304916955200352003517425717486201122022430236200356411202011009910020100101000001010201111319016012001220000201002003620036200362003620036
2020420035150000006119930252010020100201121297233049169552003520035174258174852011220224302362003564112020110099100201001010000000001111320016002001220000201002003620036200362003620036
2020420035150000006119930252010020100201121297233049169552003520035174257174852011220224302362003564112020110099100201001010000000001111320016102001220000201002003620036200362003620036
20204200781500000072619930252010020100201121297233149169552003520035174258174852011220224302362003564112020110099100201001010000000001111319016012001220000201002003620036200362003620036
2020420035150000006119930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000001111319016012001220000201002003620036200362003620036
2020420035150000008219930252010020100201121297233049169552003520035174258174852011220224302362003564112020110099100201001010000000001111320016002001220000201002003620036200362003620036
20204200351500000017019930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000001111320016102001220000201002003620036200362003620036
2020420035150000006119930252010020100201121297233049169552003520035174257174862011220224302362003564112020110099100201001010000000001111320016102001220000201002003620036200362003620036
2020420035150000006119930252010020100201121297233149169552003520035174257174862011220224302362003564112020110099100201001010000000001111320016102001220000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024200351500441199182520010200102001012972470049169552003520035174283175042001020020300202003564112002110910200101001010031272827461999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470149169552003520035174283175042001020020300202003564112002110910200101001000211272527581999520000200102003620036200362003620036
20024200351502461199182520010200102001012972470149169552003520035174283175042001020020300202003564112002110910200101001000181270527691999520000200102003620036200362003620036
2002420035150126119918252001020010200101299733014916955200352003517428317504200102002030020200356411200211091020010100100091272927751999520000200102003620036200362003620036
2002420035152061199182520010200102001012972470149169552003520035174283175042001020020300202003564112002110910200101001000151272627961999520000200102003620036200362003620036
20024200351490536199182520010200102001012972470049169552003520035174283175042001020020300202003564112002110910200101001010012729279101999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470149169552003520035174283175042001020020300202003564112002110910200101001000121272727781999520000200102003620036200362003620036
20024200351500147199182520010200102001012972470149169552003520035174283175042001020020300202003564112002110910200101001000121272842851999520000200102003620036200362003620036
20024200351502759819918252001020010200101297247014916955200352003517428317504200102002030020200356411200211091020010100103081112727278101999520000200102003620036200362003620036
200242003515002731991825200102001020010129724701491695520035200351742817175042001020020300202003564112002110910200101001000121272927591999520000200102003620036200362003620036

Test 6: throughput

Count: 8

Code:

  subs w0, w8, w9
  subs w1, w8, w9
  subs w2, w8, w9
  subs w3, w8, w9
  subs w4, w8, w9
  subs w5, w8, w9
  subs w6, w8, w9
  subs w7, w8, w9
  mov x8, 9
  mov x9, 10
  mov x10, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020426762201352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110219112673180000801002673626736267362673626736
8020426735200352580100801008010040084504923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
8020426735200352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119122673180000801002673626736267362673626736
8020426735200352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
8020426735200352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
8020426735201352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
8020426735200352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
8020426780201352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
80204267352001252580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736
8020426735200352580100801008010040050004923655267352673516672316690801008020016020026735391180201100991008010010000005110119112673180000801002673626736267362673626736

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002426711200004125800108001080010400050049236250267052670516665316683800108002016002026705391180021109108001010039065020121811122670280000800102670626706267062670626706
800242670520000351488001080010800104000500492362502670526705166653166838001080020160020267053911800211091080010100001085020131812102670280000800102670626706267062670626706
800242670520000352580010800108001040005004923625026705267051666531668380010800201600202670539118002110910800101003035020101811122670280000800102670626706267062670626706
80024267052000035258001080010800104000500492362502670526705166653166838001080020160020267053911800211091080010100009502013187112670280000800102670626706267062670626706
80024267052000132352580010800108001040005004923625026705267051666531668380010800201600202670539118002110910800101001005020111812122670280000800102670626706267062670626706
800242670520000352580010800108001040005004923625026705267051666531668380010800201600202670539118002110910800101000090502071811112670280000800102684626706267062670626706
800242670520000352580010800108001040005004923625026705267051666531668380010800201600202670539118002110910800101000065020101812112670280000800102670626706267062670626706
80024267052000032025800108001080010400050049236250267052670516665316683800108002016002026705391180021109108001010000162502091812132670280000800102670626706267062670626706
800242670520000352580010800108001040005004923625026705267051666531668380078800201600202670539118002110910800101000012502071812102670280000800102670626706267062670626706
80024267052000035258001080010800104000500492362502670526705166653166838001080020160020267053911800211091080010100001145020131812122670280000800102670626706267062670626706