Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

NGCS (register, 64-bit)

Test 1: uops

Code:

  ngcs x0, x0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004103570619172510001000100062250110351035805388210001000200010354011100110000073227119931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103570619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103570619172510001000100062250110351035805388210001000200010354011100110002073127119931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103570619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103570619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036
1004103570619172510001000100062250110351035805388210001000200010354011100110000073127119931000100010361036103610361036

Test 2: Latency 1->2

Code:

  ngcs x0, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003576006199202510100101001010064715204969550100351003586563873210100102002020010035401110201100991001010010001071012711999510000101001003610036100361003610036
1020410035750044199202510100101001010064715214969550100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715204969550100351003586563873210100102002020010035401110201100991001010010003071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715204969550100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715204969550100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003576006199202510100101001010064715204969550100351003586563873210100102002020010035401110201100991001010010001071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715204969550100351003586563873210100102002020010035401110201100991001010010000071012711999510000101001003610036100361003610036
102041003575006199202510195101001010064715204969550100351003586563873210100102002020010035401110201100991001010010001071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715214969550100351003586563873210100102002020010035401110201100991001010010001371012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715214969550100351003586563873210100102002020010035401110201100991001010010001071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002410035760619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064042722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010101064022722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750619918251001010010100946472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
100241003575011019918251001010035100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036
1002410035750619918251001010010100106472464969551003510035867838754100101002020020100354011100211091010010100064022722999710000100101003610036100361003610036

Test 3: Latency 1->3

Chain cycles: 1

Code:

  ngcs x0, x1
  tst x0, 1
  mov x0, 1
  mov x1, 2
  mov x2, 3

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003515001506199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
2020420035150061199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128112000220100101002003620036200362003620036
2020420035150082199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
20204200351500747199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
2020420035150061199262520200202002020012982774916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
20204200351490147199262520222202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
20204200351500145199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
20204200351500208199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
2020420035150061199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000001310128111999220100101002003620036200362003620036
2020420035150061199262520200202002020012976504916955020035200351740631748120200202003020020035641120201100992010000101310128111999220100101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200242003515000000006891991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010000010001270227111999520032100102003620036200362003620036
20024200351500000000611991825200202002020020129729749169552008120035174283175042002020020300202003564112002110920010000000301270127111999520010100102003620036200362003620036
200242003515000000008219918252002020020200201297297491695520035200351742831750420020200203002020035641120021109200100000001801270127111999520010100102003620036200362003620036
20024200351500000000821991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010002000001270227111999520010100102003620036200362003620036
20024200351500000000611991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010000010001270227111999520010100102003620036200362003620036
20024200351500000000611991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
20024200351500000000611991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000001471991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010020000001270127111999520010100102003620036200362003620036
200242003515000001200611991825200202002020020129729749169552003520035174283175042002020020300202008064112002110920010020010001270227111999520010100102003620036200362003620036
20024200351500000000611991825200202002020020129729749169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036

Test 4: Latency 3->2

Chain cycles: 1

Code:

  ngcs x0, x1
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020420035150000028701993025201002010020112129723304916955200352003517425717485201122022430236200356411202011009910020100101000000011113190162001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723314916955200352003517425817486201122022430236200356411202011009910020100101000000011113191162001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425717485201122022430236200356411202011009910020100101000000011113200162001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425717486201122022430236200356411202011009910020100101000000011113200162001220000201002003620036200362003620036
20204200351490000611993025201002010020112129723304916955200352003517425717486201122022430236200356411202011009910020100101000000011113190162001220000201002003620036200362003620036
202042003515000001471993025201002010020112129723304916955200352003517425817485201122022430236200356411202011009910020100101000000011113190162001220000201002006820036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425717485201122022430236200356411202011009910020100101000003011113200162001220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425817485201122022430236200356411202011009910020100101000000011113200162003220000201002003620036200362003620036
20204200351500000611993025201002010020112129723304916955200352003517425717486201122022430236200356411202011009910020100101000000011113200162001220000201002003620036200362003620036
202042003515000001561993025201002010020112129723304916955200352003517425817486201122022430236200356411202011009910020100101000000011113190162001220000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420035150941991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270227211999520000200102003620036200362003620036
200242003515011651991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270127121999520000200102003620036200362003620036
2002420035150611991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270227211999520000200102003620036200362003620036
20024200351501031991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270227121999520000200102003620036200362003620036
2002420035150611991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270227211999520000200102003620036200362003620036
20024200351501701991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270127121999520000200102003620036200362003620036
2002420035150611991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270227111999520000200102003620036200682003620036
20024200351492721991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270627211999520000200102006720036200362003620036
2002420035150611991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270127121999520000200102003620036200362003620036
2002420035150611991825200102001020010129724704916955200352003517428317504200102002030020200356411200211091020010100100001270127111999520000200102003620036200362003620036

Test 5: Latency 3->3

Code:

  ngcs x0, x1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750619927251020010200102106477121496955100351003586738873510210102242024810035401110201100991010000001117201601001310100101001003610036100361003610036
1020410035750619927251020010200102106477121496955100351003586738873510210102242024810035401110201100991010000001117201601001210100101001003610036100361003610036
1020410035750619927251020010200102106477120496955100351003586738873510210102242024810035401110201100991010000001117201601001310100101001003610036100361003610082
10204100357502129927251020010200102106477121496955100821003586738873610210102242024810035401110201100991010001001117201601001310100101001003610036100361003610036
1020410035750619927251020010200102106477121496955100351003586738873510210102242024810035401110201100991010000001117201601001310100101001003610036100361003610036
1020410035760619927251020010200102106477121496955100351003586738873510210102242024810035401110201100991010000001117201601001310100101001003610036100361003610036
1020410035750619927251020010200102106477121496955100351003586738873510210102242024810035401110201100991010000001117201601001310100101001003610036100361003610036
1020410035750619927251020010200102106477120496955100351003586738873510210102242024810035401110201100991010000001117201601001310100101001003610036100361003610036
10204100357501899927251020010200102106477120496955100351003586738873510210102242024810035401110201100991010001001117201601001310100101001003610036100361003610036
1020410035750619927251020010200102106477120496956100351003586738873610210102242024810035401110201100991010000001117201601001310100101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064022722999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000364022732999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064022732999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064022722999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064022722999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064022732999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969561003510035867838754100201002020020100354011100211091001000064032722999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064022722999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001000064032724999710010100101003610036100361003610036
100241003575006199182510020100201002064729614969551003510035867838754100201002020020100354011100211091001010064022722999710010100101003610036100361003610036

Test 6: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  ngcs x0, x8
  ands xzr, xzr, xzr
  ngcs x1, x8
  ands xzr, xzr, xzr
  ngcs x2, x8
  ands xzr, xzr, xzr
  ngcs x3, x8
  ands xzr, xzr, xzr
  ngcs x4, x8
  ands xzr, xzr, xzr
  ngcs x5, x8
  ands xzr, xzr, xzr
  ngcs x6, x8
  ands xzr, xzr, xzr
  ngcs x7, x8
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602045344240000123725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011002192253401160000801005340553405534055340553405
1602045340440000123725160100160100160100106358814950324534045340433339033335916010016020016020053404521116020110099100160100100002001011002192253477160278801005340553405534055340553405
160204534044000003725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011024192253401160000801005340553405534055340553405
160204534044000003725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011002192253401160000801005340553405534055340553405
160204534044000003725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011002194253439160000801005340553405534055340553405
160204534044001005725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011004192253401160000801005340553405534055340553405
160204534044000003725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011002192253401160000801005340553405534055340553405
160204534044000003725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011002192253401160000801005340553405534055340553405
1602045340440000068925160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011002192253401160000801005340553405534055340553405
160204534044000003725160100160100160100106358804950324534045340433339033335916010016020016020053404521116020110099100160100100000001011003192453401160000801005340553405534055340553405

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600245339040011011177912516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100300100221352320192112022533701600002011800105337553375533755337553375
16002453374399000000852516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221347328192111820533701600002011800105337553375533755337553375
160024533743990000033432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221354318191111520533701600002111800105337553375533755337553375
16002453374400000000432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221367319192111921533701600002111800105337553375533755337553375
16002453374400000000432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221363318192112117533701600002011800105337553375533755337553375
16002453374400000000432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221361331192112018533701600002111800105337553375533755337553375
1600245337440000000068442516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221352317192111921533701600002111800105337553375533755337553375
16002453374400111000432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221351320191111720533701600002111800105337553375533755337553375
16002453374400000000432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100010100221353320192111921533701600002111800105337553375533755337553375
16002453374399000000432516001016001016001010293881110495029453374533743333133335116001016002016002053374521116002110910160010100000100221350326192111920533701600002011800105337553375533755337553375

Test 7: throughput

Count: 4

Code:

  fcmp s0, s0
  ngcs x0, x4
  ngcs x1, x4
  ngcs x2, x4
  ngcs x3, x4
  mov x4, 5
  mov x5, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020413464100000045255010040100100004010010000587442800001133831341213412553830813711750100402001000080200200001341213412115020110099100401001000010000003210219241340840000401001341313413134901341313413
502041347610100013545455010040100100004010010000574746800001133831341213412602633423711750100402001000080200200001347213412115020110099100401001000010020003210234221340840000401001341313413134131341313413
50204134121000001245255010040100100004010010000574746800001133831341213412602633423711750100402001000080200200001341213412115020110099100401001000010000003210319221340940000401001341313413134131341313413
5020413412100000066255010040100100004010010000587442800001133831341213412602633423711750100402001000080200200001341213412115020110099100401001000010000003210219221340940000401001341313413134131341313413
5020413412100000045255012540100100004010010000587442800001133831341213420602624463711750100402001000080200200001341213412115020110099100401001000010000003212219221340840000401001342113421134131341313413
5020413412100100045255010040125100004010010000587442800001133911341213420610324443712550125402001000080200200001341213412115020110099100401001000010001303210419441340940000401001341313413134131341313413
5020413412100000045255010040100100004010010000574746800001133831341213412602633533711750100402001000080200200001341213412115020110099100401001000010000303210219621341740000401001341313413134131341313413
5020413412101000045255010040100100004010010000574746800001133831347313474555330813711750100402001002680200200001342013420115020110099100401001000010000623210219321340840000401001341313413134131341313413
5020413412100000045255010040100100004010010000574746800001133831342013412555324213712550100402001000080200200001341213412115020110099100401001000010000003210219221340940000401001341313413134131341313413
5020413412100000045255010040100100004010010000574746800001133831341213412553833423711750100402001000080200200001341213412115020110099100401001000010000003210219221340840000401001341313413134131341313413

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50024134081000872550010400101000040010100005734568000001335313382133825580325437109500104002010000800202000013382133821150021109104001010000100314000021900022133794000000400101338313383133831338313383
50024133821010452550010400101000040010100005734568000001335313382133825580325437109500104002010000800202000013382133821150021109104001010000100314000021900022133794000000400101338313383133831338313383
50024133821000452550010400101000040010100005734568000001335313382133825580391337109500104002010000800202000013382133821150021109104001010000100314000021900022133794000000400101338313383133831338313383
50024133821000452550010400101000040010100005734568000001335313382133825580325437109500104002010000800202000013382133821150021109104001010000100315900021900022133794000000400101338313383133831338313383
50024133821000992550010400101000040010100005734568000011335313382133825951391337109500104002010000800202000013382133821150021109104001010000100314030141920143133794000000400101338313383133831338313383
50024133821000452550010400101000040010100005734568000001335313382133825580391337109500104002010000800202000013382133821150021109104001010000100314231141900022133794000000400101338313383133831338313383
50024133821000452550010400101000040010100005734568000011335313382133825580391337109500104002010000800202000013382133821150021109104001010000100314000021901033133794000000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338259513254371095001040020100008002020000133821338211500211091040010100001003140000219000221337940000200400101344113383133831338313383
50024133821000452550010400101000040010100005734568000001335313382133825580325437109500104002010000800202000013382133821150021109104001010000100314000021900022133794000000400101338313383133831338313383
50024133821000452550010400101000040010100005734568000001335313382133825580391337109500104002010000800202000013382133821150021109104001010000100314000021900022133794000000400101338313383133831338313383