Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

NGCS (register, 32-bit)

Test 1: uops

Code:

  ngcs w0, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
10041035706191725100010001000622501103510358053882100010002000103540111001100001273227229931000100010361036103610361036
1004103570619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
10041035801429172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036
1004103580619172510001000100062250110351035805388210001000200010354011100110000073227229931000100010361036103610361036

Test 2: Latency 1->2

Code:

  ngcs w0, w0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000371022711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000071012711999510000101001003610036100361003610036
10204100357500255992025101001010010100647152496955100351003586563873210100102002020010035401110201100991001010010001271012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001001071012711999510000101001003610036100361003610036
102041003575006199202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001000071012711999510000101001003610036100361003610036
1020410035750015499202510100101001010064715249695510035100358656387321010010200202001003540111020110099100101001001071012711999510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575006199182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064042765999710000100101003610036100361003610036
100241003575006199182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064062755999710000100101003610036100361003610036
100241003575006199182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064052766999710000100101003610036100361003610036
1002410035750018799182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064062744999710000100101003610036100361003610036
100241003575006199182510010100101001064724649695510035100358678387541001010020202141003540111002110910100101000064062756999710000100101003610036100361003610036
1002410035750010799182510010100101001064724649695510035100358678387801001010020200201003540111002110910100101000264062766999710000100101003610036100361008410036
1002410035750010399182510010100101001064724649695510035100358678387541001010020200201008240111002110910100101000064062765999710000100101003610036100361003610036
100241003575006199182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064042756999710000100101003610036100361003610036
1002410035750934699182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064062756999710000100101003610130100361003610036
100241003575006199182510010100101001064724649695510035100358678387541001010020200201003540111002110910100101000064072766999710000100101003610036100361003610036

Test 3: Latency 1->3

Chain cycles: 1

Code:

  ngcs w0, w1
  tst x0, 1
  mov x0, 1
  mov x1, 2
  mov x2, 3

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2020420035150000009319926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201000000001310128111999220100101002003620036200362003620036
20204200351500000012419926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201000001001310128111999220100101002003620173200362003620036
20204200351500000012419926252020020200202001298912049169552003520035174063174812020020200302002003564112020110099201000000001310128111999220100101002003620036200362003620036
2020420098150000008219926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201000000001310128111999220100101002003620036200362003620036
202042003515000570014519926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201000000001310128111999220100101002003620036200362003620036
20204200351500000012619926252020020200202001297650049170012003520035174063174812020020200302002003564112020110099201000031001310128111999220100101002003620036200362003620036
2020420035150009006119926252020020200202001297650049169552003520035174068175082020020200302002003564112020110099201000000001310128111999220100101002003620036200362003620036
2020420035149000008219926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201000000031310128111999220100101002003620036200362003620036
20204200351500000012419926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201002400001310128111999220168101002003620036200362003620036
20204200351502100018719926252020020200202001297650049169552003520035174063174812020020200302002003564112020110099201000000001310128111999220100101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420035150000000019519918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270227211999520010100102003620036200362003620036
200242003515000000006119918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000006119918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000003001270127111999520010100102003620036200362003620036
2002420035150000000012419918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127121999520010100102003620036200362003620036
2002420035150000000036319918252002020020200201297297149169552003520035174283175042002020020300202006664112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000006119918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000008219918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000006119918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000006119918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036
200242003515000000006119918252002020020200201297297149169552003520035174283175042002020020300202003564112002110920010000000001270127111999520010100102003620036200362003620036

Test 4: Latency 3->2

Chain cycles: 1

Code:

  ngcs w0, w1
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003514910166199302520100201002011212972331491695502003520035174257174862011220224302362003564112020110099100201001010000111132016002001220000201002003620036200362003620036
20204200351500061199302520100201002011212972330491695502003520035174257174852011220224302362003564112020110099100201001010000111132016002001220000201002003620036200362003620036
202042003515000149199302520100201002011212972331491700102003520035174258174862011220224302362003564112020110099100201001010000111132016112001220000201002003620036200362003620036
20204200351500061199302520100201002011212972331491695502003520035174258174852011220224302362003564112020110099100201001010000111131916002001220000201002003620036200362003620036
20204200351500061199302520100201002011212972331491695502003520035174257174862011220224302362003564112020110099100201001010000111131916002001220000201002003620036200362003620036
20204200351500061199302520100201002011212972331491695502003520035174257174852011220224302362003564112020110099100201001010000111131916012001220000201002003620036200362003620036
20204200351500061199302520100201002011212972331491695502003520035174257174852011220224302362003564112020110099100201001010000111131916002001220000201002003620036200362003620036
20204200351500082199302520100201222011212972330491695502003520035174258174852011220224302362003564112020110099100201001010000111131916002001220000201002003620036200362003620036
20204200351500061199302520100201002011212972331491695502003520035174258174852011220224302362003564112020110099100201001010000111131916002001220000201002003620036200362003620036
20204200351490061199302520100201002011212972331491695502003520035174257174852011220224302362003564112020110099100201001010000111132016002001220000201002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20024200351500214199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420066150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035149061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
20024200351500124199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270127111999520000200102003620036200362003620036
2002420035150061199182520010200102001012972470491695520035200351742831750420010200203002020035641120021109102001010010001270227111999520000200102003620036200362003620036

Test 5: Latency 3->3

Code:

  ngcs w0, w1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750536992725102001020010210647712149695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712049695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712149695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712149695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712149695610035100358673887351021010224202481003540111020110099101000001117201611001310100101001003610036100361003610036
102041003575061992725102001020010210647712149695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712149695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712049695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712149695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036
102041003575061992725102001020010210647712049695510035100358673887351021010224202481003540111020110099101000001117201601001310100101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100357837561991825100201002010020647296049695510035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
10024100357730061991825100201002010020647296049695610035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
10024100357736961991825100201002010020647296049695610035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
100241003578061991825100201002010020647296049695510035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
100241003578061991825100201002010020647296049695610035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
1002410035750844991825100201002010020647296049695510035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
100241003575061991825100201002010020647296049695610035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
1002410035750664991825100201002010020647296049695510035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
1002410035750604991825100201002010020647296049695510035100358678387541002010020200201003540111002110910010064022722999710010100101003610036100361003610036
10024100357542619918251002010020100206472960496955100351003586783875410020100202002010035401110021109100100640327221002410010100101003610036100361003610036

Test 6: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  ngcs w0, w8
  ands xzr, xzr, xzr
  ngcs w1, w8
  ands xzr, xzr, xzr
  ngcs w2, w8
  ands xzr, xzr, xzr
  ngcs w3, w8
  ands xzr, xzr, xzr
  ngcs w4, w8
  ands xzr, xzr, xzr
  ngcs w5, w8
  ands xzr, xzr, xzr
  ngcs w6, w8
  ands xzr, xzr, xzr
  ngcs w7, w8
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss data (0b) | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602045340940000000372516010016010016010010635880495032453404534043333903333591601001602001602005340452111602011009910016010010000010110061966534011600002801005340553405534055340553405
1602045340440000000372516010016010016010010635880495032453404534043333903333591601001602001602005340452111602011009910016010010000010110061955534011600000801005340553405534055340553405
1602045340440000000372516010016010016010010635880495032453404534043333903333591601001602001602005340452111602011009910016010010000310110051955534011600000801005340553405534055340553405
1602045340440000000372516010016010016010010635881495032453404534043333903333591601001602001602005340452111602011009910016010010000010110051955534011600000801005340553405534055340553405
16020453404400011013730416017816017716010010635880495032453404534043333903333591601001602731602005340452111602011009910016010010000010113073466534011600000801005340553405534055340553405
1602045340440000060372516010016010016010010635881495032453404534043333903333591601001602001602005340452111602011009910016010010000010110051966534011600000801005340553405534055340553405
16020453404400000005122516010016010016010010635880495032453404534043333903333591601001602001602005340452111602011009910016010010000010110051965534011600000801005340553405534055340553460
1602045340440000000372516010016010016010010635881495032453404534043333903333591601001602001602005340452111602011009910016010010000010110061946534011600000801005340553405534055340553405
16020453404400000005122516010016010016010010635880495032453404534043333903333591601001602001602005340452111602011009910016010010000010110061965534011600000801005340553405534055340553405
1602045340440000000372516010016010016010010635880495032453404534043333903333591601001602001602005340452111602011009910016010010000010110651956534011600000801005340553405534055340553405

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024533803990000432516001016001016001010293880154950294533745337433331033335116001016002016002053374521116002110910160010100006100228212119211199533701600002011800105337553375533755337553375
16002453374400000064251600101600761600101029388215495029453374533743333103333511600101600201600205337452111600211091016001010000010022841919211199533701600002011800105337553375533755337553375
16002453374400000043251600101600101600101029388015495029453374533743333103333511600101600201600205337452111600211091016001010000010022841919211199533701600002011800105337553375533755337553375
1600245337440000004925160010160010160010102938811049502945337453374333310333351160010160020160020533745211160021109101600101024001002284119192612719533701600002011800105337553375533755337553375
160024533744000000492516001016001016001010293881154950294533745337433331033335116001016002016002053374521116002110910160010100000100228411919211919536491600002011800105337553375533755366353375
1600245337440001102904325160010160010160083102938821549502945337453374333770163344216001016024116002053374521116002110910160010100000100228511919211919533701600002011800105337553375533755337553375
160024533744001000702516001016001016001010293880154950294533745337433331033335116001016002016002053374521116002110910160010100000100241162719422199533701600004020800105337553375533755337553375
160024533744000000492516001016001016001010293880154950294533745337433331033337216001016002016002053411521116002110910160010100003100228511919211199533701600002011800105337553375533755337553375
16002453374400000043251600101600101600101029388115495029453374533743333103333511600101600201600205337452111600211091016001010000010022851919211197533701600002011800105337553375533755337553375
16002453374400100070825160010160010160010102938801549502945337453374333310333351160010160020160020533745211160021109101600101000001002411621919422719533701600004020800105337553375533755337553375

Test 7: throughput

Count: 4

Code:

  fcmp s0, s0
  ngcs w0, w4
  ngcs w1, w4
  ngcs w2, w4
  ngcs w3, w4
  mov x4, 5
  mov x5, 6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020413415101045255010040100100004010010000574746800000133831341213412553833533711750100402001000080200200001341213412115020110099100401001000010003210119111340940000401001341313413134131341313413
50204134121014545255010040100100004010010000587442800001133831341213412602630813711750100402001000080200200001341213412115020110099100401001000010003210119111340840000401001341313413134131341313413
50204134121001545255010040100100004010010000587442800000133831341213412602630813711750100402001000080200200001341213412115020110099100401001000010033210119111340940000401001341313413134131341313413
5020413412100045255010040100100004010010000574746800001133831341213412553833533711750100402001000080200200001341213412115020110099100401001000010003210119111340940000401001341313413134131341313413
5020413412101045255010040100100004010010000587442800000133831341213412553830813711750100402001000080200200001341213412115020110099100401001000010003210119111355040000401001341313413134131341313413
50204134121012445255010040100100004010010000587442800000133831341213412553833423711750100402001000080200200001341213412115020110099100401001000010003210119111340940000401001341313413134131341313413
502041341210028545255010040100100004010010000574746800000133831341213412555333423711750100402001000080200200001341213412115020110099100401001000010003210119111340840000401001341313413134131341313413
5020413412100045255010040100100004010010000574746800000133831341213412555330813711750100402001000080200200001341213412115020110099100401001000010003210119111340840000401001341313413134131341313413
5020413412100045255010040100100004010010000574746800000133831341213412602630813711750100402001000080200200001341213412115020110099100401001000010003210119111340840000401001341313413134131341313413
5020413412101045255010040100100004010010000587442800001133831341213412555333533714650100402001000080200200001341213412115020110099100401001000010003210119111340940000401001341313413134131341313413

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500241338310004525500104001010000400101000057345680000013353133821338259513913371095001040020100008002020000133821338211500211091040010100001000031400319111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338259513913371095001040020100008002020000133821338211500211091040010100001000031403119111337940000400101338313383133831338313383
500241338210104525500104001010000400101000057345680000013353133821338255803254371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338259513913371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338255803254371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338259513913371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338255803913371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338255803913371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
500241338210004525500104001010000400101000057345680000013353133821338259513254371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383
5002413382100484525500104001010000400101000057345680000013353133821338255803913371095001040020100008002020000133821338211500211091040010100001000031400119111337940000400101338313383133831338313383