Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

MNEG (32-bit)

Test 1: uops

Code:

  mneg w0, w0, w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)031e3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst int alu (97)st unit uop (a7)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
10043033230251280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000003731161128631000100030343034303430343034
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000003731161128631000100030343034303430343034
1004303322061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
1004303322061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034
1004303323061280925100010001000161686403033303326763289110001000200030332961110011000000731161128631000100030343034303430343034

Test 2: Latency 1->2

Code:

  mneg w0, w0, w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)031e3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d cache writeback (a8)acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
102043003322506129809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
1020430033225012029809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100230710116122986310000101003003430034300343003430120
102043007722406129809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
1020430033225025129809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
1020430033225072629809251010010100101001665186049269533003330033285269287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
1020430033225072629809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
102043003322506129809251010010100101001665186149269533003330033285263287411010010200202003003329011102011009910010100100000710116102986310000101003003430034300343003430034
1020430076224034629809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
102043003322506129809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034
102043003322506129809251010010100101001665186049269533003330033285263287411010010200202003003329011102011009910010100100000710116112986310000101003003430034300343003430034

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)03091e3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d cache writeback (a8)a9acbranch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
1002430033225206129809251001010018100101664736049269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736149269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736049269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736149269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033224006129809251001010010100101664736049269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736149269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736049269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736049269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736049269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034
1002430033225006129809251001010010100101664736149269533003330033285483287631001010020200203003329611100211091010010100000640316332986410000100103003430034300343003430034

Test 3: Latency 1->3

Code:

  mneg w0, w1, w0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)18191e1f3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
10204300332250000000021529809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000006129809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000008429809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000008229809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003007730207300773003430034
1020430033225000000006129809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000006129809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
10204300332240000000053629809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000006129809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000006129809251010010100101001665186492695330033300332852632874110100102002020030033290111020110099100101001000000000000710116112986310000101003003430034300343003430034
1020430033225000000006129809251010010100101001665186492695330033300332852632874110100102002020030033290211020110099100101001000000000000710116112986310000101003003430034300343003430034

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)03mmu table walk data (08)1e3f4d51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d tlb miss (a1)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
10024300332240061298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640316222986410000100103003430034300343003430034
100243003322501261298092510010100101001016647361492695330033300332854832876310010100202002030033296111002110910100101000010640216222986410000100103003430034300343003430034
10024300332240061298092510010100101001016647361492695330033300332854832878910073100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332250061298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332240061298092510020100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332250061298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332250361298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332250061298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332250066298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034
10024300332240061298092510010100101001016647360492695330033300332854832876310010100202002030033296111002110910100101000000640216222986410000100103003430034300343003430034

Test 4: throughput

Count: 8

Code:

  mneg w0, w8, w9
  mneg w1, w8, w9
  mneg w2, w8, w9
  mneg w3, w8, w9
  mneg w4, w8, w9
  mneg w5, w8, w9
  mneg w6, w8, w9
  mneg w7, w8, w9
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01)cycle (02)030918191e3f51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
80204400523000000124258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000105110316114003280000801004003640036400364003640036
8020440035300000040258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
8020440035300000040258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
8020440035300000063258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001001005110116114003280000801004003640036400364003640036
80204400353000000168258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
8020440035299000040258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
8020440035300000040258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
8020440035300000040258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
80204400353000000145258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114003280000801004003640036400364003640036
8020440035300000061258010080100801004005000493695540035400352997032999380100802001602004003590118020110099100801001000005110116114006580000801004003640036400364003640036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5004

retire uop (01)cycle (02)03191e1f3f51schedule uop (52)schedule int uop (53)dispatch int uop (56)int uops in schedulers (59)6061696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map int uop inputs (7f)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache writeback (a8)accfd0d2d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? int retires (ef)f5f6f7f8fd
800244003731000040258001080010800104000501049369554003540035299853300088001080020160020400359011800211091080010100000502050116114003280000800104003640036400364003640036
800244003529900040258001080010800104000501549369554003540035299853300088001080020160020400359011800211091080010100000502050116114003280000800104003640036400364003640036
800244003530000040258001080010800104000501549369554003540035299853300088001080020160020400359011800211091080010100000502050116114003280000800104003640036400364003640036
800244003530000040258001080010800104000501049369554003540035299853300088005580020160020400359011800211091080010100000502052116114003280000800104003640036400364003640036
800244003529900040258001080010800104000501049369554003540035299853300088001080020160020400359011800211091080010100000502052116114003280000800104003640036400364003640036
800244003530000040258001080010800104000501049369554003540035299853300088001080020160020400359011800211091080010100000502050116114003280000800104003640036400364003640036
800244003530000040258001080010800104000500549369554003540035299853300088001080020160020400359011800211091080010100000502052116114003280034800104003640036400364003640036
800244003530000040258001080010800104000501049369554003540035299853300088001080020160020400359011800211091080010100000502053116114003280000800104003640036400364003640036
800244003530000040258001080010800104000501549369554003540035299853300088001080020160020400359011800211091080010100000502050116114003280000800104003640036400364003640036
800244003530000040258001080010800104000501549369554003540035299853300088001080020160020400359011800211091080010100000502054116114003280000800104003640036400364003640036