Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SUB (immediate, 64-bit)

Test 1: uops

Code:

  sub x0, x0, #3
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410357006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
100410357006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036
1004103580126186225100010001000169160103510357283868100010001000103541111001100000094141119371000100010361036103610361036
100410358006186225100010001000169160103510357283868100010001000103541111001100000073141119371000100010361036103610361036

Test 2: Latency 1->2

Code:

  sub x0, x0, #3
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575426198772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
102041003575023298772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
102041003575246198772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664149695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664149695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664049695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036
10204100357506198772510100101001010088664149695510035100358580387221010010200102001003541111020110099100101001000071013711994110000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100357600156198632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
1002410035760066198632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357500025198632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
1002410035750036198632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010200064024122994010000100101003610036100361003610036
1002410035760036198632510010100101001088784496955100351003586023874010318100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
1002410081750006198632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357500910398632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010000064024722994010000100101003610036100361003610036
10024100357500061986325100101001010010887844969551003510035860211874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
1002410035750006198632510010100101001088784496955100351003586023874010010100201002010082411110021109101001010000064024122994010000100101003610036100361003610036
1002410035750006198632510010100101001088784496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036

Test 3: throughput

Count: 8

Code:

  sub x0, x8, #3
  sub x1, x8, #3
  sub x2, x8, #3
  sub x3, x8, #3
  sub x4, x8, #3
  sub x5, x8, #3
  sub x6, x8, #3
  sub x7, x8, #3
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.1674

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802041341710100028278013680136801484007100491031013390133903326633368014880264802641339039118020110099100801001000101115119016001338780036801001339113391133911339113391
802041339010110028278013680136801484007100491031013390133903326633368014880264802641339039118020110099100801001000001115119016001338780036801001339113391133911339113391
802041339010000028278013680136801484007100491031013390133903326633368014880264802641339039118020110099100801001000001115119016001338780036801001339113391133911339113391
802041339010000028278013680136801484007100491031013390133903326633368014880264802641339039118020110099100801001000001115119016001338780036801001339113391133911339113391
8020413390100000112278013680136801484007101491031013390133903326633368014880264802641339039118020110099100801001000001115119016001338780036801001339113391133911339113391
802041339010000028278013680136801484007101491031013390133903326633368014880264802641339039118020110099100801001000101115119016001338780036801001339113391133911339113391
8020413390101000282780136801368014840071004910310133901339033266333680148802648026413390391180201100991008010010002501115119016001338780036801001339113391133911339113391
802041339010000028278013680136801484007101491031013390133903326633368014880264802641339039118020110099100801001000001115119016001338780036801001339113391133911339113391
8020413390100012028278013680136801484007101491031013390133903326633368014880264802641339039118020110099100801001000001115119016001338780036801001339113391133911339113391
80204133901010270503278013680136801484007100491031013390133903326633368014880264802641339088118020110099100801001000101115119016001338780036801001339113391133911339113391

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.1671

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024135251000000101500041425801368013780010400050000491035113371134373331333488014680149800201343139118002110910800101000000305020004190341336880128800101337213372134971343413431
8002413494100100110132005625800108013780010400686000491029113371134323330333688026880288800201349239118002110910800101020310325020003760341336880000800101344113493134331337213372
80024133711000101112790098825802708001080010400688000491029113371134303330333488013980286800201349439118002110910800101040210305020004190341336880131800101337213433133721337213372
8002413371100000021279005658800108001080010401374000491035313497134913330333868013980157800201337139118002110910800101000010305020004190431336880000800101343313372134351349413562
800241337110000001402640593258001080397802764000500004910291135631344733311233678014080020800201356639218002110910800101000300005020003440341346880131800101343013372133721343913372
80024133711000000310003525800108001080010400687110491029113371133713331333488001080157800201337139118002110910800101000012005020004430441347280000800101361013496134321343413372
80024135641010000110003589801398001080010400050000491029113371133713330333488001080020800201337139118002110910800101000010345020004190441336880000800101337213372133721337213372
80024133711000110121410135258001080010800104006860004910291136201343333307334880010800208015713371391180021109108001010040003050200041310351336880254800101337213372133721337213626
80024134931000000010004755780010800108001040068500049104141337113371333073348800108028680150133713911800211091080010102000248805020004190341336880000800101337213372133721337213372
80024133711000000000003525800108001080010400050000491029113371133713330333488001080020800201337139118002110910800101000010005020004190341336880000800101337213372133721337213372