Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, post-index, 1 reg, 8B)

Test 1: uops

Code:

  ld1 { v0.8b }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6100528739222201100004010479928292011236262000100010001000100050005000100015950281892849931020001000200028478285621161001100010000100002100100110000100013165965970243200148210363241380374245281161000156211285914329100010002854728675286852862328640
61004286802221000010040104812283260112365620001000100010001000500550001300159302810828538310200010002000285832855511610011000100001000031001201100021000132389563699831940392092131963802114445282431000153111270814195100010002863128603285932861128572
61004288202221000000040104887281880012361120001000100010001000500050001200159652810328675310200010002000283972852111610011000100001000001001101100101300131809423705532660452102732533805154544280301000152171278314220100010002860928644285792862028653
6100428582221100000004010480328359011235682000100010001000100050005002130015958281392860131020001000200028481284741161001100010000100002100100110012130130131859655695132130462091931493803114044281101000149331284414249100010002857928675285102843928565
61004286662211000000000104678283410102351620001000100010001000500050021500159382819228716310200010002000284782845411610011000100001000001000140110012030013375933969523160045211083183380484447281421000152911258914011100010002859728573287722867728527
61004286172221000001030004832282460002361620001000100010001000500050001400159252814728490310200010002000286412846511610011000100001000001000000100021200133019572701532250542098631553805124140279871000154761326513972100010002855828485287242869328633
6100428658221200000004000484828248010235782000100010001000100050005000140015959281552861331020001000200028437285141161001100010000100002100000401100000000132239607705832130472097732033814124552281121000154871307814345100010002862028582287862862728638
610042851122210000000088104792283481002374820001000100010001000500050001210159272823328555310200010002000284942870811610011000100001000021000000100020000133489725699931941402099332563809114248281201000153311246914568100010002864128614283512851428714
6100428682221100000003000479528274000235932000100010001000100050005000800159742818528595310200010002000285742855111610011000100001000021000000100020000134199607692831460472096932323809164140281141000155051279213999100010002860628713286662869728638
61004286802221000000030004846282440002361620001000100010001000500050001000159762821028635310200010002000286042848211610011000100001000021000101100020000132919440696931950472093931683806144543280891000152391299914371100010002871128582285942863028608

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.8b }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005193100100001133010001200391196922570116501021000110002401001004010000106367445381314584473012010412005412014111330531137416032130200100001003960200200801000012005412013811502011009910040100100001000011001000101100000001000000102000323617611119763500081310121000050100120055120147120055120055120055
502041201429300000000113010001200391196712570103501021000110000401001000010000106214445381314585374012003012003512005411330531137166010030200100001000060200200001000012005112005311502011009910040100100001000001001000001100000031000010100000321017611119744500021013121000050100120055120055120055120055120055
50204120051930000000001010001200391197572570103501021000110000401001000010000106214445381694585101012003012005412013911330531137056010030200100001004060200200001004212005412005121502011009910040100100001000001001000071100000001000210100000323417611119889500021310121000050100120056120055120058120063127920
502041200629651112000112700010012003811975725701035010210001100004010010000100001062144453817045851010120031120055120051113307311370260100302001000010000602002000010000120056120052115020110099100401001000010000010010000011000010310000101000023255110011119840500139651000050100120051120052120051120228120051
502041200519700000100022000101200361197574870103501101000110000403861000010000106212645381314585139112003012014212005611333331137056010030200100001004160200200001004112005112005111502011009910040100100001000001001000000100001027981000110102000321017611119760500131310121000050100120055120055120055120144120231
5020412005497300000010130000112004111976125701065011610002100024011010007100071061908453852145888690120017120056120233113371161137956012430344100491000860248200161004812015412005311502011009910040100100001000001001000121100010327721000011111111324401600119823500049681000050100120060120155120042120146120059
5020412006097310010000138800101200371197586970103501021000110000405301000010039106210845380964584945012002612005012005011330331137296032230200100411004160200200801000012014212013211502011009910040100100001000001001000000100031031000010100000321027611119838500029681000050100120146120051120051120051120146
502041200509750100000113176000012010611975349701195012410001100004024410000100001064100453797945891290120100120051120234113301181136846054930200100811000060688200001000012005212014111502011009910040100100001000001001000001100000031000010100000321017611119759500029981000050100120051120036120054120048120051
502041200549640000000019000001200351197532570103501021000110000401001000010000106210845374124584945012002612005012005011328631136986010030200100001000060200200001000012005012005011502011009910040100100001000001001000001100001031000010100000321017610119759500029681000050100120051120051120052120054120051
502041200519640000000016000001200361197502570103501021000110000401001000010000106210845380934584945012002612005012004711328631136836010030200100001000060200200001000012005012004711502011009910040100100001000001001000001100000001000010100000321017611119759500029681000050100120052120052120051120052120048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002512005196400000010004000120036119734257001350012100011000040010100001000010629494540226458606301200301200541200541133113113718600103002010000100006002020000100001200541200511150021109104001010000100000101000001100001001000011003141048254119770500021311121000050010120055120053120055120055120055
5002412005496400000000001000120039119731257001350012100011000040010100001000010626704540695458554601200321200541200551133283113718600103002010000100006002020000100001200571200521150021109104001010000100000101000001100002031000011003141058277119773500021313121000050010120055120052120055120055120054
50024120054964001000000013000120040119734257001350012100011000040010100001000010621754541562458633601200301200541200541133293113715600103002010000100006002020000100001200911200511150021109104001010000100000101000001100000001000011003141068277119779500041310121000050010120061120055120055120053120055
5002412005496400000000004000120039119734257001350012100011000040010100001000010621484542305458617901200301200541200951133283113718600103002010000100006002020000100001200511200541150021109104001010000100000101000601100000031000011003141068255119774500021313121000050010120154120061120055120055120055
5002412005496400000100004000120039119714257001350012100011000040010100001000010664524539062458606301200301200541200521133281211370160010300201000010000600202000010000120035120051115002110910400101000010000010100000110000000100001100314105826611977350002010101000050010120061120144120055120055120055
500241200579640000001000100012003911973125700135001010001100004001010000100001062742453967645862170120031120055120054113328311376660010300201000010000600202000010000120054120051115002110910400101000010000010100000110000103100001100314104826411985050002131401000050010120056120142120055120055120037
500241200519650000010000400012003611973425700135001210001100004001010000100001063030453988445861800120030120054120051113328111137206067530020100001000060020200001000012005412005111500211091040010100001000001010000011000000310000110031410682551197735000210091000050010120055120054120055120055120055
5002412005196500000000001300012003911973425700135001210001100004001010000100001062796453910345861020120030120054120054113328311371860010300201000010000600202000010000120054120054115002110910400101000010000010100000110000003100001100314106825611977350000013121000050010120036120055120055120036120059
500241200549640000000000900012003911973425700135001210001100004001010000100001062877454095745855840120030120054120054113328311371660010300201000010000600202000010000120051120051115002110910400101000010000010100000110000000100001100314106825511975450000131391000050010120055120036120037120052120052
500241200519640000000000000012003911973425700135001210001100004001010000100001062585454067845856590120027120054120054113328311371860010300201000010000600202000010000120054120054115002110910400101000010000010100000110000006100001100314106828511977050000131001000050010120055120055120055120055120055

Test 3: throughput

Count: 8

Code:

  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  ld1 { v0.8b }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020580040621000000220008002516672516017680100800008010080000417967937588240800158004080040699243699971601002008000020016000080090800401180201100991001008000080000110080000019800170016800006115213051111172080037180000131080000801008004180041800418004180041
80204800406200000002700080025166725160100801008000080100800004179671375882418001580040800406992436999716010020080000200160000800408004011802011009910010080000800001100800000198001600080017600210051102171180037080000101380000801008004180041800418004180041
8020480040620000000230008002506662516010080100800008010080000417967937588230800158004080040699243699971601002008000020016000080040800401180201100991001008000080000110080000019800170016800006014210051102172180037080000151380000801008004180041800418004180041
80204800406200000002300080025166725160316807488067981797800004179695375882208015280211801506994211700941603972008000020016000080040800401180201100991001008000080000010080000021800170016800176115210051101172180037180000131380000801008004180041800418004180092
8020480040621000000200108002516602516010080100800008010080000417967137588240800158004080040699243699971601002008000020016000080040800401180201100991001008000080000010080000021800171017800166114210051101171280037080000131380000801008004180041800418004180041
80204800406210011003500080025166112516010080100800008010080000417967137588220800158004080040699243699971601002008000020016000080040800401180201100991001008000080000010080000019800000024800176115210051102171280037180000131380000801008004180041800418004180041
8020480040620000000230008002516672516010080100800008010080000417967937588230800158004080040699243699971601002008000020016000080040800401180201100991001008000080000110080000021800170016800166117210051102252180037180000141080000801008004180041800418004180041
802048004062000000023000800251667251601008010080000801008000041796953758823080015800408004069924369997160100200800002001600008004080040218020110099100100800008000011008000002180017000800176115210051101171180037180000131380000801008004180041800418004180041
8020480040649000000230008002516682516010080100800008010080000417967137598770800158004080040699243699971601002008000020016000080040800401180201100991001008000080000110080024019800160017800166114210051252172180037180000131080000801008004180041800418004180041
8020480040620000010238800801261661394516021480135800258017880069417895837598830800958009080140700146270519160389202801602001603208014280091218020110099100100800008000001008002301980038001472800606114212051411331280194180460111380000801008009080142800918014380092

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch call indir mispred nonspec (ca) | branch mispred nonspec (cb) | cf | d0 | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002580040621100000000018000008002516642516001080010800008001080000417788537588178005580040800406994637002016001020800002016000080040800401180021109010108000080000010800000080000000108004160101700005020005160338003718000066080000800108004180041800418004180092
800248004062010000011103101000800250664251600108001080000800108000041785733758796800158004080040699463700201600102080000201600008004080090118002110901010800008000001080000016800090000800190101700005020004160438003708000060080000800108009280041800418004180041
80024800406210000001100000000800251061222516001080010800008001080000417787737631108001580040800406998237002016001020800002016016080170800401180021109010108000080000110800000148001000008000960121700005020004160538003718000096080000800108004180041800418004180041
800248004062000000011001688000080025160525160010800108000080010800004177717375877680015800408004069946370049160010208000020160000800408004011800211090101080000800001108000000800120000800226113000005020006160368007708000096080000800108004180041800418009280041
8002480040620000000000019010008002516652516001080010800008001080000417863737587618001580040800406994637002016001020800002016000080040800401180021109010108000080000010800000080013000108000060131700005020005160538003718000096080000800108004180041800418004180041
80024800406200000000100190100080025106525160010800108000080010800004177701375879980015800408004069946370020160010208000020160000800408004011800211090101080000800000108000001480010000730800106091700005020003160358003708000066080000800108004180041800418004180041
80024800406200000001100190000080025166025160010800108000080010800004175871375878880015800408009369946370020160010208000020160000800408004011800211090101080000800000108000001480009000108001000101400005020004160448003708000006080000800108004180041800418004180041
8002480040620000000001000100080025060025160010800108000080010800004178597375875680015800408004069946370052160010208000020160160800408004011800211090101080000800000108000001480010000108001001121400005020003160448003708000066080000800108004180041800418004180041
8002480040620000000110019001008002516610251600108001080025800108000041777093758798800158009280040699463700201600102080000201600008004080040118002110901010800008000001080000008000901015800106101700005020003160458003718000090080000800108004180041800418004180041
8002480040620100000000040001008002516662516001080010800008001080000417584737587668001580040800406994637002016001020800002016000080040800401180021109010108000080000010800000080013000108000000101900005020004250338003708000099080000800108004180041800418004180041