Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3R (4H)

Test 1: uops

Code:

  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.012

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 3.012

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 19 | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
64005287012151171160000400537027959001601440093009100030001000500035727322850284952813131040001000300010003000284252804911610011000100001003321002021100121311132911046872723501760195683399381313443627802151601251712707100030002806828566281162827128105
64004281532151150141000400524727963001578340123012100030001000500035722022879280042806331040001000300010003000283852856911610011000100001003121002011100121310141721057673303541937197543370382216424227822139501295913729100030002810828088279812853228322
64004285342141131140000400534527945111587740093009100030001000500035760522849284912812931040001000300010003000280252795511610011000100001003321004022100121310134831042070313436840192973420382216384427774147131209012832100030002804828035280242841828527
64004281382111141151000310510227872101596240123012100030001000500035751622909284422851031040001000300010003000280022808311610011000100001002131003011100122310141431008273363508638193273456381813434327817138051211312949100030002848028051280702845128124
64004283542141100150000400521727977111598340123012100030001000500035757222850286282866431040001000300010003000281142819511610011000100001003331004002100122312142481057472133550837193463430382214503927981138611223712545100030002803228007282822861628255
6400428241211170101000400489527893001582240123012100030001000500035757922894280012828431040001000300010003000283672805211610011000100001002121003021100021312135041033572913499735192143483381212444627871141411213112841100030002855528583285532812028020
64004282502101141150110400527228373101650740093012100030001000500035735422860284892812831040001000300010003000284462854221610011000100001001121002002100122210139641053272873417938189923302381810454928119141201218113798100030002868228220286312820628050
64004283952111141121000410520227944001600240093012100030001000500035732622840280052855331040001000300010003000284672838411610011000100001002231001011100122211140481044872813474635191693369382015413927808140551229112890100030002815028148285882809028193
64004280482131121121000410512027938011598140093012100030001000500035745822853280052806231040001000300010003000282512812611610011000100001003221002002100122312143211041573143408437192543470382214404127933138781214012937100030002806928515280152841328674
640042817421411117011041053122784200163184009300910003000100050003575292287628047281263104000100030001000300028217279991161001100010001100322100301110002121214035103947304347284519715334538189423627783139161225012690100030002821328479281222847528097

Test 2: throughput

Count: 8

Code:

  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  ld3r { v0.4h, v1.4h, v2.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020580067600010023008002606625320163100240054800001002400008000050040052228817981800228004180041323320100200800002400002008000024000080041800411180201100991001008000080000010080000188001401480018610005109117118003801310800002400001008004280042800428004280042
32020480041599001020008002610625320163100240054800001002400008000050040083628818811800228004180041323320100200800002400002008000024000080041800411180201100991001008000080000010080000080018018800006014180510911711800381010800002400001008004280042800428004280042
320204800415990000201080026166253201631002400638000010024000080000500400727288349418002280041800413233201002008000024000020080000240000800418004111802011009910010080000800000100800001880018013800000113005109117118003801310800002400001008009380042800428004280042
3202048004160000000108002606625320154100240063800001002400008000050040085928836791800228004180041323320100200800002400002008000024000080041800411180201100991001008000080000010080000188001711480000000220510911711800380013800002400001008004280042800428004280042
3202048004159900102300800261602532010010024006380000100240000800005004006992884025180022800418004132332010020080000240000200800002400008004180041118020110099100100800008000001008000018800000148000001182205109117118003811313800002400001008004280042800428004280042
3202048004160000002310800261602532015410024006380000100240000800325004003012884008180022800418004132332010020080000240000200800002400008004180041118020110099100100800008000001008000018800170138001401172205109117118003801313800002400001008004280042800428004280042
320204800416000000000800261062532016310024006380000100240000800005004008722880252180022800418004132332010020080000240000200800002400008004180041118020110099100100800008000001008000022800170188000060182205109117118003811313800002400001008004280042800428004280042
320204800416000011240080026106253201631002400548000010024000080000500400797288396518002280041800413233201002008000024000020080000240000800418004111802011009910010080000800000100800001880017014800136117220510911711800381100800002400001008004280042800428004280042
3202048004160000000008002616025320154100240000800001002400008000050040086428837601800228004180041337320100200800002400002008000024000080041800411180201100991001008000080000010080000188001801880018600220510911711800380013800002400001008004280042800428004280042
3202048004160000102310800261662532010010024006380000100240000800005004006612881871180022800418004132332010020080000240000200800002400008004180041118020110099100100800008000001008000018800000148000060142205109117118003811310800002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32002580054600000000003601001800260660046320107102404088010010240300800005040001128816920800228004180041032332001020800002400002080000240000800418004111800211091010800008000001080000027800140018800176172860005019012171215800381131380000240000108004280042800428004280042
32002480041599000010002000000800260060025320073102400638000010240000800005040002228823410800228004180041032332001020800002400002080000240000800418004111800211091010800008000001080000027800140018800006113180000501901417131380038101080000240000108004280042800428004280042
3200248004159900000000240100080026106006632007310240063800001024000080000504000242882341080022800418004103233200102080000240075208000024000080041800411180021109101080000800000108000002880014002180014611700000501901517171580038101080000240000108004280042800428004280042
3200248004159900000000240000080026066002532006410240000800001024000080000504000112881693080022800418004103233200102080000240000208000024000080041800411180021109101080000800000108000000800000008001761132200005019015171310800380131380000240000108004280042800428004280042
32002480041600000000002401000800260060025320073102400008000010240000800325040002228816930800228004180041032332001020800002400002080000240000800418004111800211091010800008000001080000028800130020800136117220000501901017141480038013080000240000108004280042800428004280042
320024800415990000000000100080026106002532006410240054800001024000080000504000182881693180022800418004103233200102080000240000208000024000080041800411180021109101080000800000108000002880018001880018610000005019010171414800380131080000240000108004280042800428004280042
32002480041599000000000000008002610600253200641024006280000102400008000050400000288000008002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800000288001400138001861140000050190131714168003810080000240000108004280095800428004280042
320024800416000000000020000008002606600107320063102400008000010240000800005040003828800090800228004180041032332001020800002400002080000240000800418004111800211091010800008000001080000028800850017800006017000005019012171415800380131080000240000108004280042800428004280042
320024800415990000000024000008002616600253200731024006380000102400008000050400024288234108002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800000288001700080017601300000501961617151580038013080000240000108004280042800428004280042
320024800415990000000024000008002616000253200721024005480000102400758000050404568288991008002280041800410323320010208000024000020800002400008004180041118002110910108000080000110800000278001400188001400132200005019013171215800381131380000240000108004280042800428004280042