Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3R (4S)

Test 1: uops

Code:

  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.006

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 3.006

retire (01) | cycle (02) | 03 | 04 | 07 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
640052832221332112010504927858111158394006300610003000100050023569042289128319280893104000100030001000300028434283661161001100010000100031001001000212113474104367166346705819182335838159454128037136211292213111100030002811028073283402815628079
640042809021300109010509428052001165354000300010003000100050003571002282928334286543104000100030001000300028205284081161001100010001100001000151001122013070941170393214039196663298381613384527967156441328314154100030002862328733286172873828775
640042876522300002000525727920000162844006300910003000100050003569452287928329281083494000100030001000300329238290731161001100010000100021000011000202012893918368883053038202013088381611443828309164141400015068100030002924029228292902925229274
64004292842200000300145672875300017076400630091000300010005000357136228362914229143310400010003000100030002909129226116100110001000010002100000100020001286792376836310404120294308738157434228303164611372415069100030002926829234292342921829265
64004292642192000300045722876500017091400630061000300010005000356814228192911629256310400010003000100030002915629119116100110001000010002100000100020001274991056833310604220294306638156444828271164041389415033100030002924029192291782918129283
64004292562180000300045242880600117159400630061000300010005000356792228452908429262310400010003000100030002910729147116100110001000010000100000100000201295491116852307103920197300638146454528351163791380914840100030002924529192291852922229290
64004292122190000000145672879101017024400630061000300010005000356955228342902329279310400010003000100030002910629169116100110001000010002100000100120301293991336911308304220153306638169404328345164421383614989100030002917929253291772917129233
64004291622191000600046272880400016958400630061000300010005000357406227602904929178310400010003000100030002907429084116100110001000010002100001100000201276894006891315404620170310038148434228339165371380114947100030002930429232292282918829284
64004291692180000300145702888401017063400630061000300010005000357175228432907329224310400010003000100030002910029156116100110001000010002100000100020201299491816922303704620253310838099434628344160761383815029100030002913829167293412931529303
64004291982190001000145202881700016996400630091000300010005000356175228052909129291310400010003000100030002914729093116100110001000010000100000100000201282191296875306604620230304138127364328383163081390415029100030002928729346293152918529215

Test 2: throughput

Count: 8

Code:

  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  ld3r { v0.4s, v1.4s, v2.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320205800675990010000010080026160002532015410024000080000100240000800005004000092881693128002208004180041323320100200800002400002008000024000080041800411180201100991001008000080000110080000018800130014800136114180051092117118003801010800002400001008004280042800428004280042
320204800415991001000361018002616000253201631002400008000010024000080000500400018288000012800220800418004132332010020080000240000200800002400008004180041118020110099100100800008000011008000001880018001480018611418005109211711800381010800002400001008004280042800428004280042
32020480041600000000019100800260600025320154100240054800001002400008000050040000028816931080022080041800413233201002008000024000020080000240000800418004111802011009910010080000800001100800076288000702780024613028705129211711800381013800002400001008004280042800428009480042
32020480041599100000036100800261600025320183100240086800001002400008000050040004628802201280022080041800413233201002008000024000020080000240000800418004111802011009910010080000800001100800087278003101308000001310705109211711800381013800002400001008004280042800428004280042
32020480041599110000037001800260660025320179100240079800001002400008000050040005528839811280022080041800413233201002008000024000020080000240000800418004111802011009910010080000800001100800076288003100308000061706051092117118003811313800002400001008004280042800428004280042
320204800415991000000370018002616000253201861002400188000010024000080000500400054288398112800220800418004132332010020080000240000200800002400008004180041118020110099100100800008000011008000772880029003880023013027715109211711800380130800002400001008004280042800428004280042
3202048004159910010003610180026160002532011810024008380000100240000800005004000462883981128002238004180041723320100200800002400002008000024000080041800411180201100991001008000080000110080008628800091019800246129277051092117118003801313800002400001008004280042800428004280042
32020480041600100000061018002600600253201791002400208000010024000080000500400048288391512800220800418004132332010020080000240000200800002400008004180041118020110099100100800008000011008000972880031023180023613128715109211711800380130800002400001008004280042800428004280042
32020480041599110000036101800260660025320120100240086800001002400008000050040004628839811280022080041800413233201002008000024000020080000240000800418004111802011009910010080000800001100800087278000701680000013027715109211711800380130800002400001008004280042800428004280042
32020480041600110000037001800261600025320183100240086800001002400008000050040004728839811280022080041800413233201002008000024000020080000240000800418004111802011009910010080000800000100800076080007013080024603128615109211711800380130800002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3200258005460000000007010008002616000253200891024002080000102400008000050400003288398108002280041800410323320010208000024000020800002400008004180041118002110910108000080000110800000188001800178000060000005019317328003810080000240000108004280042800428004280042
32002480041600000000054501000800261660025320010102400008000010240000800005040000028800001800228004180041032332001020800002400002080000240000800418004111800211091010800008000011080000008001810188000061132200050193174480038001080000240000108004280042800428004280042
32002480041600000000020010008002616000253200101024006380000102400008000050400022288234108002280041800410370320010208000024000020800002400008004180041118002110910108000080000010800000080014001880017601318000501941733800380131380000240000108004280042800428004280042
320024800415990000000240000080026060002532007310240054800001024000080000504000222880000080022800418004103233200102080000240000208000024000080041800411180021109101080000800001108000001880000003380000010000050194174380038013080000240000108004280042800428004280042
32002480041600000000024010008002616600253200301024008680000102400008000050400054288400518002280041800410323320010208000024000020800002400008004180041118002110910108000080000110800000080000000800136013000150194173480038101380000240000108004280042800428004280042
320024800416000000000200100080026166002532006410240063800001024000080000504000292881693180022800418004103233200102080000240000208000024000080041800411180021109101080000800001108000002280018001880018610000050194174480038113080000240000108004280042800428004280042
320024800415990000000570000080026100004432007310240063800501024000080000504000582881757180022800418004103473200102080000240000208000024000080041800411180021109101080000800001108000001880014403080018611422000501951735800380131380000240000108004280042800428004280042
32002480041600000000024010008002616000253200731024000080000102400008000050400021288001518002280041800410323320010208000024000020800002400008004180041118002110910108000080000110800000188000010178001701018000501931743800380131380000240000108004280042800428004280042
32002480041600000000024000008002606600253200731024006380000102400008000050400022288000018002280041800410323320010208000024000020800002400008004180041118002110910108000080000110800000188000000178000060142200050192173280038113080000240000108004280042800428004280042
3200248004160011000003501000800261060025320028102400838000010240000800005040000328802201800228004180041032332001020800002400002080000240000800418004111800211091010800008000011080000008000000138001800142200050194173480038113080000240000108004280042800428004280042