Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3R (16B)

Test 1: uops

Code:

  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.012

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 3.012

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
64005292782191181130101001045232884400117013401230121000300010005000357454228910290942928231040001000300010003000292112919511610011000100001002331002012110003020012922911668353162548202793101381814494828430163321379114889100030002927129285292752932629258
6400429209219014011000401045382875911017133400930121000300010005000357087228480291352919031040001000300010003000291922919111610011000100001002121003012610002131113009926068633073545202803061382017414428327163101394514976100030002921729307292642938529312
640042923722018113100301046242873610017142400930061000300010005000357732228470290762925931040001000300010003000291872910111610011000100001000031000000310022101112916915069033109643202813090382616434328392162021407314734100030002933029192292382922529269
6400429388218116116100401045192877300017017401230091000300010005000357498228580291042926731040001000300010003000292062921211610011000100011000001000000110002120012878939568433058440202933072381613373828437163091392814800100030002917929273292572936629299
6400429159219015013000100045822879300017139400330121000300010005000357496228380290612918131040001000300010003000292142921311610011000100001002221001001110003020013024917368593019844203613114381719383628372163581395815050100030002928429322292802926329307
6400429287219012015000101047022886600017046400330121000300010005000356341228170290792925331040001000300010003000291762923311610011000100001003321002001110003131012825912868163055842201933090382815424228356161911400015109100030002931929287292092932929283
640042922322011518100301045442870000017090400330151000300010005000357767228250291312926031040001000300010003000291202917911610011000100001003131004000310003131113253927068583094531202293103381613404428297162881409015058100030002927029273292982925729309
6400429300220115017000401045772883200017009401230091000300010005000356443228570291412925231040001000300010003000292182919411610011000100001004331003001310003121112982925168623070438202823092382318354128287163451385015090100030002927129267292902933429315
6400429178218116110100300046162879400017016400330031000300010005000357680229290290622927231040001000300010003000291322911811610011000100011003331003000410002131212701912468133044844203593138381214414228307166151379515129100030002923529290292622927229248
6400429227219111113000500046102872100016984400330121000300010005000357606229220291072922531040001000300010003000291202908011610011000100001002231001011410002131112761925768263016542202943093382217373328354163411421815222100030002924429345292942928129288

Test 2: throughput

Count: 8

Code:

  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  ld3r { v0.16b, v1.16b, v2.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320205800676001110035100800261600025320120100240018800001002400008000050040004628839811800228004180041323320100200800002400002008000024000080041800411180201100991001008000080000010080006728800290030800006129070510911711800381013800002400001008004280042800428004280042
32020480041599101006000800260600025320120100240020800001002400008000050040004628839811800228004180041323320100200800002400002008003324000080041800411180201100991001008000080000110080008828800320129800246170705109117118003801313800002400001008004280042800428004280042
320204800416001010061008002600600253201201002400868000010024000080000500400151288022008002280041800413233201002008000024000020080000240000800418004111802011009910010080000800000100800088278003100298002460728705109117118003811313800002400001008004280042800428004280042
320204800416001100136000800261600025320186100240086800001002400008000050040000128839810800228004180041323320100200800002400002008000024000080041800411180201100991001008000080000010080007608002900318002361728605109117118003801313800002400001008004280042800428004280042
320204800415991000037101800261660025320120100240018800001002400008000050040005428839810800228004180041323320100200800002400002008000024000080041800411180201100991001008000080000110080008827800311131800006129071510911711800381013800002400001008004280042800428004280042
3202048004159911001370008002616600253201201002400798000010024000080000500400000288022008002280041800413233201002008000024000020080000240000800418004111802011009910010080000800001100800077288000700318000060292771510911711800380013800002400001008004280042800428004280042
320204800416001010071008002616600253201831002400868000010024000080000500400046288022008002280041800413233201002008000024000020080000240000800418004111802011009910010080000800000100800077278004800348002361310605109117118003811313800002400001008004280042800428004280042
32020480041600111003510080026160002532018310024008380000100240000800005004000462883981180022800418004132332010020080000240000200800002400008004180041118020110099100100800008000001008000772880031007800230162771510911711800380130800002400001008004280042800428004280042
3202048004159911000360018002616600253201831002400208000010024000080000500400046288398108002280041800413233201002008000024000020080000240000800418004111802011009910010080000800000100800079278003259068000061627715109117118003801313800002400001008004280042800428004280042
320204800415991110070008002616600253201791002400798000010024000080000500400011288403308002280041800413233201002008000024000020080000240000800418004121802011009910010080000800000100800088278003101298002461300715109117118003801313800002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320025800545990010000017010008002616600253200821024001880000102400008000050400038288288701800228004180041032332001020800002400002080000240000800418004111800211091010800008000011080000015800100011800006001500050190417000428003810980000240000108004280042800428004280042
3200248004159910011000320100080079066002532001010240000800001024000080000504000092880961018002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800077248002600268000060262372050190417000428003806680000240000108004280042800428004280042
3200248004159900000000170100080026106002532008210240075800001024000080000504000382882839018002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800000158000000118001160111500050190417000428003806680000240000108004280042800428004280042
3200248004159900000000160100080026160002532005210240042800001024000080000504000072880000018002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800000158000000148001100111500050190417000248003816080000240000108004280042800428004280042
320024800416000000000017010008002616000253200521024004280000102400008000050400000288096101800228004180041032332001020800002400002080000240000800418004111800211091010800008000001080000015800110010800110011000050190417000428003809980000240000108009380042800428004280042
320024800416001101000070000180026006002532005210240042800001024000080000504000082880964018002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800078248002701268000061262471050190417000428003806680000240000108004280042800428004280042
320024800415990000000017010008002616000253200811024007180000102400008000050400033288283901800228004180041032332001020800002400002080000240000800418004111800211091010800008000001080000015800000011800106011000050190217000428003816680000240000108004280042800428004280042
32002480041600000000001701000800261660025320052102400998000010240000800325040017328821920080022800418004103233200102080000240000208000024000080093800411180021109101080000800000108000001580014000800106010000050190417000428003800680000240000108004280042800428004280042
3200248004159900000000160100080026106002532007710240067800001024000080000504000392882774018002280041800410323320010208000024000020800002400008004180041118002110910108000080000010800076248000601268002001252462050190217000348003816680000240000108004280042800428004280042
3200248004159900000000001000800261660025320082102400678000010240000800005040003828827740180022800418004103233200102080000240000208000024000080041800411180021109101080000800000108000000800120010800006001500050190317000248003806680000240000108004280042800428004280042