Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 2D)

Test 1: uops

Code:

  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 05 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
6400528629214300110001700005264279830442212140004000400020706310169772805128260310400040004000281652810511610011000100014006684007002144010604110014025103987158359226319170350138231440372778014793116631251640002809928327281152815028075
640042813421110000011400100525128008000220154000400040002070841016979280882834231040004000400028189281111161001100010000400008400000064006513110013937104717142336113719092347538181538382774213903117771301040002814127991281862816528382
64004281452111000000027010051872816600022051400040004000207372001697627957281223104000400040002799228267116100110001000040000040030006400040680013710104087367352704619181341638151742412782114301117131270040002808328193282192827328243
6400428092211100000009000052042838324022171400040004000207090001696727886281253104000400040002805128490116100110001000040000040060000400401480014137101887218355304319124351938211339362787315005118571259240002810728258281742829028129
64004281192100000000000100526028178004222244000400040002071160816962279712836631040004000400028070279941161001100010001400000400201074006606110013812104537312354004319359340538161645402807613989117191248540002834428119283702820328145
6400428334211200000008000051602818424422123400040004000207025081695427933283573104000400040002819928093116100110001000040000840060003400451900013945103827335352203719270348838172041432786614116117731274940002829128132282592824728002
64004281802110000000012000052972827300022138400040004000207304001695927932281093104000400040002802928034116100110001000040000840030003400061400014172103377344350703819261343838141839422775513895116691261640002810228156280942810028081
6400428311212100000111301005316282132002209240004000400020741400169732791228088310400040004000280112806111610011000100004000084009012013400000000013962101607324341804519286340238171840412784314803120371273140002817027990280772801528252
64004279912100000000010010052982803200022224400040004000207051101697927915282503104000400040002804327983116100110001000040000840020006400201400014128100597302342403819032349138181732362780814591116221264740002808328412282612809428138
64004281912101000000010000051252799504422192400040004000207073181696528038281483104000400040002804628085116100110001000040000840000101744002000110014236104367211357324119115344638261537382782013770114731335540002801228111282242817428124

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3354

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320205106730799000110044000210669907702532010010032000010032000050046742340106977010672710673226655326689320100200320000200320000106727106707118020110099010010080000800000100320000043032003800085832004061194319205109217111067330135320000100106737106737106737106737106737
32020410682279900000004400011067210000253201001003200001003200005004699887110670601067311067312665532668932010020032000020032000010673110672711802011009901001008000080000010032000000032003800042320039000000051091171110672814100320000100106732106708106708106708106732
320204106879799000000053010110671620119253201001003200001003200005004699887010668201067311067312667532671332010020032000020032000010673110672711802011009901001008000080000010032000004403200390004732000061394400051091171110672814140320000100106732106708106732106708106708
32020410674779900000004501011066922111925320100100320000100320000500469988711066820106731106731266793267133201002003200002003200001067311067271180201100990100100800008000001003200000430320038000116320000610440005109117111067041400320000100106732106728106732106708106728
320204106837799000000044010010671620119253201001003200001003200005004705621110668201067311067322667632668932010020032000020032000010673110672711802011009901001008000080000010032000004303200380102153200390104400051091171110672414140320000100106728106728106732106732106708
320204106873800000110044010110671621200253201001003200001003200005004700708110668301067071067072667932671332010020032000020032000010670710673111802011009901001008000080000010032000004303200380001613200380039440005109117111067300140320000100106732106732106708106732106732
320204106843799000000044000110669220019253201001003200001003200005004699887010670601067071067312665532671332010020032000020032000010670710672711802011009901001008000080000010032000004403200000001613200380039440005109117111067281407320000100106708106708106708106728106728
320204106759800000000000100106716211025320100100320000100320000500467408501067060106819106707266793266893201002003200002003200001067071067271180201100990100100800008000001003200000430320038060051320038600440005109117111067281407320000100106732106732106708106708106732
32020410686180000000000010010671220002532010010032000010032000050047056210106706010673410670726679326714320100200320000200320000106731106707118020110099010010080000800000100320000043032003900020032003801404400051101171110670414144320000100106735106728106732106708106712
320204106731799000110044000010671621119253201001003200001003200005004709628110670601067271067312667932671332010020032000020032000010672710670711802011009901001008000080000010032000004303200380001593200006038440005109117111067281477320000100106732106708106740106732106732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3341

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320025106737799000102510110670117003253200101032000010320000504646912000106706106731106731266633267133200102032000020320000106731106716118002110910108000080000010320000058320072150253200556125430501911701110672866332000010106717106732106717106732106789
320024106731800000002510110671617001725320010103200001032000050473369101010669010671610671526664326713320010203200002032000010671510673111800211091010800008000001032000005832007224269320054615400501911701110671306032000010106716106732106732106717106732
320024106715799001006010010671625153017253200101032000010320000504665335000106706106715106731266803267133200102032000020320000106731106731118002110910108000080000010320000058320072220563200540054430501911701110671360032000010106717106732106732106732106758
320024106722799111006000010671625150325320010103200001032000050468020200010669010673110673126664326697320010203200002032000010673110671511800211091010800008000001032000005832007233225320025615400501911701110672806332000010106717106736106736106724107327
320024106731799000006010110671617003253200101032000010320000504734699000106706106715106731266793267133200102032000020320000106731106731118002110910108000080000010320000058320041350543200250155430501911701110671260032000010106717106716106732106732106834
3200241067317990000060000106716250301725320010103200001032000050471967800010670610673110671626667326697320010203200002032000010673110671511800211091010800008000001032000000320070154025320054012500501911701110671366032000010106716106716106716106732106897
32002410672379910000600011067161715301825320010103201301032017850471987700010687010682510671526664326732320010203200002032000010673110673111800211091010800008000001032000000320072220573200250124430501911701110672860332000010106736106732106717106720106883
3200241067227991000072100106716251530212532001010320000103200005047295050001067101067201067192667932671732001020320000203200001067311067161180021109101080000800000103202640583200728501650320184002500501911701110671600332000010106716106720106721106732106942
320024106735800000007210110672025153019253200101032000010320000504665534000106706106735106735266643267173200102032000020320000106731106731118002110910108000080000010320000058320057302875320025012501501911701110672860332000010106716106716106732106716106757
320024106731799000006010010670125150325320010103200001032000050473369100010670810673110671526679326713320010203200002032000010671610673111800211091010800008000011032000000320041051563200256154430501911701110671260032000010106716106732106717106732106757