Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 2D)

Test 1: uops

Code:

  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
6400528629214300110001700005264279830442212140004000400020706310169772805128260310400040004000281652810511610011000100014006684007002144010604110014025103987158359226319170350138231440372778014793116631251640002809928327281152815028075
640042813421110000011400100525128008000220154000400040002070841016979280882834231040004000400028189281111161001100010000400008400000064006513110013937104717142336113719092347538181538382774213903117771301040002814127991281862816528382
64004281452111000000027010051872816600022051400040004000207372001697627957281223104000400040002799228267116100110001000040000040030006400040680013710104087367352704619181341638151742412782114301117131270040002808328193282192827328243
6400428092211100000009000052042838324022171400040004000207090001696727886281253104000400040002805128490116100110001000040000040060000400401480014137101887218355304319124351938211339362787315005118571259240002810728258281742829028129
64004281192100000000000100526028178004222244000400040002071160816962279712836631040004000400028070279941161001100010001400000400201074006606110013812104537312354004319359340538161645402807613989117191248540002834428119283702820328145
6400428334211200000008000051602818424422123400040004000207025081695427933283573104000400040002819928093116100110001000040000840060003400451900013945103827335352203719270348838172041432786614116117731274940002829128132282592824728002
64004281802110000000012000052972827300022138400040004000207304001695927932281093104000400040002802928034116100110001000040000840030003400061400014172103377344350703819261343838141839422775513895116691261640002810228156280942810028081
6400428311212100000111301005316282132002209240004000400020741400169732791228088310400040004000280112806111610011000100004000084009012013400000000013962101607324341804519286340238171840412784314803120371273140002817027990280772801528252
64004279912100000000010010052982803200022224400040004000207051101697927915282503104000400040002804327983116100110001000040000840020006400201400014128100597302342403819032349138181732362780814591116221264740002808328412282612809428138
64004281912101000000010000051252799504422192400040004000207073181696528038281483104000400040002804628085116100110001000040000840000101744002000110014236104367211357324119115344638261537382782013770114731335540002801228111282242817428124

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3354

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
320205106730799000110044000210669907702532010010032000010032000050046742340106977010672710673226655326689320100200320000200320000106727106707118020110099010010080000800000100320000043032003800085832004061194319205109217111067330135320000100106737106737106737106737106737
32020410682279900000004400011067210000253201001003200001003200005004699887110670601067311067312665532668932010020032000020032000010673110672711802011009901001008000080000010032000000032003800042320039000000051091171110672814100320000100106732106708106708106708106732
320204106879799000000053010110671620119253201001003200001003200005004699887010668201067311067312667532671332010020032000020032000010673110672711802011009901001008000080000010032000004403200390004732000061394400051091171110672814140320000100106732106708106732106708106708
32020410674779900000004501011066922111925320100100320000100320000500469988711066820106731106731266793267133201002003200002003200001067311067271180201100990100100800008000001003200000430320038000116320000610440005109117111067041400320000100106732106728106732106708106728
320204106837799000000044010010671620119253201001003200001003200005004705621110668201067311067322667632668932010020032000020032000010673110672711802011009901001008000080000010032000004303200380102153200390104400051091171110672414140320000100106728106728106732106732106708
320204106873800000110044010110671621200253201001003200001003200005004700708110668301067071067072667932671332010020032000020032000010670710673111802011009901001008000080000010032000004303200380001613200380039440005109117111067300140320000100106732106732106708106732106732
320204106843799000000044000110669220019253201001003200001003200005004699887010670601067071067312665532671332010020032000020032000010670710672711802011009901001008000080000010032000004403200000001613200380039440005109117111067281407320000100106708106708106708106728106728
320204106759800000000000100106716211025320100100320000100320000500467408501067060106819106707266793266893201002003200002003200001067071067271180201100990100100800008000001003200000430320038060051320038600440005109117111067281407320000100106732106732106708106708106732
32020410686180000000000010010671220002532010010032000010032000050047056210106706010673410670726679326714320100200320000200320000106731106707118020110099010010080000800000100320000043032003900020032003801404400051101171110670414144320000100106735106728106732106708106712
320204106731799000110044000010671621119253201001003200001003200005004709628110670601067271067312667932671332010020032000020032000010672710670711802011009901001008000080000010032000004303200380001593200006038440005109117111067281477320000100106732106708106740106732106732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
320025106737799000102510110670117003253200101032000010320000504646912000106706106731106731266633267133200102032000020320000106731106716118002110910108000080000010320000058320072150253200556125430501911701110672866332000010106717106732106717106732106789
320024106731800000002510110671617001725320010103200001032000050473369101010669010671610671526664326713320010203200002032000010671510673111800211091010800008000001032000005832007224269320054615400501911701110671306032000010106716106732106732106717106732
320024106715799001006010010671625153017253200101032000010320000504665335000106706106715106731266803267133200102032000020320000106731106731118002110910108000080000010320000058320072220563200540054430501911701110671360032000010106717106732106732106732106758
320024106722799111006000010671625150325320010103200001032000050468020200010669010673110673126664326697320010203200002032000010673110671511800211091010800008000001032000005832007233225320025615400501911701110672806332000010106717106736106736106724107327
320024106731799000006010110671617003253200101032000010320000504734699000106706106715106731266793267133200102032000020320000106731106731118002110910108000080000010320000058320041350543200250155430501911701110671260032000010106717106716106732106732106834
3200241067317990000060000106716250301725320010103200001032000050471967800010670610673110671626667326697320010203200002032000010673110671511800211091010800008000001032000000320070154025320054012500501911701110671366032000010106716106716106716106732106897
32002410672379910000600011067161715301825320010103201301032017850471987700010687010682510671526664326732320010203200002032000010673110673111800211091010800008000001032000000320072220573200250124430501911701110672860332000010106736106732106717106720106883
3200241067227991000072100106716251530212532001010320000103200005047295050001067101067201067192667932671732001020320000203200001067311067161180021109101080000800000103202640583200728501650320184002500501911701110671600332000010106716106720106721106732106942
320024106735800000007210110672025153019253200101032000010320000504665534000106706106735106735266643267173200102032000020320000106731106731118002110910108000080000010320000058320057302875320025012501501911701110672860332000010106716106716106732106716106757
320024106731799000006010010670125150325320010103200001032000050473369100010670810673110671526679326713320010203200002032000010671610673111800211091010800008000011032000000320041051563200256154430501911701110671260032000010106716106732106717106732106757