Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 8H)

Test 1: uops

Code:

  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
640052952822035126000120045182909701123269400040004000207110169122872729271310400040004000291012922911610011000100004000084003010240030100012962909768433085118020285308238191353552838616233132641503840002918629212292132935129275
64004293102202412311090047282913100023461400040004000207112169862867929327310400040004000292272910411610011000100004000084002000040026038013141919768183108105720262306338241553582842016545133821501140002929629335292862936129291
6400429322220250240105004557291220002340940004000400020701016949288292929231040004000400029144291891161001100010000400008400300024002513001315791686930318265820329306438231650552845016143132191504940002922729355292722928729363
6400429290219320240009004583291360002326840004000400020723016936287732931131040004000400029198291051161001100010000400008400200084002512001311893926872307996020336322738231059622849816204134441499040002935929313293412931429237
64004293352182802111000046082904400023268400040004000207180169612869529217310400040004000290242906711610011000100004000084003000240006130012969929368623163115920313313938171460652851716156134821512940002932029318294712938829370
64004293752192202001080046002917500023320400040004000207071169402868629298310400040004000292042919511610011000100004000084002000240026100012896913369153131106120330315338211155492838216400134421493740002934229299292522930029324
6400429248220231281100104661291150002323840004000400020708016937286672921531040004000400029146291511161001100010000400008400200034002013001290491186820315996120206311938171457582847516300132721507340002930429343293752932929275
640042926421924020010500465429080000232434000400040002070501694528623292533104000400040002913729156116100110001000040000040030002400061380127929362680131337572028331203817960512849516096133301490740002923629251292762924229385
6400429376220290231008004532291580402334240004000400020711316949286242932631040004000400029192291491161001100010000400008400200004003510801298592826869304295820221309838232154512849316430132241486940002935829297292332927129292
6400429247219281241108004592290700002327940004000400020710016942287122933331040004000400029088291761161001100010000400008400200054002612801291491286939305796120232312538222059552857916076132931488740002930429322292622935829318

Test 2: throughput

Count: 8

Code:

  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32020510673080000100000101106712211213425320100100320000100320018500473068501066820106707106731266576266803201172003200242003200241067271067271180201100991001008000080000110032000004303200390000320038603944011151160160010672814100320000100106732106732106708106732106728
3202041067318000000004400011067123012162532010010032000010032001850047097170106702010672710672726657626704320119200320024200320024106731106727118020110099100100800008000011003200000003200000003832000061394401115116016001067040140320000100106728106732106708106708106830
3202041067098000011004501011066922112192532010010032000010032001850046999761106682010673110670726678626700320118200320024200320024106707106707118020110099100100800008000001003200000430320000000032000061390011151160160010670410107320000100106732106728106732106708106708
320204106731799000000560001106716211222225320100100320000100320018500467417711067020106732106727266817267043201182003200242003200241067071067271180201100991001008000080000010032000004303200380003832003801394301115116016001067041000320000100106708106708106732107071106736
320204106731799000000440100106717201221725320100100320000100320017500470971701066820106731106727266576266793201172003200242003200241067311067271180201100991001008000080000010032000004303200390003832003960000111511601600106704000320000100106732106708106728106728106728
3202041067277990000004400011066922102492532010010032000010032001850046989711106682010672710672726657626703320118200320024200320024106730106707118020110099100100800008000001003200000430320000000383200006100000051091171110672414107320000100106732106708106708106708106728
32020410673180000000044000110669221212149253201001003200001003200005004699887010670601067391067422667932671332010020032000020032000010673110672711802011009910010080000800000100320000043032004000038320038613944000051091171110672914147320000100106732106733106728106729106812
320204106727800001000450101106716210792532010010032013010032000050046738511106706010670710670726672326689320100200320000200320000106727106727118020110099100100800008000001003200000430320039000332000061390000051091171110672814147320000100106786106725106741106708106728
320204106707799000000001011066922010253201001003200001003200005004709628110668201067311067312667532671332010020032000020032000010673110672711802011009910010080000800000100320000043032003800003200000104300005109117111067240147320000100106790106981106744106739106708
32020410673179900000000001106692201216253201001003200001003200005004698880110670701067311067312667932671332010020032000020032000010673110670711802011009910010080000800000100320000000320000000032000061044000051091171110672414147320000100106728106728106708106728106708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3200251067277990090101106712212121625320010103200001032000050470962801067021067271067322667632670932001020320000203200001067271067271180021109101080000800000103201304332003904232003961394405019417111067241010432000010106728106732106732106732106732
32002410673180000207001106869212121625320010103200001032000050472818311067061067271067272667532670932001020320000203200001067311067271180021109101080000800000103200004332003924232003961394305019117111067281014432000010106732106728106708106728106732
320024106727799005710110671221121825320010103200001032000050468309901067061067271067272667932670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019217111070261010432000010106728106728106728106728106728
3200241067287990044001106712212121625320010103200001032000050472818321067111067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019117111067241010432000010106779106728106728106728106728
3200241067277990045001106712212121625320010103200001032000050470962811067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019217111067241010432000010106728106728106728106728106728
3200241067277990045001106712212121625320010103200001032000050472818311067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019117121067241010432000010106728106728106728106728106728
3200241067277990045101106712212121825320010103200001032000050470962811067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019117111067241010432000010106728106728106728106728106728
3200241067278000045001106715212121625320010103200001032000050472818311067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332000003932003961394305019117111067241010432000010106897106728106728106728106728
3200241067277990045001106716212121625320010103200001032000050470962811067021067271067272667932670932001020320000203200001067311067271180021109101080000800001103200004332003903932003961394305019217221070031010432000010106728106735106728106728106746
3200241067318001044101106712212121625320010103200001032000050470962811067021067271067272667532670932001020320000203200001067271067071180021109101080000800000103200004332003903932003961394305020117111069071010432000010106728106728106728106728106728