Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 8H)

Test 1: uops

Code:

  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 07 | 09 | 0a | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
640052952822035126000120045182909701123269400040004000207110169122872729271310400040004000291012922911610011000100004000084003010240030100012962909768433085118020285308238191353552838616233132641503840002918629212292132935129275
64004293102202412311090047282913100023461400040004000207112169862867929327310400040004000292272910411610011000100004000084002000040026038013141919768183108105720262306338241553582842016545133821501140002929629335292862936129291
6400429322220250240105004557291220002340940004000400020701016949288292929231040004000400029144291891161001100010000400008400300024002513001315791686930318265820329306438231650552845016143132191504940002922729355292722928729363
6400429290219320240009004583291360002326840004000400020723016936287732931131040004000400029198291051161001100010000400008400200084002512001311893926872307996020336322738231059622849816204134441499040002935929313293412931429237
64004293352182802111000046082904400023268400040004000207180169612869529217310400040004000290242906711610011000100004000084003000240006130012969929368623163115920313313938171460652851716156134821512940002932029318294712938829370
64004293752192202001080046002917500023320400040004000207071169402868629298310400040004000292042919511610011000100004000084002000240026100012896913369153131106120330315338211155492838216400134421493740002934229299292522930029324
6400429248220231281100104661291150002323840004000400020708016937286672921531040004000400029146291511161001100010000400008400200034002013001290491186820315996120206311938171457582847516300132721507340002930429343293752932929275
640042926421924020010500465429080000232434000400040002070501694528623292533104000400040002913729156116100110001000040000040030002400061380127929362680131337572028331203817960512849516096133301490740002923629251292762924229385
6400429376220290231008004532291580402334240004000400020711316949286242932631040004000400029192291491161001100010000400008400200004003510801298592826869304295820221309838232154512849316430132241486940002935829297292332927129292
6400429247219281241108004592290700002327940004000400020710016942287122933331040004000400029088291761161001100010000400008400200054002612801291491286939305796120232312538222059552857916076132931488740002930429322292622935829318

Test 2: throughput

Count: 8

Code:

  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3341

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020510673080000100000101106712211213425320100100320000100320018500473068501066820106707106731266576266803201172003200242003200241067271067271180201100991001008000080000110032000004303200390000320038603944011151160160010672814100320000100106732106732106708106732106728
3202041067318000000004400011067123012162532010010032000010032001850047097170106702010672710672726657626704320119200320024200320024106731106727118020110099100100800008000011003200000003200000003832000061394401115116016001067040140320000100106728106732106708106708106830
3202041067098000011004501011066922112192532010010032000010032001850046999761106682010673110670726678626700320118200320024200320024106707106707118020110099100100800008000001003200000430320000000032000061390011151160160010670410107320000100106732106728106732106708106708
320204106731799000000560001106716211222225320100100320000100320018500467417711067020106732106727266817267043201182003200242003200241067071067271180201100991001008000080000010032000004303200380003832003801394301115116016001067041000320000100106708106708106732107071106736
320204106731799000000440100106717201221725320100100320000100320017500470971701066820106731106727266576266793201172003200242003200241067311067271180201100991001008000080000010032000004303200390003832003960000111511601600106704000320000100106732106708106728106728106728
3202041067277990000004400011066922102492532010010032000010032001850046989711106682010672710672726657626703320118200320024200320024106730106707118020110099100100800008000001003200000430320000000383200006100000051091171110672414107320000100106732106708106708106708106728
32020410673180000000044000110669221212149253201001003200001003200005004699887010670601067391067422667932671332010020032000020032000010673110672711802011009910010080000800000100320000043032004000038320038613944000051091171110672914147320000100106732106733106728106729106812
320204106727800001000450101106716210792532010010032013010032000050046738511106706010670710670726672326689320100200320000200320000106727106727118020110099100100800008000001003200000430320039000332000061390000051091171110672814147320000100106786106725106741106708106728
320204106707799000000001011066922010253201001003200001003200005004709628110668201067311067312667532671332010020032000020032000010673110672711802011009910010080000800000100320000043032003800003200000104300005109117111067240147320000100106790106981106744106739106708
32020410673179900000000001106692201216253201001003200001003200005004698880110670701067311067312667932671332010020032000020032000010673110670711802011009910010080000800000100320000000320000000032000061044000051091171110672414147320000100106728106728106708106728106708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3341

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3200251067277990090101106712212121625320010103200001032000050470962801067021067271067322667632670932001020320000203200001067271067271180021109101080000800000103201304332003904232003961394405019417111067241010432000010106728106732106732106732106732
32002410673180000207001106869212121625320010103200001032000050472818311067061067271067272667532670932001020320000203200001067311067271180021109101080000800000103200004332003924232003961394305019117111067281014432000010106732106728106708106728106732
320024106727799005710110671221121825320010103200001032000050468309901067061067271067272667932670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019217111070261010432000010106728106728106728106728106728
3200241067287990044001106712212121625320010103200001032000050472818321067111067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019117111067241010432000010106779106728106728106728106728
3200241067277990045001106712212121625320010103200001032000050470962811067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019217111067241010432000010106728106728106728106728106728
3200241067277990045001106712212121625320010103200001032000050472818311067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019117121067241010432000010106728106728106728106728106728
3200241067277990045101106712212121825320010103200001032000050470962811067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332003903932003961394305019117111067241010432000010106728106728106728106728106728
3200241067278000045001106715212121625320010103200001032000050472818311067021067271067272667532670932001020320000203200001067271067271180021109101080000800000103200004332000003932003961394305019117111067241010432000010106897106728106728106728106728
3200241067277990045001106716212121625320010103200001032000050470962811067021067271067272667932670932001020320000203200001067311067271180021109101080000800001103200004332003903932003961394305019217221070031010432000010106728106735106728106728106746
3200241067318001044101106712212121625320010103200001032000050470962811067021067271067272667532670932001020320000203200001067271067071180021109101080000800000103200004332003903932003961394305020117111069071010432000010106728106728106728106728106728