Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 4S)

Test 1: uops

Code:

  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 07 | 0a | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
640052862121551200801049732807024022352400040004000207092169672823828723310400040004000282742816811610011000100004000840060004002416813383106527080337204419392356138073937392814714283121531269940002831228256282782846728428
640042858221500010801048922815720422504400040004000207063169472798628488310400040004000282802838111610011000100014000840020124002006014176103967120328913919345323638134346382795114051118931258940002832028388283372831228847
640042841021200000800052302798820022569400040004000207052169572798928253310400040004000286872840911610011000100004000040060044004400013935104727158330603719623345238074040342793314014128591298640002874028315283342851128527
640042850521300000000049542805520422307400040004000207000169712799028409310400040004000284772882511610011000100004000840000064004614814048104677091335203319905357238144036402800914112119031316540002834028138284422888628385
64004288402120000000005325283982002278940004000400020711916969280722852931040004000400028659287321161001100010000400084003016400061081422699377101342603819502355238124344352801414276121881250540002886228370284572827628429
64004283462130000080005191281300402243740004000400020710116966280932839531040004000400028292288391161001100010000400084004002400600681412897817089352204119629334538144135342794814665117881261940002836628488284132829028461
640042843621600000800051612802000422497400040004000207114169762809828487310400040004000283122851211610011000100004000840040024003010814207102427092345103819741345538104340372804014282117711393940002892028457284502828428800
640042848621600000801048162805824422446400040004000207133169872802828537310400040004000285012843111610011000100004000840000104000612813984101097139329214219662348938134040392819314667126381277640002829628492285522831128489
640042847621700000800048332825824422400400040004000207104169212814928442310400040004000288312853711610011000100014000840060024003612814046100887155350804719444345938063637312802815208121061407240002824328712284372894728662
640042840921300000800050102808220022499400040004000207194169672805728367310400040004000286952839911610011000100004000840040064003502814342103737183335213519745344238094040392807314431122111303140002833328285283182840828466

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3341

retire (01) | cycle (02) | 03 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020510673079900000001001067122121025320100100320000100320000500469897601067021067271067272667532668932010020032000020032000010673310670711802011009910010080000800000100320000430320039039320039613900051101171110672401007320000100106708106728106728106708106708
3202041067278000000045000110671601201625320100100320000100320000500470562101071151067071067072665532671332010020032000020032000010672710670711802011009910010080000800001100320000430320039039320039600430051091171110670400104320000100106708106728106708106708106728
32020410672779900000450000106712211162532010010032000010032000050047096280106682106727106727266793266893201002003200002003200001067271067071180201100991001008000080000010032000043032003903932003961384400510913511106724010144320000100106728107083106869106732106728
32020410670779900000440101106713201202532010010032000010032000050047281830106682106727106731266553267093201002003200002003200001067271067271180201100991001008000080000010032000000320039003200396039000510911711106724014107320000100106728106708106728106728106728
3202041067367990000000101106692011216253201001003200001003200005004728183110670210672710672726655326709320100200320000200320000106727106707118020110099100100800008000001003200004303200000393200000139430051101171110672400107320000100106708106708106708106728106708
32020410670779900000450101106716212016253201001003200001003200005004673754010670610672710672726655326709320100200320000200320000106727106731118020110099100100800008000001003200004303200390383200006104300511011711106724010104320000100106732106728106732106732106731
320204106731799000004400001067152121025320100100320000100320000500470962811067021067271067272667532670932010020032000020032000010672710670711802011009910010080000800000100320000003200000393200396139430051091171110672401000320000100106728106728106728106732106708
3202041067507990000045000110671221212162532010010032000010032000050047281831106706106707106727266793267093201002003200002003200001067271067271180201100991001008000080000010032000043032003903932003901394300511011711106728010107320000100106870106708106708106728106708
320204106707799000004501011067122012162532010010032000010032000050046737541106682106727106727266753267093201002003200002003200001067271067271180201100991001008000080000010032000043032003903932000061394300511011711106724010104320000100106728106728106708106728106728
32020410672779900000450101106712212121625320100100320000100320000500470962811067061067271067302667532670932010020032000020032000010670710670711802011009910010080000800000100320000430320039239320039610000510911711107020010107320000100106728106732106728106708106708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3340

retire (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32002510672279910004101002106712201212253200101032000010320000504685107110669710670710672226670326689320010203200002032000010673110670711800211091010800008000011032000039032003503532000061353900502011701110672006232000010106723106723106723106723106728
320024106707800110044010021067073181216253200101032000010320000504728183110670210672210670726675326709320010203200002032000010672710672211800211091010800008000001032000039032003903932003961350005019117011106724106032000010106904106881106774106723106708
3200241067227990000001002106712218180253200101032000010320000504685107110670510670710670726655326689320010203200002032000010672710672211800211091010800008000001032000039032003903932000060363900501911701110672406032000010106728106723106708106723106723
32002410672779900004500002106712212181625320010103200001032000050467375411067021067071067072665532670932001020320000203200001067221067221180021109101080000800000103200003903200390332000061350005075117011106724100432000010106708106708106708106728106728
3200241067227990000000001106692000122532001010320000103200005046851071106697106727106707266753267093200102032000020320000106727106707118002110910108000080000010320000390320035039320000613543005019117011106704100232000010106723106708106728106723106708
3200241067277990000480000010671221212122532001010320000103200005046851071106697106707106708266703267043200102032000020320000106722106722118002110910108000080000010320000003200001032003561039005019117011106724106232000010106709106728106708106728106728
320024106727799000000000010671020181425320010103200001032000050470962811066821067271067272667532670432001020320000203200001067271067221180021109101080000800000103200000032000003532003560353900501911701110672406432000010106728106728106711106728106728
32002410670779900004500001106729001802532001010320000103200005046988801106682106722106722266703266893200102032000020320000106727106722118002110910108000080000110320000003200390393200390039000501911701110672266232000010106723106723106708106723106723
320024106707799010050010011067072121216253200101032000010320000504709628110670210672710670726655326709320010203200002032000010672210670711800211091010800008000001032000039032003900320035010000507211701110672400232000010106726106723106708106723106723
3200241067227991100410100210671221818025320010103200001032000050470962811066971067271067072667532668932001020320000203200001067071067221180021109101080000800000103200003903200390032003960394300501911711110670806432000010106708106728106728106708106728