Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 4S)

Test 1: uops

Code:

  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
640052862121551200801049732807024022352400040004000207092169672823828723310400040004000282742816811610011000100004000840060004002416813383106527080337204419392356138073937392814714283121531269940002831228256282782846728428
640042858221500010801048922815720422504400040004000207063169472798628488310400040004000282802838111610011000100014000840020124002006014176103967120328913919345323638134346382795114051118931258940002832028388283372831228847
640042841021200000800052302798820022569400040004000207052169572798928253310400040004000286872840911610011000100004000040060044004400013935104727158330603719623345238074040342793314014128591298640002874028315283342851128527
640042850521300000000049542805520422307400040004000207000169712799028409310400040004000284772882511610011000100004000840000064004614814048104677091335203319905357238144036402800914112119031316540002834028138284422888628385
64004288402120000000005325283982002278940004000400020711916969280722852931040004000400028659287321161001100010000400084003016400061081422699377101342603819502355238124344352801414276121881250540002886228370284572827628429
64004283462130000080005191281300402243740004000400020710116966280932839531040004000400028292288391161001100010000400084004002400600681412897817089352204119629334538144135342794814665117881261940002836628488284132829028461
640042843621600000800051612802000422497400040004000207114169762809828487310400040004000283122851211610011000100004000840040024003010814207102427092345103819741345538104340372804014282117711393940002892028457284502828428800
640042848621600000801048162805824422446400040004000207133169872802828537310400040004000285012843111610011000100004000840000104000612813984101097139329214219662348938134040392819314667126381277640002829628492285522831128489
640042847621700000800048332825824422400400040004000207104169212814928442310400040004000288312853711610011000100014000840060024003612814046100887155350804719444345938063637312802815208121061407240002824328712284372894728662
640042840921300000800050102808220022499400040004000207194169672805728367310400040004000286952839911610011000100004000840040064003502814342103737183335213519745344238094040392807314431122111303140002833328285283182840828466

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3341

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32020510673079900000001001067122121025320100100320000100320000500469897601067021067271067272667532668932010020032000020032000010673310670711802011009910010080000800000100320000430320039039320039613900051101171110672401007320000100106708106728106728106708106708
3202041067278000000045000110671601201625320100100320000100320000500470562101071151067071067072665532671332010020032000020032000010672710670711802011009910010080000800001100320000430320039039320039600430051091171110670400104320000100106708106728106708106708106728
32020410672779900000450000106712211162532010010032000010032000050047096280106682106727106727266793266893201002003200002003200001067271067071180201100991001008000080000010032000043032003903932003961384400510913511106724010144320000100106728107083106869106732106728
32020410670779900000440101106713201202532010010032000010032000050047281830106682106727106731266553267093201002003200002003200001067271067271180201100991001008000080000010032000000320039003200396039000510911711106724014107320000100106728106708106728106728106728
3202041067367990000000101106692011216253201001003200001003200005004728183110670210672710672726655326709320100200320000200320000106727106707118020110099100100800008000001003200004303200000393200000139430051101171110672400107320000100106708106708106708106728106708
32020410670779900000450101106716212016253201001003200001003200005004673754010670610672710672726655326709320100200320000200320000106727106731118020110099100100800008000001003200004303200390383200006104300511011711106724010104320000100106732106728106732106732106731
320204106731799000004400001067152121025320100100320000100320000500470962811067021067271067272667532670932010020032000020032000010672710670711802011009910010080000800000100320000003200000393200396139430051091171110672401000320000100106728106728106728106732106708
3202041067507990000045000110671221212162532010010032000010032000050047281831106706106707106727266793267093201002003200002003200001067271067271180201100991001008000080000010032000043032003903932003901394300511011711106728010107320000100106870106708106708106728106708
320204106707799000004501011067122012162532010010032000010032000050046737541106682106727106727266753267093201002003200002003200001067271067271180201100991001008000080000010032000043032003903932000061394300511011711106724010104320000100106728106728106708106728106728
32020410672779900000450101106712212121625320100100320000100320000500470962811067061067271067302667532670932010020032000020032000010670710670711802011009910010080000800000100320000430320039239320039610000510911711107020010107320000100106728106732106728106708106708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3340

retire uop (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32002510672279910004101002106712201212253200101032000010320000504685107110669710670710672226670326689320010203200002032000010673110670711800211091010800008000011032000039032003503532000061353900502011701110672006232000010106723106723106723106723106728
320024106707800110044010021067073181216253200101032000010320000504728183110670210672210670726675326709320010203200002032000010672710672211800211091010800008000001032000039032003903932003961350005019117011106724106032000010106904106881106774106723106708
3200241067227990000001002106712218180253200101032000010320000504685107110670510670710670726655326689320010203200002032000010672710672211800211091010800008000001032000039032003903932000060363900501911701110672406032000010106728106723106708106723106723
32002410672779900004500002106712212181625320010103200001032000050467375411067021067071067072665532670932001020320000203200001067221067221180021109101080000800000103200003903200390332000061350005075117011106724100432000010106708106708106708106728106728
3200241067227990000000001106692000122532001010320000103200005046851071106697106727106707266753267093200102032000020320000106727106707118002110910108000080000010320000390320035039320000613543005019117011106704100232000010106723106708106728106723106708
3200241067277990000480000010671221212122532001010320000103200005046851071106697106707106708266703267043200102032000020320000106722106722118002110910108000080000010320000003200001032003561039005019117011106724106232000010106709106728106708106728106728
320024106727799000000000010671020181425320010103200001032000050470962811066821067271067272667532670432001020320000203200001067271067221180021109101080000800000103200000032000003532003560353900501911701110672406432000010106728106728106711106728106728
32002410670779900004500001106729001802532001010320000103200005046988801106682106722106722266703266893200102032000020320000106727106722118002110910108000080000110320000003200390393200390039000501911701110672266232000010106723106723106708106723106723
320024106707799010050010011067072121216253200101032000010320000504709628110670210672710670726655326709320010203200002032000010672210670711800211091010800008000001032000039032003900320035010000507211701110672400232000010106726106723106708106723106723
3200241067227991100410100210671221818025320010103200001032000050470962811066971067271067072667532668932001020320000203200001067071067221180021109101080000800000103200003903200390032003960394300501911711110670806432000010106708106728106728106708106728