Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 8H)

Test 1: uops

Code:

  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
630052941322012011310000141046142877400024082300030003000150029162352865529209310300030003000290552906611610011000100013005693008108300151303113045932468823047104920548306938171742392846916097137571551430002950129282292922926329301
6300429283220022017000006104636289580002414830003000300015000616150285992933531030003000300029177290731161001100010001300000300000630005109001280791816893310484020629305138161635362848216453135421554830002930029203292972929329238
6300429204219013012000006104579288800002401730003000300015000816152286592921831030003000300029086291601161001100010001300009300000130045149001304493086863313473620639310238211545482838416295136171571730002927229260293012930829239
630042925721901101500002101044872888301024106300030003000150001016145286232927431030003000300029055292521161001100010000300009300400430050040001304291066891310683720607306338151740412846716474139551541830002919129217292592927429268
630042929822001301100000230046232878300124044300030003000150091116154285572923331030003000300029150290791161001100010000300006300410530010149001274190046812324774020639310538201539412835616250136561532030002933729136292422937029293
63004293262200160150000010004528287710002404230003000300015000816157285782928931030003000300029144291171161001100010000300009300440430045059001278493536945312263820598312038111439402842016424136851562330002927529277292652930529209
6300429302220012013000009004534288480302410630003000300015000516150286592930031030003000300029023290861161001100010000300009300410030045019001281990726925302154220648309038131439412837116199136551541430002938129312293382937029290
63004293432200140130000012004619288350002412630003000300015000716157286312918931030003000300029121291201161001100010000300000300000330045049001324891456900310783920649311138191238382840416204137871545230002920229257292082925429342
630042928422001509000006104610289090102412930003000300015000916159286022925631030003000300029097291161161001100010000300009300100030005150001278291376862304394120558308438121136462838416233136261558830002935629281293352928929255
63004293502190130180000010004576287670002407630003000300015001016175286052927431030003000300029155290831161001100010000300009300010430005019001280792066896308974120653313438201438442833716268137671559830002926129201294112934129442

Test 2: throughput

Count: 8

Code:

  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  ld1 { v0.8h, v1.8h, v2.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402058070360011110651280050201814252401001002400001002400005003500450180023800658004849988350023240100200240000200240000800488004811802011009910010080000800001100240019204224005710166240039515842191511041613800629922400001008006680074800708007080066
2402048006560010000740280055218017252401001002400001002400005003531263080050803688007650206350026240462200240383200240000803808006511802011009910010080000800001100240020214224005910063240039615842191511031642800449922400001008006680070800708006680049
24020480069600111003303800502181817252401001002400001002400005003520953080044800698007049992350027240100200240000200240000800698006911802011009910010080000800000100240020194224005611060240039005842190516521644800629922400001008006680048800668004980066
240204800656001010068128005031818025240100100240000100240000500353106508004080048802094999735002324010020024000020024000080066800651180201100991001008000080000010024002019024001910121240039605842192511031631800649922400001008004980066800668006680066
2402048004859911100211380033218180252401001002400001002400005003523674080040800478006549988350023240100200240000200240000800488006511802011009910010080000800000100240019204224005810060240038515942192511011633800629922400001008006780069800668006780048
2402048004859911100651380060218049252401001002400001002400005003500271080040800658006549988350023240100200240000200240000800658004711802011009910010080000800000100240021204224005710260240039015842191511031613800629922400001008006680066800688006680049
24020480065599110006513800502181816252401001002400001002400005003523674080040800688006649988350023240100200240000200240000800658006511802011009910010080000800000100240020204224005910160240039515842190511021624800629022400001008006680049800668006680066
240204800656001110021038005020181625240100100240000100240000500347251608004080065800654998835000624010020024000020024000080065800651180201100991001008000080000010024002019024005811166240040501942190511011635800639022400001008004980066800508004980066
2402048006859910100650380050118181625240100100240000100240000500350109508004080065800654998835002324010020024000020024000080065800651180201100991001008000080000110024001920422400581105824003950580191511021613800629922400001008006680048800498006680066
2402048006559911100650380053218017252401001002400001002400005003513588080040800658006549989350023240100200240000200240000800658006511802011009910010080000800000100240021214124005910260240038511942192511031632800629922400001008006680049800498006680066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240025800725990007200101800462121216252400101024000010240000503521982180036080061800615000635004124001020240000202400008006180061118002110910108000080000010240000043024004003924004051404405020816758005801000240000108006880062800618006280062
24002480061599000450101800462121216252400101024000010240000503517947180040080041800415000635002124001020240000202400008006180061118002110910108000080000010240000043024004004024004051410050205166680038010104240000108006280061800628004280062
240024800606001004501008004621201625240010102400001024000050352198218003608006180061500063500212400102024000020240000800618006611800211091010800008000011024000004302400400024004001400050205167580058010104240000108006280065800428006280062
24002480061599000450101800462121216252400101024000010240000503537918180036080061800615000635004124001020240000202400008006180061118002110910108000080000010240000000240040040240040514043050207166480058010104240000108006280062800428006280062
240024800615990004600018013321212162524001010240000102400005035222361800360801138006150006350041240010202400002024000080061800611180021109101080000800000102400000430240000039240040514043050207166480058010104240000108006180062800628006280249
240024800415990004501018004601212162524001010240000102400005035222361800360800618006150006350021240010202400002024000080061800611180021109101080000800000102400000430240040040240040604043050207167580057010100240000108004280062800628006280062
24002480041600000460001800452121202524001010240000102400005035222361800350800618006150006350021240010202400002024000080061800611180021109101080000800000102400000430240040040240040504043050204166480058010104240000108006280064800658006180061
240024800616000004501018004621212162524001010240000102400005035222361800160800618006150006350041240010202400002024000080061800611180021109101080000800000102400000430240039041240039004043050207175780061010104240000108006180062800628006280062
2400248006160000045010180049212093252401401024052010240000603537918180035080061801945010635016024001020240381202400008004180061218002110910108000080000010240000043024004104024004051404305020816468005701004240000108004280062800628006280062
240024800415990000010180026212122252400101024000010240000503517947080036080061800615000635004124001020240000202400008006180061118002110910108000080000010240000043024000000240040514043050357167580061010100240000108245983372833868306083383