Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4R (1D)

Test 1: uops

Code:

  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.012

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.012

retire (01) | cycle (02) | 03 | 04 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6600529285219310100381045842896300016866600840002000400020001000347576500230132910029065310600020004000200080002912529111116100110001000020000420000101200220600131379122692831414620097306838109403628389163791358214988200040002908529291292742920629224
6600429276218011110381046102886900116843601640042000400020001000047564700229572903729164310600020004000200080002902429025116100110001000020024620060006200244622127639392687930403219911301238136323428409164101318114959200040002923929241292812913929154
660042916921901010033210455828784001167946004402020004000200010002476201200229932905929243310600020004000200080002916729072116100110001000020033620030016200042021129129266689430863820037303938139373728407160311333614775200040002923029251292152921329172
6600429243218011100431104559287820001682160164016200040002000100004757613002303729157292963106000200040002000800029265290591161001100010000200336200301162000420221299891666882310438200003042382010373428369165301334414780200040002915729264292142921129244
660042928621911110020046202876302016917601240162000400020001000047544600230102905729275310600020004000200080002906429085116100110001000020043420040022200044620128389214691930333720043305638107403828455161661322714948200040002924429211293492932729211
660042908021911010047900461228823200168156012402020004000200010000475946002301829082291913106000200040002000800029134290801161001100010000200236200310022000424201290792576885303241200553041382112413328326164201329714837200040002930229309292162939129153
6600429240219011000411046172881800016855600440122000400020001000047580600230292907429165310600020004000200080002910129038116100110001000020034620031002200044620129719182686230433220039310438184363728331163911330114764200040002927729239292562922529199
6600429157218011100317004586288652001680760124004200040002000100004766211002301429067292483106000200040002000800029206290051161001100010000200424200200022000424201304391846850306332200443127382011353928406162281333315084200040002919129171291672919729166
66004292412190111004721045802879300016922601240162000400020001000047450500229872904829170310600020004000200080002910929062116100110001000120023620050014200004021128379139687831293620159308538228413528356164271352014758200040002925129190292392922629184
660042926421901100071046042884922016830600440202000400020001000047558500229952898829209310600020004000200080002905929048116100110001000120033420040002200046420128849153683730474420053313538147393828384162991336514704200040002917629257292422912329442

Test 2: throughput

Count: 8

Code:

  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480205800656001101102100280050216161025480124100320024160000100320000160000500800964544002418004680047800650034748010020016000032000020016000064000080065800651180201100991001008000080000010016001414420160051101121600386051421200510911711800440901600003200001008006680066800668004880048
480204800656001100005600280032216000254801761003200601600001003200001600005008009641088015218002880065800650034748010020016000032000020016000064000080047800651180201100991001008000080000010016001415420160051000511600006113421300510911711800449921600003200001008006680066800488004880048
4802048006559911100057102800323160102548016810032002416000010032000016000050080097010880156180046800658006500329480100200160000320000200160000640000800658006511802011009910010080000800000100160014130016001300053160038611301200510911711800449921600003200001008004880048800668004880048
4802048004760010100057102800322161610254801681003200681600001003200001600005008009701088015218004680065800650032948010020016000032000020016000064000080065800471180201100991001008000080000010016001312420160052000511600386151421310510911711800629901600003200001008006680066800668006680066
48020480065599111000660028003220160025480168100320024160000100320000160000500800970108801521800468006580047003474801002001600003200002001600006400008004780047118020110099100100800008000001001600141400160012000131600386150421220510911711800629001600003200001008006680066800488006680048
48020480047599110000570028005001601025480124100320024160000100320000160000500800042108801520800288004780047003474801002001600003200002001600006400008006580065118020110099100100800008000001001600141400160051002511600386050421310510911711800620921600003200001008004880066800488004880066
48020480047600111000571028005020160025480160100320028160000100320000160000500800042108801521800288006580065003474801002001600003200002001600006400008004880065118020110099100100800008000001001600141442016005100051160038605101300510911711800440921600003200001008004880066800668006680066
48020480047600101011570028003221601025480172100320076160000100320000160000500801083108801521800468006580065005729480100200160000320000200160000640000800658006511802011009910010080000800000100160013134251600520031122160298005101320510911711800620921600003200001008004880048800668006680066
480204800655991110001310280032001610254801681003200681600001003202961600005008009591088015218004680065800650034748010020016000032000020016000064000080047800651180201100991001008000080000010016001413420160051001501600386051421320510911711800629001600003200001008006680048800668006680066
4802048004760011001069002800502161600254801681003200241600001003200001600005008009701088015218002880065800650034748010020016000032000020016000064000080065800652180201100991001008000080000010016001213420160314103131600006150421300510911711800629021600003200001008006680066800488006680048

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0009

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
48002580060599001001058002800542000254800861032008416000010320000160000508011791088075208005008006980069035148001020160000320000201600006400008004780069118002110910108000080000010160013144316001402131600006013013105019021774800661305160000320000108007080070800708007180070
4800248006960010001005800080032250525480034103200241600001032000016000050800071108807521800280800698006933294800102016000032000020160000640000800478006911800211091010800008000001016001415431600520113160000615201300501903173380066005160000320000108007180048800708007180070
480024800476001101110580028005425502548007010320024160000103200001600005080118154400321800500800698007003514800102016000032000020160000640000800698004711800211091010800008000011016001314431600520112160039605243120050190417348006613135160000320000108004880070800488007080070
4800248006959911011001310280054205025480082103200241600001032000016000050801179108807520800510800478004733514800102016000032000020160000640000800698006911800211091010800008000001016001412431600530260160039611243130050190417558006613130160000320000108007080070800708007080070
4800248016660011111001910280054055025480082103200241600001032000016000050800048108807521800500800478006933514800102016000032000020160000640000800478006911800211091010800008000001016001313431600130152160000615243130050190417448006613135160000320000108007080070800708004880048
4800248006959911110005800280054255525480034103200721600001032000016000050801187576002808005008006980069035148001020160000320000201600006400008006980069118002110910108000080000010160014134316005300521600396151013105019051755800661304160000320000108005080070800488007080070
4800248006960011110005800080032200525480082103200641600001032000016000050801183108807521800500800698006933514800102016000032000020160000640000800698006911800211091010800008000001016001314431600530152160039015243130050190417338006613130160000320000108007080070800488007080070
480024800476001101110121028003325552548008610320024160000103200001600005080118110880752180050080069800693351480010201600003200002016000064000080047800691180021109101080000800000101600151501600521152160000615243130050190517448006613135160000320000108007080048800708007080070
480024800696001100000570028005425554548003410320064160000103200001600005080004210880752080050080069800693351480010201600003200002016000064000080069800691180021109101080000800000101600121343160052015116003961524313005019041755800441304160000320000108007080048800708007080048
480024800476001101100570028005425052548003410320072160000103200001600005080004010880752180050080069800693351480010201600003200002016000064000080069800691180021109101080000800001101600121343160052105216000001524312105019051733800661300160000320000108007080070800708004880070