Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple structures, 4S)

Test 1: uops

Code:

  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 12.014

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.014

retire (01) | cycle (02) | 03 | 07 | 0a | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
720052931722025190000104629290482201533912014801440008000400020323982502247582919029335310120004000800040002000029285292961161001100010000400084002054000515812924920068493124676193683109382212635528683163191302213530400080002927129349294192928329300
7200429215218151500091044762910200015222120148014400080004000203309835052474329108292553101200040008000400020000292012926911610011000100004000040060640086050131979293684031431156193433085382119585828600164931296313616400080002935629469294052930829266
7200429348220222901010004631291170041536212000800040008000400020321983651247822914629310310120004000800040002000029178293251161001100010000400084000064000502813021922368143065964193423089382019625828558163551295913504400080002929929245292912929929340
72004293622192611100010456229119200152031201480144000800040002034198328524769292542929931012000400080004000200002919529191116100110001000040000400000400260012129739194685430821060193343134382421565528461163371307213597400080002923429298293772935429241
7200429289220172811080045492903620015238120148024400080004000203859833612476929154292103101200040008000400020000291572917611610011000100004000124003083400061012129639113686930261060192503017382117565928524162611312113524400080002935329353292902941029392
720042936022019190008004551290750001534712000801440008000400020322980643247682921629304310120004000800040002000029255291931161001100010000400004002054005612012931923668363029556193113035382014595728594163611299813557400080002926929404293442931029293
720042931522020240008004555290780001533112014801440008000400020313980130247412923129290310120004000800040002000029334292551161001100010000400004003010740036128128639003684731501260193483092382218526128493164691273713506400080002935829381293542921529389
720042931822021221008004547291142001529312028801440008000400020334984052247502925229249310120004000800040002000029185292451161001100010000400004002084002610812925916669343115951193163068382117565928483161701312713573400080002929829346293592938729296
7200429342220242001000044962912900015356120008000400080004000203259832432478329188293653101200040008000400020000291992921311610011000100004000114003084005013012930940268373065964192603131382716576028539163981289413426400080002936829329292842928229269
7200429335220191500001045792902000015299120148014400080004000203149840102470529229293313101200040008000400020000292912920011610011000100004000040020040026108128019157681330251055192293097382412645728417163411286813587400080002930629264293872932329321

Test 2: throughput

Count: 8

Code:

  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 07 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
960205160069119900000005301000160044312120025960156100640000320000100640000320000500168048218560012116004001600431600430032596010020032000064000020032000016000001600591600591180201100991001008000080000010032000003503206824032003261035000510902171316005610101320000640000100160060160044160044160060160044
960204160043119900000000010001600443012002596015610064005632000010064000032000050016809351856001211600400160043160059003419601002003200006400002003200001600000160059160059118020110099100100800008000001003200000350320032023932003201323500051090117141600560101320000640000100160060160060160060160044160044
96020416004311990000000380000016004430120172596010010064005632008610064000032000050016804671856001611600400160059160043003419601002003200006400002003200001600000160043160059118020110099100100800008000001003200000350320032303232003261320000510901172316004214100320000640000100160044160060160060160044160060
960204160059119900000003801000160129312120025960156100640056320000100640000320000500168000181600000160040016005916006100325960100200320000640000200320000160000016005916004311802011009910010080000800000100320000000320000003200320103500051090117341600560101320000640000100160060160044160044160044160044
9602041600451199000010038000001600283120002596015610064000032000010064000032000050016800011856001201600240160059160059003419601002003200006400002003200001600000160043160043118020110099100100800008000001003200000350320032032320032013200005109041743160040000320000640000100160061160060160044160044160060
96020416005911990000000120100016004430120025960156100640000320000100640000320000500168000181600000160024016004316005900325960100200320000640300200320000160000016004316005911802011009910010080000800000100320000001632003203232003261035000513702172416005610101320000640000100160060160060160044160060160044
96020416005911990000000000000160028001200228962372100640064320000100640800320796500170844598533920160812016324516260000341960100200320000640000200320000160000016004516004311802011009910010080000800000100320000035032000003232003261323500051090517531600400141320000640000100160060160060160044160060160060
96020416004311980000000380100016002801200025960156100640056320000100640000320096500168108618560016016002401600591600590034196010020032000064000020032000016000001600591600601180201100991001008000080000010032000003503200320032003261323500051090217331600611001320000640000100160060160060160060160044160044
96020416005911990000000000000160028012000259601561006400563200001006400003200005001680001185600120160040016004316005900325960100200320000640000200320000160000016005916005911802011009910010080000800000100320000000320032032320032600000051090217341600401001320000640000100160060160060160044160060160060
9602041600591199000010000000016002831200025960156100640064320000100640000320000500168041518560012016002401600591600590034196010020032000064000020032000016000001600591600591180202100991001008000080000010032000003503200320383200000132000051090417421600560101320000640000100160060160060160044160044160060

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
9600251600561199000000030000160050312120259600541064004432000010640000320000501680242166399800160024160056160056003389600102032000064000020320152160000016005616005611800211091010800008000001032000002703200240002432000060240000050193175316004066132000064000010160057160057160044160057160057
960024160056119900000003001016004111200459600541064004432000010640000320000501680227166399800160037160056160056003259600102032000064000020320000160000016005616004311800211091010800008000001032000002703200241300320000602427000050193175316005366032000064000010160044160057160044160057160057
9600241600561199000100030000160041312120259600541064014032000010640000320000501680045166399801160037160043160043003259600102032000064000020320000160000016005516005611800211091010800008000011032000002703200000002432002460027000050194174416005366132000064000010160057160057160057160044160136
96002416005311990000000300101600503012025960058106400003200001064000032000050168000116639980016002416005616004600325960010203200006400002032000016000001600431600561180021109101080000800000103200000270320024000032000060027000050194174416005360132000064000010160057160044160057160057160044
960024160056119900000000010160041301202596005410640044320000106405323204085016800011600002011600241600561600560033896001020320000640000203200001600000160056160043118002110910108000080000010320000027032000000024320024612427000050192174416004006132000064000010160044160044160044160057160044
9600241600561198000000030010160041312002596001010640044320000106400003200005016802478160000116002416005616004300338960010203200006400002032000016000001600561600561180021109101080000800000103200000270320000063024320024012427000050193175516005366032000064000010160057160044160044160057160044
9600241600561199000000030010160041012120259600541064004432000010640000320000501680242166399801160037160056160056003259600102032000064000020320000160000016005616005611800211091010800008000001032000000032002400087132002401027000050193175516005306132000064000010160057160044160057160057160057
96002416005611990000000300001600413120025960054106400443200001064000032000050168024216639980116003716005616004300325960010203200006400002032000016000001600431600561180021109101080000800000103200000003200240000320024002427000050194173416005300132000064000010160044160057160057160057160047
96002416004311990000000001016004130120259600101064000032000010640000320000501680001166399800160024160043160043003259600102032000064000020320000160000016005616005611800211091010800008000001032000002703200240002432003261240000150194174416005366032000064000010160044160044160057160044160057
96002416004311990000000001016004121212025960054106400443200001064000032000050168000181600001160027160043160044003389600102032000064000020320000160000016004416004611800211091010800008000001032000002703200250000320000612427000050194175316004006032000064000010160057160057160045160044160044