Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple structures, 4S)

Test 1: uops

Code:

  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.006

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 3.006

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 46 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
66005293232191111160000146102887801685660063006300030003000150003569222301629137292763106000300030003000900029133291631161001100010001300063000013001310312906909168693085115220058309038215363428422164001331515032300030002930029234294042931529260
66004293352190100140116146682887001686860063006300030003000150003575312300929155292193106000300030003000900029095291981161001100010000300033001013000311312829924168663054527200743085381213353428538163561329614908300030002922129283293872925429271
660042928121909010000614570288360168566009300330003000300015000357651230762910029269310600030003000300090002914029138116100110001000030006300000300131031303292486852303053520071301738207363828485163561342415031300030002927329236292682923029239
6600429281220017013000514541288780168476006300330003000300015000357122230532906529243310600030003000300090002913529090116100110001000030006300600300151131292191116869305153120119313938289312728398163741331315027300030002931529294292452927229257
6600429226219012013000614579289093168876003300630003000300015000356862229292911029265310600030003000300090002914729115116100110001000030003300001300150161298992546878305073220113308038217353128423164301324214626300030002924229233293002925429310
660042922822008014000614654288730168826006300330003000300015003356982230092909629266310600030003000300090002926029086116100110001000030006300001300451181283892616839307043320023314138219373828429162401325814929300030002930529277292552922829195
66004293092180809000614584288520168286006300330003000300015000357171230162909729288310600030003000300090002914229178116100110001000030003300100300151161281291906958314863420093310438197303428394164741316814879300030002931229253292982928529235
660042919821901108000614601288940168406006300630003000300015000357212230072911329166310600030003000300090002910529059116100110001000030006300001300351161297189276808303233020099307638159374028429163811341914979300030002924829218292732926729269
6600429291220016016000614622289040169026006300630003000300015000357033230152906929227310600030003000300090002907129120116100210001000030006300101300151161278890956851304272720107309638238333228394162351321514875300030002931629319292732930529285
6600429287219010014000614654288720168626006300630003000300015005357162229502904329256310600030003000300090002912929177116100110001000030006300001300031061275092396862300183320087311338119323528386162711336114817300030002923329192293192922029235

Test 2: throughput

Count: 8

Code:

  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  ld3 { v0.4s, v1.4s, v2.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480205800806000100007080047204142548014210024004324000010024000024000050016764415730957180047800448004410034848010020024000024000020024000072000080066800621180201100991100100800008000001002400000452400430002400425100051091171180059141032400002400001008006780067800458006380045
480204800445990114710228005624402548041510024004324000010024000024000050035417525532980180034800468007200337480100200240000240000200240000720000800628006211802011009951001008000080000010024000000240000004224000050045051091171180063141402400002400001008004580045800638006780045
480204800445991119600018005100414254801441002400442400001002400002400005003529373554276418002580066800441403444801002002400002400002002400007200008006280062118020110099510010080000800000100240000045240000004224004201424505109117118005914002400002400001008004580067800458006380067
480204800665990114800228005104414254801421002400422400001002400002400005003541752360326318004780066800660034848010020024000024000020024000072000080044800621180201100991100100800008000001002400000452400000042240000014300510911711800630032400002400001008006780067800458006780045
4802048006660000148105028005104414254801441002400422400001002400002400005003524521556946118004780068800440034848010020024000024000020024000072000080066800441180201100990100100800008000001002400000024000010024000001424505109117118005914032400002400001008006780067800728006780067
480204800666000110003180051244025480143100240043240000100240000240000500167644157323931800258006280066140348480100200240000240000200240000720000800668004411802011009951001008000080000010024000000240000004224004350424505109117118006314002400002400001008006780067800638006780045
48020480066600010000328023401500254801451002400452400001002400002400005003529373554235118002580044800441003264801002002400002400002002400007200008004480044118020110099510010080000800000100240000045240000004324004351045051091171180063141402400002400001008006780045800678004580045
48020480066600011480012800473154142548010510024004424000010024000024000050016764413880301180047800628006600348480100200240000240000200240000720000800668004411802011009951001008000080000010024000004524000000542400000143005109117118004114032400002400001008006780067800678006780067
4802048004460000000020800290400254801421002400432400001002400002400005003541752553297518004780063800661403484801002002400002400002002400007200008006280044118020110099510010080000800000100240000045240043000240000510450510911711800590002400002400001008006680067800678004580087
4802048006660000051003080051044142548014410024000624000010024000024000050035245215569535180037800708007000326480100200240000240000200240000720000800628004411802011009901001008000080000110024000000240000000240043614200510911711800630032400002400001008006880067800688006380067

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 1e | 22 | 24 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | cd | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480025800676000011481010800292150102548005210240042240000102400002400005035243295532975418002580068800620034848001020240000240000202400007200008006280062118002110911010800008000001024001717430240017012024004051000050191171118005910100240000240000108006380045800638006380063
4800248006260000114800120800472151514254800201024004524000010240000240000503288778557932531800258006280044100326480010202400002400002024000072000080381800911180021109110108000080000010240000045024004210442400425141450050191170118005910100240000240000108006380045800458006380063
48002480062599000047000180047215150254800531024004624000010240000240000503542951573110381800438006280066901924348001020240000240000202400007200008004480062118002110901010800008000011024000004502400000002400000142450050191170118004110100240000240000108006780063800458006380045
48002480062600001147100080047241502548001610240042240000102400002400005016764415532980818002580062800629034448001020240000240000202400007200008004480062118002110911010800008000001024000004502400001042240042504200050191170118004110100240000240000108006380045800458004580045
480024800446000011010818004701515132548005210240042240000102400002400005035415605542353718004380062800441015326480010202400002400002024000072000080062800441180021109110108000080000010240000045024004200422400420142450050191170118005910100240000240000108004580063800638006380045
480024800716010001471000800292151510254800521024004224000010240000240000503524329554234400800438006680044100348480010202400002400002024000072000080062800621180021109110108000080000010240000045024004200422400430142450050191170118004110100240000240000108004580063800638006780063
4800248006260000014710118004701501025480052102400422400001024000024000050352452156939710180043800628006200344480010202400002400002024000072000080062800441180021109110108000080000010240000045024004200422400005142450050191170118005910100240000240000108006380063800638006380063
48002480062599110047001080047201510254800161024000824000010240000240000503529181578307601800438006280062003444800102024000024000020240000720000800628006211800211091101080000800000102400000002400410042240000004245005019117011800591000240000240000108004580067800638006380045
4800248006660000104700318004720151025480052102400422400001024000024020250352937357232410180047800628004410034448001020240000240000202400007200008007980045118002110901010800008000001024000004502400410002400426042450050191170118004114100240000240000108006380074800668006380045
480024800625990000481051800472401025480052102400452400001024000024000050354156057255990080025800448006210034448001020240000240000202400007200008004480044118002110911010800008000011024000004502400000042240042514145005019117011800591000240000240000108006780063800638004580045