Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, three registers, 4S)

Test 1: uops

Code:

  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 07 | 09 | 0a | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
630052846221342010030005128281470222297830003000300015000316172281322854831030003000300028521281241161001100010001300063001013001514613166103367259319214919621323138161241392791514111119741295430002851428657283692844728193
63004279402130011005701516128063003235973000300030001500001617728005286193103000300030002802328257116100110001000130006300100300151761324294817267337003519425329438161341382777913858121251333030002809928167282922805928003
63004279752110201005401525328219200229623000300030001500011617928171285483103000300030002797028488116100110001000030006300101300151361350410448731434491371942935133818729382780813821128461471730002819728515282332805728486
63004280422110101004500482727970220232673000300030001500031615728013283163103000300030002832528012116100110001000030006300101300151161383896307313319504319494346538171333392805814007128101314730002823828486282302816228503
630042850921311010062410815274284082002291730003000300015000116172279292855731030003000300028371280891161001100010000300063001013001514614269103237292346403019919348238211140362803715151119871309130002807228355281622809828155
630042854121301010021004829279732002336830003000300015000016168279072841131030003000300028479281771161001100010000300063001013001514614164102757170330303019441348538181032412782314071127611295530002811828308281242825728213
6300428038211010100601521727871220230243000300030001500011617028136284073103000300030002849228072116100110001000030006300101300151461406699207311350004219689337838171438342810015091118481282330002809928455281612805928136
63004282882100101003300520828355000228983000300030001500011617927991283803103000300030002807228098116100110001000030006300101300151161361797777169337003819512330238181739352773913906127851326330002810928076280402817228088
63004285462101101004201529228245000228583000300030001500011617127839283373103000300030002864128129116100110001000030006300101300151461405210277733534450411958434633816737372782914044128571412930002814328109281572806228152
6300428140211000111480150602793220022955300030003000150005161552786428384310300030003000280912849711610011000100013000630010130015116133099801702233290401945731573822737342773613847119961356930002846128090287912814428449

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2402058005659910045128004601212122524010010024013010024000050034956610800188007180046499843500142401002002400002002400008004080056118020110099100100800008000011002400000240000040240053513740005110116118003710042400001008006580062800458005780062
2402048006160000001280046312121625240100100240000100240000500350030308001580072800714998435001924010020024000020024000080056800561180201100991001008000080000010024000040240000039240040503740005110116118003710022400001008006280062800628006280062
24020480056599000580280041212120252401001002400001002400005003537804080036800728007849984350019240100200240000200240000800618004011802011009910010080000800000100240000024004003524004000040005110116118005810042400001008005780041800418006280062
24020480061600000000800412121916252401001002400001002400005003539192080015800878006149978349998240100200240000200240000800568005611802011009910010080000800000100240000402400400024013060040005110116118005410612400001008005980057800578006280062
2402048006160000045128004120191625240100100240000100240000500353780408003780050800644998335001924010020024000020024000080056800571180201100991001008000080000010024000040240000040240040514140005110116118005810012400001008006280062800628005780041
240204800406000009600800462120162524010010024000012124000050034954360800368006480056499843499982401002002400002002400008004080056118020110099100100800008000001002400004324003700240000513700051101161180040101002400001008006280062800628004180062
240204800615990007502800460121216252401001002400001002400005003501773080015800718006149984350019240100200240000200240000800618005611802021009910010080000800000100240000024004005524000050393900511011611800530642400001008006280041800418006280041
2402048006160000045118004600121625240100100240000100240000500349468108003680074800694998435001924010020024000020024000080061800561180201100991001008000080000010024000002400370024000050040005110116118005810602400001008004180041800588006280062
240204800406000004601800462012025240100100240000100240000500353848008001580060800564998435000124010020024000020024000080040800561180201100991001008000080000010024000002400400402400005040400051101161180058101042400001008004180062800418004180062
240204800405990004512800252012182524010010024000010024000050034954360801038005280056499843500192401002002400002002400008004080040118020110099100100800008000011002400000240040040240040514040005110116118005810642400001008004180062800628005780062

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0009

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
24002580071600111110670038010237720252400101024000010240000503527112180023800698004750206350049240010202400002024000080047800691180021109010108000080000010240019204324001900062240041516045198050201161180280013130240000108007080070800718007080070
24002480069600110000210028003237002524001010240000102400005035215821800448008180069499923500272400102024000020240000800708004711800211090101080000800000102400192102400601116124000000194319105020116118006600130240000108007180049800498007080056
24002480070600110000211028005529720252400101024000010240000503500769180045800868007049993350027240010202400002024000080072800701180021109010108000080000010240019194324001900062240055005843191050201161180067013135240000108004980070800708007180048
2400248015760111100021103800543072025240010102400001024000050353365608002280081800714999935002724001020240000202400008004780070118002110901010800008000001024001920432400601012124004151190191050201161180044013135240000108007080049800708007180070
240024800705991010002110180054207202524001010240000102400005035015750800238008380082499933500282400102024000020240192800898005711800211090101080000800001102400192002400191002124003950604319005020116118004500135240000108007080074800748007380070
24002480069599110000990028003207919252400101024000010240000503500176080044800858007050015350028240010202400002024000080069800691180021109010108000080000010240021194324001920221240040606043190050201161180066013135240000108016180071800718004980070
24002480070599110000211008005530718252400101024000010240000503527112180045800848006950014350032240010202400002024000080070800701180021109010108000080000010240020204324005710021240041505944191050201161180066013135240000108007080049800708007380071
240024800696051000003600180054077202524001010240000102400005035215820800468008380047500173500502400102024000020240000800698006911800211090101080000800000102400192002400570012124004151194318105020116118006601305240000108007080070800718007080070
240024800706001000002110280033307202524001010240000102400005035215820800458008380047500143500272402082024000020240000800698004711800211090101080000800000102400202043240060101612400005159431910502011611800440000240000108007080070800708004880048
2400248007060010100068003800332071252400101024000010240000503501575080023800788006950015350049240010202400002024000080069800691180021109010108000080000010240019194324005910162240040006045191050201161180066013135240000108007180070800708007080048