Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 4S)

Test 1: uops

Code:

  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
630052846221342010030005128281470222297830003000300015000316172281322854831030003000300028521281241161001100010001300063001013001514613166103367259319214919621323138161241392791514111119741295430002851428657283692844728193
63004279402130011005701516128063003235973000300030001500001617728005286193103000300030002802328257116100110001000130006300100300151761324294817267337003519425329438161341382777913858121251333030002809928167282922805928003
63004279752110201005401525328219200229623000300030001500011617928171285483103000300030002797028488116100110001000030006300101300151361350410448731434491371942935133818729382780813821128461471730002819728515282332805728486
63004280422110101004500482727970220232673000300030001500031615728013283163103000300030002832528012116100110001000030006300101300151161383896307313319504319494346538171333392805814007128101314730002823828486282302816228503
630042850921311010062410815274284082002291730003000300015000116172279292855731030003000300028371280891161001100010000300063001013001514614269103237292346403019919348238211140362803715151119871309130002807228355281622809828155
630042854121301010021004829279732002336830003000300015000016168279072841131030003000300028479281771161001100010000300063001013001514614164102757170330303019441348538181032412782314071127611295530002811828308281242825728213
6300428038211010100601521727871220230243000300030001500011617028136284073103000300030002849228072116100110001000030006300101300151461406699207311350004219689337838171438342810015091118481282330002809928455281612805928136
63004282882100101003300520828355000228983000300030001500011617927991283803103000300030002807228098116100110001000030006300101300151161361797777169337003819512330238181739352773913906127851326330002810928076280402817228088
63004285462101101004201529228245000228583000300030001500011617127839283373103000300030002864128129116100110001000030006300101300151461405210277733534450411958434633816737372782914044128571412930002814328109281572806228152
6300428140211000111480150602793220022955300030003000150005161552786428384310300030003000280912849711610011000100013000630010130015116133099801702233290401945731573822737342773613847119961356930002846128090287912814428449

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  ld1 { v0.4s, v1.4s, v2.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402058005659910045128004601212122524010010024013010024000050034956610800188007180046499843500142401002002400002002400008004080056118020110099100100800008000011002400000240000040240053513740005110116118003710042400001008006580062800458005780062
2402048006160000001280046312121625240100100240000100240000500350030308001580072800714998435001924010020024000020024000080056800561180201100991001008000080000010024000040240000039240040503740005110116118003710022400001008006280062800628006280062
24020480056599000580280041212120252401001002400001002400005003537804080036800728007849984350019240100200240000200240000800618004011802011009910010080000800000100240000024004003524004000040005110116118005810042400001008005780041800418006280062
24020480061600000000800412121916252401001002400001002400005003539192080015800878006149978349998240100200240000200240000800568005611802011009910010080000800000100240000402400400024013060040005110116118005410612400001008005980057800578006280062
2402048006160000045128004120191625240100100240000100240000500353780408003780050800644998335001924010020024000020024000080056800571180201100991001008000080000010024000040240000040240040514140005110116118005810012400001008006280062800628005780041
240204800406000009600800462120162524010010024000012124000050034954360800368006480056499843499982401002002400002002400008004080056118020110099100100800008000001002400004324003700240000513700051101161180040101002400001008006280062800628004180062
240204800615990007502800460121216252401001002400001002400005003501773080015800718006149984350019240100200240000200240000800618005611802021009910010080000800000100240000024004005524000050393900511011611800530642400001008006280041800418006280041
2402048006160000045118004600121625240100100240000100240000500349468108003680074800694998435001924010020024000020024000080061800561180201100991001008000080000010024000002400370024000050040005110116118005810602400001008004180041800588006280062
240204800406000004601800462012025240100100240000100240000500353848008001580060800564998435000124010020024000020024000080040800561180201100991001008000080000010024000002400400402400005040400051101161180058101042400001008004180062800418004180062
240204800405990004512800252012182524010010024000010024000050034954360801038005280056499843500192401002002400002002400008004080040118020110099100100800008000011002400000240040040240040514040005110116118005810642400001008004180062800628005780062

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24002580071600111110670038010237720252400101024000010240000503527112180023800698004750206350049240010202400002024000080047800691180021109010108000080000010240019204324001900062240041516045198050201161180280013130240000108007080070800718007080070
24002480069600110000210028003237002524001010240000102400005035215821800448008180069499923500272400102024000020240000800708004711800211090101080000800000102400192102400601116124000000194319105020116118006600130240000108007180049800498007080056
24002480070600110000211028005529720252400101024000010240000503500769180045800868007049993350027240010202400002024000080072800701180021109010108000080000010240019194324001900062240055005843191050201161180067013135240000108004980070800708007180048
2400248015760111100021103800543072025240010102400001024000050353365608002280081800714999935002724001020240000202400008004780070118002110901010800008000001024001920432400601012124004151190191050201161180044013135240000108007080049800708007180070
240024800705991010002110180054207202524001010240000102400005035015750800238008380082499933500282400102024000020240192800898005711800211090101080000800001102400192002400191002124003950604319005020116118004500135240000108007080074800748007380070
24002480069599110000990028003207919252400101024000010240000503500176080044800858007050015350028240010202400002024000080069800691180021109010108000080000010240021194324001920221240040606043190050201161180066013135240000108016180071800718004980070
24002480070599110000211008005530718252400101024000010240000503527112180045800848006950014350032240010202400002024000080070800701180021109010108000080000010240020204324005710021240041505944191050201161180066013135240000108007080049800708007380071
240024800696051000003600180054077202524001010240000102400005035215820800468008380047500173500502400102024000020240000800698006911800211090101080000800000102400192002400570012124004151194318105020116118006601305240000108007080070800718007080070
240024800706001000002110280033307202524001010240000102400005035215820800458008380047500143500272402082024000020240000800698004711800211090101080000800000102400202043240060101612400005159431910502011611800440000240000108007080070800708004880048
2400248007060010100068003800332071252400101024000010240000503501575080023800788006950015350049240010202400002024000080069800691180021109010108000080000010240019194324005910162240040006045191050201161180066013135240000108007180070800708007080048