Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4R (8H)

Test 1: uops

Code:

  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.008

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.012

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | df | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6500528785215550130002105140280880011607250124012100040001000500047520822938028363284543105000100040001000400028225282811161001100010000100002100006100022201357710244706032012741930932153811145356227944152761260513553100040002834628175285442843328411
6500428423213450060004005138281350111632750084012100040001000500047533322940028314282233105000100040001000400028269283601161001100010001100002100103100122301390410020716533992571923832563814145250227895144011276713055100040002839627991286292841528455
650042835721422005000410509327947000160925012400810004000100050004753402294102824828426310500010004000100040002823228287116100110001000010000210000310002220133659738711234222541929632193802146756227956147381277013999100040002830928424286522839928423
650042863321545005000210511228077000161285008401210004000100050004753812294202835928530310500010004000100040002828828249116100110001000010000210010310002121133831005171023191158194353378380795853227985143141284113317100040002828928405284022819228289
650042862921244015000310482128254100160075008401210004000100050014751832299102823128239310500010004000100040002830428353116100110001000010000210000410001130138319604721333866611939732643810185852228011145571316013889100040002854528453283632827528590
6500428322213550050003105015281091001595150084012100040001000501047558162289902824428291310500010004000100040002825828174116100110001000010000210000310002120133049912708933186561926432493809115957227998146221275613400100040002845628425282902848628539
6500428490213560070002105068280241001614050084012100040001000500047534132294802835728667310500010004000100040002829628260116100110001000010000210000310002220139059913706932687541926732913812176063227882145991254013182100040002820028325283252849328268
6500428380213530050002005062282591001611650084012100040001000500147563622976028153283083105000100040001000400028236281971161001100010000100002100103100020201394410206717133344551924933533799165853227864150361237413427100040002844928248283642826228366
6500428599214540040003105026279661001618350084012100040001000500047559522966028225284833105000100040001000400028309282871161001100010000100002100119410002030135819973711633617581928532903808105960227896147031254713765100040002841728270285852846528582
650042833121274004000210495128049100159195008401210004000100050024756252295402821528171310500010004000100040002805428609116100110001000010000210000610002020136399987709433002601926332733806195661227955147911276313139100040002834428437283882844028474

Test 2: throughput

Count: 8

Code:

  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4002058006659911111060100804321062254001521003200248000010032000080000500400054108803040800478004780047003484001002008000032000020080000320000800718007111802011009910010080000800000100800078288003001178002361310610051090117118004410132800003200001008054780048800678006780048
400204800665991112003701008003210602540010010032007680000100320000800005004000294403096180027800608004100342400100200800003200002008000032000080060800411180201100991001008000080000010080000008001300014800006114180000510901171180038010100800003200001008004780061800618006180061
4002048004159900000000100800420660254001641003200648000010032000080000500400009864002018003880057800570034240010020080000320000200800003200008005780057118020110099100100800008000001008000001880018010080000000220000510901171180057110100800003200001008005780061800588005880061
4002048005760000000024000080045166025400100100320076800001003200008000050040002986400201800418005780041003424001002008000032000020080000320000800578005711802011009910010080000800000100800000188001400016800140113180000510901171180054010100800003200001008005880045800588005880061
4002048004159900000000000800451060254001001003200648000010032000080000500400000960002018002280041800600034240010020080000320000200800003200008006080057118020110099100100800008000001008000001880018010080018001418000051090117118005410100800003200001008005880045800588005880058
400204800415990000000000080045066025400176100320076800001003200008000050040002138400001800228009280057003394001002008000032000020080000320000800578005711802011009910010080000800000100800000188001300008001460130000051090117118003800100800003200001008005880058800588034980042
400204800576000000001900008002616602540017610032006480000100320000800005004000008640020080022800608006000339400100200800003200002008000032000080060800571180201100991001008000080000010080000018800180001780018011822000051090117118005400100800003200001008006180045800618004280042
400204800605990000002400008002616602540017610032000080000100320000800005004000008640020180038800578005700342400100200800003200002008000032000080060800411180201100991001008000080000010080000018800130000800186114220000510901171180057010100800003200001008006180058800588004280042
40020480060599000000000008002606602540017610032007680000100320000800005004000249600020180041800448005700342400100200800003200002008000032000080060800571180201100991001008000080000010080000008001400008001861000000510901171180041113130800003200001008006780048800678006780067
4002048004760011110137010080051006025400148100320072800001003200008000050040005454400281800418004180041003234001002008000032000020080000320000800608005711802011009910010080000800000100800000188000000017800186014220000510901171180054010100800003200001008005880058800588005880058

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d2 | icache miss (d3) | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400025800626001100103110080047100254000341032005280000103200008000050400038102400168004380062800470344400010208000032000020800003200008004780047118002110910108000080000010800078238000801129800196025237050190051705580059009280000320000108004880063800638006380063
4000248006260011000071008004700625400062103200248000010320000800005040000310240016800438006280047034440001020800003200002080000320000800628006211800211091010800008000011080006608002700025800190126247050190041705580059090280000320000108004880063800488004880063
4000248006259911000032000800321062540006210320024800001032000080000504000031024001680028800478006203444000102080000320000208000032000080047800621180021109101080000800001108000870800280027800006025236050190041746580044099080000320000108006380048800488004880063
40002480062599110000310008003206025400062103200528000010320000800005040000354400248004380062800620329400010208000032000020800003200008004780047118002110910108000080000110800086248002600026800206125246150191041705480059090280000320000108006380063800488006380063
40002480062599100000321008004706625400062103200528000010320000800005040003810240016800288006280062034440001020800003200002080000320000800628006211800211091010800008000011080007724800250112580000602607150190051705680059199280000320000108006380048800638004880048
4000248006259910010031001800471662540003410320052800001032000080000504000385440028800438006280047034440001020800003200002080000320000800628006211800211091010800008000011080008824800260002680000607246150190051708580059009080000320000108006380048800488006380063
4000248004760010000032000800471662540006210320024800001032000080000504000381024001680043800628006203294002752080000320000208000032000080062800621180021109101080000800001108000672480026000268002061706150190041706680059009280000320000108004880063800638004880048
400024800476001001003300080047166254000621032005280000103200008000050400004102400168002880062800620329400010208000032000020800003200008006280062118002110910108000080000110800076248002600026800206125236250190051705580044199080000320000108006380063800638006380063
400024800625991101003200080047106254000621032005280000103200008000050400033544002480043800628004703444000102080000320000208000032000080047800471180021109101080000800000108000770800270017800206126247050190051705680044000280000320000108004880063800638004880063
4000248006259911000132100800470062540006210320052800001032000080000504000035440024800438006280062034440001020800003200002080000320000800478006211800211091010800008000011080008824800080012680018007237150190051705580044109080000320000108004880063800638004880063