Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 16B)

Test 1: uops

Code:

  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
63005285262142928011815073283032302326630003000300015000716159281562835531030003000300028470283051161001100010000300003004003004300613192968871623289119119665331138131464652794114617122091373130002867428429285342842828501
630042829121328290110048792823920323226300030003000150008161512802528399310300030003000282482817911610011000100003000630041430000079133141019871703208146219718324738191167702797714650121861367130002846728469285982862128325
63004283792143228011905134282012232325530003000300015000616199280552848431030003000300028377282041161001100010000300093004033004507613785941171933338156919657325838151264592792915110122671375330002849028626284492830628562
63004285292122426011005045281720032337330003000300015000316158280452839131030003000300028420284711161001100010000300063007043000003913717101047159345686519825332038121669642789514478125621361030002841728515284012823928375
630042838921229250006052002822123023243300030003000150004161472813728485310300030003000284562823311610011000100003000930040730074009136121008070333372136419594327938122262592814614518121771420830002831828601284612845328315
630042834021229240000051442809323223442300030003000150006161702811028680310300030003000283712829111610011000100003000930000730034016137301015871783400156619803330138121253612801114943124061370330002835128301284812822428351
6300428275212342900010051362821800323290300030003000150024161582798028262310300030003000283332827611610011000100003000930040330015149134671004670943371146919863331938091357592803114451121271342530002813828372283552840428360
6300428210213292701013150662807822323206300030003000150005161512802328364310300030003000284052835111610011000100003000930040030074036139421016270963267156319684322238091454552787714434122171369330002833428516283132836528383
63004283832133231000110501628138223231923000300030001500041614028198283823103000300030002817428252116100110001000030009300700300750761369310258720533958531962333153813866652789614721125261378530002843828472285322839028331
6300428556213332801013050062809823323102300030003000150004161592802628357310300030003000283922825311610011000100003000930040030043176135581029871713394136520090330238121357652798114338120031373230002821328479283192839328303

Test 2: throughput

Count: 8

Code:

  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402058007059900000004400008004601019252401001002400001002400175003503030180015800658006549993650017240116200240024200240024800408006111802011009910010080000800000100240000043024004000039240038604100011151160160080058014072400001008004180066800668006680041
24020480062600000000044000180050211192524010010024000010024001650035041880800408006480064499686499922401152002400242002400248006580061118020110099100100800008000001002400000430240000010542400396159440011151160160080063001442400001008006380062803728027880041
24020480216602110000078000080050001221252401001002400001002400145003518162180044800658006549969650013240116200240024200240024800658006111802011009910010080000800000100240000000240000000024003950404400111511601600800580131302400001008007080070800668007580071
240204800685990010000210001800462100252401001002400001002400005003551453180015800658004049967350023240100200240000200240000800698006131802011009910010080000800000100240000003024000000210332400390104401000511011611800620141472400001008007080062800668006980066
240204800406020001000720001800542012125240100100240000100240000500351001418003680044800654996735002724010020024000020024000080069800651180201100991001008000080000010024000000024003800042240039014144000005110116118006600002400001008007680065800628006680041
2402048004059900010007900008004901016252401001002400001002400005003506992180015800658006549963350019240100200240000200240000800648006111802011009910010080000800000100240000043024000000062400406100000005110116118006200072400001008006680066800668006680066
24020480065599000000000001800492112025240100100240000100240000500354239018001580040800404996334999824010020024000020024000080040800401180201100991001008000080000010024000004302400400000240000013900000051101161180062001472400001008004180066800668006580065
2402048006160210001001260001800250112025240100100240000100240000500351984018003980065800654998435001924010020024000020024000080040800611180201100991001008000080000010024002120430240039000382400000159000000511011611800620131372400001008006580041800668006680066
240204800405990000000440001800502010252401001002400001002400005003505448180040800408006149988349998240100200240000200240000800618006111802011009910010080000800000100240000043024000000002400390040000000511011611800370141472400001008006280066800418004180066
24020480066599000000045100180025012121925240100100240000100240000500351854308004180040800404996335002324010020024000020024000080040800401180201100991001008000080000110024000000024000000040240039610440000051101161180062001072400001008006680066800658004180041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2400258007160010011100670100800543902025240010102400001024000050352711208004580070800694999335004924001020240000202400008006980069118002110901010800008000011024001919432400601751622400395160431910502020160613800451305240000108007080049800488007180048
240024803486001011000066000080032079192524001010240000102400005035015750800238007080446500373500492400102024000020240000800708007011800211090101080000800000102400191943240059250260240000011901900502061601368006613130240000108007080049800708007180196
240024800695991011000021000180036077192524001010240000102400005035215820800458004880048500153500502400102024000020240000800698006911800211090101080000800000102400192043240019111212400400059018105020131601388007013135240000108007080070800488004880302
240024800705991010000021000080054307025240010102400001024000050350946518002380047800705001435002724001020240000202400008004780069118002110901010800008000001024002019432400601516324004100194319005020816074800660130240000108007180048800708007080662
24002480048599101100201000002800542072025240010102400001024000050350946518004580072800705001535005024001020240000202400008006980069118002110911010800008000001024001919432400601581622400005119019005020616145800461305240000108007480071800708007080186
240024800695991011000067010080055207202524001010240000102400005035015751800228006980069500143500492402072024000020240000802028007511800211090101080000800000102400212043240060101287240040005901900502010160658006713135240000108004880070800708007080209
2400248004760011010000670002800323771925240010102400001024000050350017618004580070800705001535002824001020240000202400008006980047118002110901010800008000001024001920432400581412124004151190190050206160658006613135240000108007080070800718007080212
2400248007060010100000670002800573701925240010102400001024000050351150018002280070800704999335004924001020240000202400008007380047118002110901010800008000001024002019432400600216124004151194319105020616056800661305240000108004880048800488004980206
2400248007059910110000670001800323094125240010102400001024000050352354518004480084800485001535002824001020240000202400008004880069118002110901010800008000001024002021024005904712324004151194319105020716077800671300240000108007180071800488004880664
240024800716001101000022200038005400719252400101024000010240000503521582080022800698007050015350050240010202400002024000080069800471180021109010108000080000010240019194324006012812124004051604319005020616145800441305240000108007180070800718004880166