Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (post-index, 4S)

Test 1: uops

Code:

  ld1r { v0.4s }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.002

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6200529387236020109100001710100460328804001736330031000100310001000100010005000500011949100022599292892949931030001000100020001000292742935311610011000100001002231003011100031410013392927968983128542207343397380812424328471010001621013058143161000100010002932929396293462931129370
62004293022361151014100004010045532884501173913001100010051000100010001000500050001196660022646291972918231030001000100020001000291742919111610011000100001000231000001100131402013074945269073078545207343297381210454928537010001638613219144331000100010002937229441294652935529528
620042950723701400110000034000047482884500173483000100010031000100010001000500050001193470022657292292933431030001000100020001000293172923111610011000100001001121001011100201411013225957668863182947208013275380713434328617010001611912919142271000100010002943729298292822936829407
62004293882351101011000001600100465928863001738530041000100410001000100010005000500011949100022645292542942331030001000100020001000293592924011610011000100001000131003022100020212013208948469783117640206543342380813454528638010001630013259143351000100010002939529279293242934029252
62004293832370111016000001600004742288011017335300410001005100010001000100050005002119388002264929063292533103000100010002000100029191293451161001100010000100133100202010000231201321794256979316664020634338038169474428628010001640612992145591000100010002948629596293752947429398
62004293042361810111110023000046902891101174493001100010031000100010001000500050001194690022706291882928031030001000100020001000292712924611610011000100001000301000001100010302013163966569393163641209533218381411404628661010001608513445144221000100010002940229389293792944429373
62004294482370110017000002900004587287910017370300010001001100010001000100050005000119587002266829170295293103000100010002000100029305291731161001100010000100202100201110002101001314695326888309884320847331138077444428641010001606113080142331000100010002938329305294212937129370
620042935123611310141000020000046792899800174033003100010041000100010001000500050001193640022629292742930031030001000100020001000292222931711610011000100001000321003011100120412013036928769813119540208663276381813424428620010001616413335146081000100010002935729293293872942429379
6200429470236014101210000547000047042897010173533000100010031000100010001000500050001195031022626292302953131030001000100020001000293182920611610011000100011003231002010100131010013276926868853114744207453307381210414128722010001615613017144051000100010002942329509293562949629367
620042937723511910151000046010047272894610173993003100010031000100010001000500050001196120022662293202944931030001000100020001001293222929211610011000100001000201002001100100211013095925469233101643206863332381211514328634010001604213239143151000100010002927729495293852942129402

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.4s }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60205140063112410210000200000140027139574518010450108200041000040242200001000012456915304715107106780140033140057140145131970313251870100302001000020081602002000020000140056140154115020110099100401001000010000010010001201000500041000111110032361931313981050000069100001000050100140058140151140057140153140158
60204140056112610100110200000140041139725528010450100200081000040243200001003912456645304715107122040140139140130140056132019131324377010030322100002008360200200002008314005314014911502011009910040100100001000001001001541100082032516110008111130342351744614048550120660100001000050100140057140111140057140148140152
6020414005611261211000115500000140041139611258010250110200041000040242200001000012394635304715107144871145511145403145210133871700135694870123806912541255167562625178252831458401458766415020110099100401001000010000010010005111000101041000011111032101931113972150000969100001000050100140042140057140042140057140059
60204140056108610010100200000140026139577258010450100200041000040100200001000012456915304753107107560140115140053140041131967313242270100302001000020000602002000020000140053140056115020110099100401001000010000010010001211000200331791000011113032101902214070050000069100001000050100140045140145140054140054140137
60204140056108710010010228800001400411396715180104501102000410001401002000010039124569153055431071067801400821401401400411319703132497703643020010040200006044820000200811400561400562150201100991004010010000100000100100052110001000110000111130321019311139814500101069100001000050100140125140057140144140054140057
602041400561086101000102880000140131139633258011950100200071000040243200001000012462165304715107147170140039140053140041131971313243770100302001000020000602002000020000140056140056115020110099100401001000010000010010002111000200011000011110032101931113972050000990100001000050100140057140057140057140057140057
60204140056108510100100700000140041139577258010450100200041000040100200001000012456915304715107106780140102140041140041131968313243770100302001000020000602002000020000140041140057115020110099100401001000010000010010001711000110091000011111032101931113972350000969100001000050100140057140060140057140057140059
602041400571134101101011400000140042139579768010450100200041000040242200001003912948855304829107107550140032140145148182132063313244970100302001004020000602002000020080140056140152115020110099100401001000010000010010002111000201041000111110032101931113972050000960100001000050100140057140057140150140042140953
6020414005611301100010019000021400261397822580118501002000510000401002008010000124761553047151072943201400181400581401441319703132473701003020010000200006044820000200801400571400531150201100991004010010000100000100100054110002011410000111100321011012113972050000999100001000050100140060140153140153140057140059
602041400561120100101022317601001400411396275080104501002001010000402432000010039124563753056771071067801400271400501400491319991313243381254302001000020000602002008220000140238140047115020110099100401001000010000010010001011000001031000000000032104931213971450010261113100001000050100140042140045140058140061140143

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | df | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6002514011610860000000000132001140035139654258001250010200021000040010200001000012457715307732107171090140026140050140050131998031324647001030020100002000060020200002000014005014004711500211091040010100001000001010000011000000429100001010003146648833613972150000969100001000050010140036140051140051140051140051
6002414020310850000000000253001140037139651258001050010200021000040010200001000012458615307504107171860140026140050140050131998031324587026830020100002000060020200002000014005014004711500211091040010100001000001010005011000000285100000010003146628833613970650000969100001000050010140051140051140051140051140051
600241401081086000000000000011400371396542580012500202000210000400102000010000124576153075041071710901400261401411400501319980313245570010300201000020000600202000020000140050140050115002110910400101000010000010100000110000000100010010003146639533613972150000969100001000050010140051140051140051140051140051
600241400501086000000000113001140038139615258001450010200041000040010200001000012458785309202107175770140032140056140056132070031324647001030020100002000060020200002000014014614005311500211091040010100001000011010001111001010410000111100314663106236139727500001069100001000050010140057140057140057140057140042
6002414005610851000000100145001140041139645258001450010200041000040010200781000012458155307110107174540140032140041140056132004031324677001030020100002000060020200002008314005714005311500211091040010100001000011010001111000211325610000111100324263101236139823500009610100001000050010140057140057140057140057140057
600241401481085100000000014600114004113966025800145001020002100004015220000100001245842530773210717735014003214005714005613199003132464700103002010000200006002020000200811400561400561150021109104001010000100000101000211100011041000011110031466330633613972750000990100001000050010140146140054140057140054140057
6002414005910851011000000140111400411396602580014500102000410000400102000010039124584253077321071745401400341400561400561320040313246470010300201000020000600202000020000140053140141115002110910400101000010000010100012010001001100001111003146638833613972450000996100001000050010140057140057140057140057140057
6002414005610861010000000288011400411396622580014500102000210000400102007810045124582153092701071757701400291401451400531320070313246470010300201000020000600202008220000140148140053115002110910400101000010000110100022110001014100001101103146638833613972750000000100001000050010140057140057140057140057140057
600241400561086100001000020011400431396632580014500102000410001400102000010000124584253077321071757701401151400411400561320040313278770010300201000020000605022000020000140044140053115002110910400101000010000110100012110002001100011111103146638733613972450000969100001000050010140054140057140057140057140162
6002414005610861000000000700114004113967910080042502902007110001402922023710039125257153110471072751901401011401441403471320320312133712700103014410081202476026420240201641402301402343150021109104001010000100001101000311100040095511000401110032496311943613993550030969100001000050010140335140331140246140324140320

Test 3: throughput

Count: 8

Code:

  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  ld1r { v0.4s }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602058004162010000009000080026106116252401298010080029800008010080000800004358970375882349188878002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080000028800230002480010612423000051102163380038180000008000080000801008004280042800428004280159
160204800416200000100470000800266611825240130801008003080000801008000080000435899837588234918669800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000001008000002880024010108002461230000051103163380038080000668000080000801008004280042800428004280042
1602048004162000000003000018002666115252401308010080030800008010080000800004358982375882449188938002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080000028800240002380024612323000051103163280038080000768000080000801008004280042800428004280042
1602048004162100010003000008002666117252401308010080029800008010080000800004359002375882349188878002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080000028800240002480010612423010051103165580038080000668000080000801008017680042800428004280042
160204800416200000000560000800266011725240130801008002980000801008000080000436228937588244919073800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000001008000002881533210248001001240000051103163380038180000668000080000801008004280042800428004280042
160204800416210011000290000800266611725240129801008003080000801008014380000435898237588244918603800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000001008000000800240102480023012423000051103163380038080000608000080000801008004280042800428004280042
160204800416200000000420000800266011225240130801008003080000801008000080000435897037588244918889800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000001008000002880023000278002301623000051102165380038080000668000080000801008004280042800428004280042
160204800416200001000100010800266602252401298010080010800008010080000800004358970375882449188878002280041800415992435999924010020080000800002001602888000080041800411180201100991001008000080000010080000028800060002480024612423000051103163380038080000708000080000801008004280042800428004280042
160204804416220000000330000800261060225240133801008003380000801088000880000435905437588234919281800228004180041599307599922401162008000880008200160016800088004180041118020110099100100800008000001008000000800260002780027612626011151183162480038080000008000080000801008004280042800428004280042
160204800416210001100450000800261060225240133801008003280000801048000480000435903037588244918658800228004180041599307599922401082008000880008200160016800088004180041118020110099100100800008000001008000003180010010278002761262601115118216128015018000010108000080000801008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160025800416430000000260000800261665252400318001080021800008001080000800004358417375882249186321801290800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000000800150017800150116190005020416448028718000013108000080000800108004280042800428004280042
16002480041642000110054000080026166142524003080010800008000080010800008000043584093758823491863718002208015880041599463600212400102080000800002016000080000800418004121800211091010800008000001080000019800151017800156116190005020316448003818000010108000080000800108004280042800428004280042
1600248004164300001002000008015916610252400318001080000800008001080000800004358433375882449186321800220800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000000800170017800156115200005020416448003818000012128000080000800108004280042800428017380042
1600248004164300001002900008002616662524003380010800218000080154800008000043584173758824491863218002208004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000019800150018800146115210005020416348003808000013108000080000800108004280042800428004280042
16002480041643000010030100800261666252400318001080020800008001080000800004358417375882449183831800220800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000001980015001680017011419000502041644800380800001008000080000800108004280042800428004280042
1600248004164200000002001008002616002524003380010800238000080010800008000043584053758823491862318002208004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000021800150016800140014190005020416448003808000010108000080000800108004280042800428004280042
160024800416430000100210000800261667252400318001080000800008001080000800004358401375882449186201800220800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000001980000001580015601400005020416338003818000010108000080000800108004280042800428004280042
16002480041642000010023000080026166625240030800108002180000800108000080000435840937588244918644180022080041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800000080017001480000601721000502031633800380800000108000080000800108004280042800428004280042
1600248004164300001002101008002610672524028380010800008000080010800008000043584213758823492407818002208004180041600003600212404332080000800002016028880000800418004121800211091010800008000001080000019801040030800156114192005020425338003808000011108000080000800108004280173800428004280175
160024800416440000100153000080026066560240033800108002380091800108000080000436177937588234918637180022080041800416000347600212400102080000800002016000080000800418017711800211091010800008000011080000021801070020800166115210005020416448003808000010108000080000800108004280042801788004280042