Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple, post-index, 16B)

Test 1: uops

Code:

  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 7.006

Integer unit issues: 1.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 3.006

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch indir (93) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
660052967523911211001003930000461629149300172817009100030093000100030003000500015000357577230472927129496310700030003000400090002937229310116100101000100003000030041030015140301318993346947317253820392334038161244452857710001623612952141193000300010002945729513294502949029370
660042933323601213000002798800045722897000017188700010003009300010003000300050001500035704112298129224295693107000300030004000900029293295111161001561000100013000030041030005000020413386933468963135125720400328838141062542862610001585112796136973000300010002949829413294512947929451
660042950023609100000060010458028955000168407006100030063000100030003000500015011357712230462914229353310700030003000400090002928129249116100101000100003000630040030075119001321493946932314014920184323238111341342851610001613313136143363000300010002944929337294372950729692
660042941222806110000012000046792887400017016700910003006300010003000300050001500035700523099292392926931070003000300040009000292872927211610010100010000300093000013000500600132069432709631516402022132273812640422862110001619113027142443000300010002938029490297752944729373
660042940722707100000100001470628910330169707007100030093000100030003000500015000355998230112917629432310700030003000400090002931629147116100101000100003000930040430105149001327794886934310304020020327538081039412864110001621413155145253000300010002942929377293482944329375
66004294402270121300000120000463728861030170557000100030093000100030003000500015000357795230292922329373310700030003000400090002919129253116100101000100003000930070030005119001309094806957319734520234334938111338352865910001601212924140233000300010002921029473294112948129339
6600429387237011901100306000047652896003016950701210003000300010003000300050001500435699623030292202939531070003000300040009000292132917011610010100010000300003000173001514600133229319694331157402030032803815936372859610001615613238147363000300010002964229751298102985029353
660042960523701590000090000462528938000169767000100030003000100030003000500015000358321230292924429404310700030003000400090002926329209116100101000100003000030031430030000001329894776938318024220103332638091038392870010001600013172145643000300010002929029354293162933729261
660042942823601111000001800004697290500021689570061000300030001000300030005000150183558312298129198293863107000300030004000900929206292661161001010001000030006300141030043139001317094536965316554619552314238171543382825510001548112452134683000300010002868528695286522867428758
660042880622207160101060000479228490000163317013100030063000100030003000500015000357115231002851428762310700730003000400090092861228706116100101000100003000630000030010060001319494076917309064619581321338071949452830110001476812284136763000300010002868228774286642876128532

Test 2: throughput

Count: 8

Code:

  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  ld3 { v0.16b, v1.16b, v2.16b }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 67 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
480205801046201010011003000025380056270025560130801002400402400008010024000024000048049935472365680255080052800758007118035356010020024018924000020032000072000080075800711180201100991001008000080000010024001616440240057010572400005156451500510911723800680800001313240000240000801008010180263800728007280072
48020480050621100100000640006380055371172556013280101240036240000801642400002400004804953547236574984208005680071802652203325601002002400002400002003200007200008007180070118020110099100100800008000001002400151543024005600161240000511501520511111722800470800001313240000240000801008009080073800518007280072
4802048007162110111001063000028005637012556016380100240029240000801002402102400004804972799892504830608005680075800710035856010020024000024000020032000072056780071800741180201100991001008000080000010024001515430240056100212400415156431500510911721800720800001413240000240000801008084380064802718028680072
48020480073621110000000250000138006027716255601308010024002724000080100240000240000480498354723657262070800568007180050180354560526200240000240000200320000720000800758005011802011009910010080000800001100240015154302400560015724000051150150051112162180211080000013240000240000801008006380076800548007280079
4802048007162111000000064010148005637717255601418010024002924000080100240000240000480497349162448620561800528007180075003325601002002400002401892003200007200008007180075118020110099100100800008000011002400151643024005610161240040515643150051091172280068080000130240000240000801008007580051800808005180072
4802048007162011100000086010203800352701725560144801002400272400008010024000024000056046435205185688606180052800718007528035456010020024000024000020032000072056780071800751180201100991001008000080000110024001516430240015000622400005115431510515111721800680800001313240000240000801008007280072800788007980075
4802048007664510001000064000038005927788255601448010024004124000080100240000240000480496350623256748900800528007580070180353560100200240000240189200320000720000800718007511802011009910010080000800000100240016150024005500118240040515543150051111172180068080001130240000240000801008026880072800738007280072
480204800506201100000006400014180056377172556012680100240168240000801002400002400004804973531921564395308005680071800750035756010020024000024000020032000072000080071802641180201100991001008000080000110024001515430240056001622400415156431520510921621800720800001313240000240000801008009180084800818007580051
480204800716201101000016400003800562771725560129801002400272400008010024000024000048049635214803999042080052800718005006635456010020024000024000020032000072000080075800711180201100991001008000080000010024001518440240056001592400415156431510510921612800720800001313240000240000801008007580054800728007480075
48020480071620101010000630002380056077172556014380100240025240000801002400002400005604643520229559065418005280071800751803177560100200240000240000200320000720000800718007111802011009910010080000800001100240015154357240055101592400405155431510510921612800800800011313240000240000801008007580076800728007280076

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 1f | 22 | 24 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4800258007662011000006200030800353180142556004180010240039240000800102400002400004800473564406575285100800478006680067140350560010202400002400002032000072000080071800661180021109101080000800001102400161542024005201158240040515442151050199176980081180000106240000240000800108006180063800638004680057
4800248006262010000004500002800512171702556002180010240040240000800102400002400004800472027042554582400800478006280066002354560010202400002400002032000072000080062800561180021109101080000800000102400160350240000000332400005040000050199169108006408043460240000240000800108006880057800638006580046
480024802376210000000500000280051218179255600458001024003924000080010240000240000480047351170455374721080043800668006290349560010202400002400002032000072000080062800621180021109101080000800000102400001600240055010432400405033431500501991659800590800001010240000240000800108006380068800578006380063
4800248007162000000004500002800512170925560049800102400442400008001024000024000048004735337465641659108004380062800621303455600102024000024000020320000720000800678005611800211091010800008000011024000003502400380005824004061324300050199169980059080000106240000240000800108006380064800678006380063
48002480056620000000045000128004711518132556003880010240023240000800102400002400004800473506714563773500800488006880066003325600102024000024000020320000720000800508007111800211091010800008000011024001616420240054020582400405155421500501991710118006408000099240000240000800108006880068800688006780068
4800248006862110000006000202800302171892556001980010240028240000800102400002400004800483536266570076100800488006280056003455600102024000024000020320000720000800568005711800211091010800008000011024000003502400400000240040513343000501991699800590800001010240000240000800108006380063800468005780050
4800248006962110000004500003800472017925560050800102400392400008001024000024000048004735339995627593008005580103800711603385600102024000024000020320000720000800568006211800211091010800008000001024000000024003300002400330104300050191116988005908000096240000240000800108006880063800638006780063
48002480062620000000046000508004501817925560052800102400002400008001024000024000048004734961435574080108004380062800621103505600102024000024000020320000720000800628006611800211091010800008000001024000003502400540003924000001544300050199178880042080000109240000240000800108005780072800468005780067
4800248006262100000003800002800472003255600518001024000124000080010240000240000480048352047857640720080045800638006190348560010202400002400002032000072000080045800601180021109101080000800001102400000002400000001824000051334400050198165980059080000106240000240000800108006380063800688006380063
480024800646210100000500009280051318014255600458001024004524000080010240000240000480047346925455602610080051800668005400355560010202400002400002032000072000080067800671180021109101080000800000102400151600240015001672400390115015005019817998006408000099240000240000800108006880068800688007280068