Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (post-index, 8B)

Test 1: uops

Code:

  ld1r { v0.8b }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.002

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
620052946122923031001003000045652881600017451300210001002100010001000100050005000119338225952908529403310300010001000200010002911129232116100110001000110000310000000100100201303593666934312256420676323038151458492853110001614813281143191000100010002924029265294312933329475
62004293232272102900100300004774288780101735430041000100310001000100010005000500011928102260529131292963103000100010002000100029198291751161001100010000100002100000001000103013278933869453096124920672326238201655512846110001617913149145461000100010002928729306294002924429342
6200429286227180250110020000470828971000173133003100010021000100010001000500050001192911226702908129249310300010001000200010002923729205116100110001000010000010000000100020201301692176929307510542063232173815955552850610001639613465144511000100010002927729363293382937929312
6200429267228190270010040000459428847001173723000100010021000100010001000500050001190982264029145293563103000100010002000100029135291351161001100010000100000100000001000002013083923669503183105420589317038151754502847910001620313136143821000100010002932629283294442933529329
620042936122717027000000010046822884800017413300010001003100010001000100050005000119436226742919729376310300010001000200010002932829077116100110001000010000210000000100000201293093926887315395720629326238151755502844410001614813100143191000100010002937129333294382932729410
6200429274227170261010000100464428893000173953004100010021000100010001000500050001190172267929162293523103000100010002000100029260291221161001100010001100002100100001000213012963927068293161106320763316338221457582854610001587113130145211000100010002941629279293082929929310
6200429333227180270100000000461528772010173653002100010021000100010001000500050001193872262029184294423103000100010002000100029211292591161001100010000100002100100001000003013062930569123159115420559324038181157512851410001622013081144531000100010002920129306293492934529348
6200429356228190290110030000454828846010173023000100010021000100010001000500050001194662263629187292523103000100010002000100029116291981161001100010000100000100101001000013013265957268693101114820723322838141458582847810001624513021144411000100010002940629282294802939229219
620042917422621029011002000045842879300117223300010001002100010001000100050005000119041222661291722934631030001000100020001000292772924511610011000100001000001000010010002130131409277685231319532073531693813952502851510001632813309144081000100010002939129245292562929729374
62004292532281602400000150000475728831010173053003100010021000100010001000500050001190511226242913529261310300010001000200010002911429175116100110001000010000010010000100021301318193426954311812552081331733815559552855010001603913017142231000100010002935229318293572918329292

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.8b }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
602051400501086100100002000001400381395772580104501002000410000401002000010000124562953039271071217601400830140035140035132058313243770100302001000020000604502024220000140140140053215020110099100401001000010000110010004211000500631510004111180325721322113994250021969100001000050100140238140238140312140324140322
602041425381108120010323971760000140307139833258010450100200041000040100200001000012453035302806107092760140032014005614005613196931324227010030200100002000060200200002000014005614004211502011009910040100100001000001001000220100010041000001110032351931114241650398999100001000050100143023143668143567143040143602
602041436831105010000536110000014008613955625801025010020002100004010020000100001245637530392710710210014001101400351400471319493132431701003020010049200006020020000200811400501400501150201100991004010010000100000100100000010000000100000010003210193111397115000091110100001000050100140042140057140057140057140057
60204140056108511011000200001140038139572258010250100200021000040100200001000012456555304487107092760140011014003514003513196431324167010030200100002000060200200002000014005014004711502011009910040100100001000001001000001100000001000010000032101931113971150000999100001000050100140037140051140036140036140051
602041400351086000011001200000140092139571258010050100200001000040100200001000012456075303927107102101140026014005014003513196431324167010030200100002000060200200002000014003514005011502011009910040100100001000001001000001100002001000000100032101931013971450000966100001000050100140051140049140036140051140051
60204140050108500000000100000140032139571258010250100200001000040100200001000012456375304487107092761140011014005014005013194931324317010030200100002000060200200002000014010614004911502011009910040100100001000001001000110100010141000011110032101931113972050000969100001000050100140057140057140058140054140057
60204140041108610010000100000140036139556258010250100200001000040100200001000012456165304604107092761140011014005014005013196531324167010030200100002000060200200002000014006114005011502011009910040100100001000001001000001100000001000010100032101931113969950000961100001000050100140051140051140036140051140051
60204140094108600000000000000140041139577258010450100200041000040100200001000012453125304715107102101140032014004114005613197031324387010030200100002000060200200002000014008514005911502011009910040100100001000001001000001100001031000000000032101931113971450000000100001000050100140051140036140053140051140051
6020414005010850000110012000001400351395682580102501002000210000401002000010000124563753044871070927601400260140050140050131964313243170100302001000020000602002000020000140124140048115020110099100401001000010000010010002701000200969310004101020377623221113987250032960100001000050100140311140319140235140320140333
6020414033110870101002313826400001400381395712580102501002000210000401002000010000124568253045651071044401400260140050140051131968313243570100302001000020000602002000020000140087140049115020110099100401001000010000010010000001000000610000001100321039311139723500001060100001000050100140057140057140058140042140047

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60025140051112500000013601014003900139694258001250010200051000240010200001000012457535307542107176551400301400351400541320021313247070010300201000020000600202000020000140035140035115002110910400101000010000101000001100000001000010103140028800331397255000013013100001000050010140055140055140056140057140056
60024140051112500000008810140036001396392580012500102000210000402922000010000124582453076941071742114001214005414005413200231324727001030020100002000060020200002000014004314005911500211091040010100001000010100000210001003166100011110316402126002214018950000131013100001000050010140071140055140036140036140039
6002414005411250000004900014012600139654768008250028200081000343685222101003912521915313738107205811401761405021402321320843513366876755303831008120326607462016020325140241140224415002110910400101000010000101000701100050012665100061012318702117004313995050020101313100001000050010140227140333140239140316140330
600241401401127000010188101400410013965825800265001020002100004001020079100001248763530765610716479140030140054140153132002151324477001030141100002008060020200002008214005414014111500211091040010100001000010100000110008000100011010316402104003213972550000131313100001000050010140055140036140054140055140056
60024140035112500000010101400200013965825800125001020002100004001020000100001245833530765610717421140027140035140055132005241328227104730629101622040961254204002032214074014065551500211091040010100001000010100040110000001638510006101031401288003213980350000131413100001000050010140056140055140145140052140059
6002414005411260100011881014003900139639258001250010200031000040010200001000012458245309626107174211400301402291406661320683132462700103002010000200006002020000200001400551400541150021109104001010000100001010000011000002310000101031400288002213973050000101010100001000050010140061140062140146140062140060
6002414005911450010011488800140040001396612580028500102000210001400102000010038124584253070001071780814003514005514014313204613132509700103014010000200006002020080200001400551401371150021109104001010000100001010000011000001910000101031400217002213972550000101313100001000050010140055140052140139140142140144
60024140054112400000016000140021001396572580012500102000510000401512000010000124575353097251072074714003314014114014013205731324647001030020100002000060266200822000014005414005411500211091040010100001000010100010110000003158100001010314002118003313972550000101010100001000050010140146140147140058140056140052
6002414015111251000001300014010900139658105780553500212000210000401512000010040125040553077701071742114003014010014014313208731325137027330140100002008160260200002000014014214015121500211091040010100001000010100000110001013233100001010316402106003313980350000131413100001000050010140055140055140140140056140143
600241401461125000000168800140040001396585180026500102000210000401512008010000124810853076561071765514009714005214026313200214132502700103014110041200006002020000200001402381401471150021109104001010000100001010000011000101610001101031400288002313970650000131315100001000050010140055140055140036140036140059

Test 3: throughput

Count: 8

Code:

  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  ld1r { v0.8b }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | 79 | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst simd alu (9a) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160205800416210001100000001800261605252401008010080025800008010080000800004359018375882249187641800220800418004159924359999240100020080000800002001600008000080041800411180201100991001008000008000001008000000800170000800156115210511021602180038080000008000080000801008004280042800428004280042
160204800416200000000001001800260060252401218010080021800008010080000800004359018375882449185181800220800418004159924359999240100020080000800002001600008000080041800411180201100991001008000008000001008000001980014000178001761021051101160218003818000013108000080000801008004280042800428004280042
160205800416200001100210100180026066025240121801008002180000801008000080000435899837588244918767080022080041800415992435999924010002008000080000200160000800008004180041118020110099100100800000800000100800000198000000017800006014005110216011800381800000108000080000801008004280042800428004280042
160204800416200000000001001800261605252401228010080022800008010080000800004358994375882449187751800220800418004159924359999240100020080000800002001600008000080041800411180201100991001008000008000001008000001980017000168001761021051101160328003808000010108000080000801008004280042800428004280042
16020480041621000000023000008002610672524012380100800228000080100800008000043590183758822491879518002208004180041599243599992401000200800008000020016000080000800418004111802011009910010080000080000010080000019800000001780000611521051102160128003808000010108000080000801008004280042800428004280042
16020480041621000000000000080026106725240122801008002380000801008000080000435897837588224918768080022080041800415992435999924010002008000080000200160000800008004180041118020110099100100800000800000100800000198001500018800176115190511211602180038180000008000080000801008004280042800428004280042
16020480041620000000023000018002600662524012380100800008000080100800008000043590183758824491851818002208004180041599243599992401000200800008000020016000080000800418004111802011009910010080000080000110080000019800150001780016601521051101160128003818000010108000080000801008004280042800428004280042
16020480041620000000000000180026160725240123801008002280000801008000080000435900237588234918755180022080041800415992435999924010002008000080000200160000800008004180041118020110099100100800000800000100800000080000000080015611421051101160118003808000013138000080000801008004280042800428004280042
160204800416210001100230100080026106525240100801008002180000801008000080000435899037588244918767180022080041800415992435999924010002008000080000200160000800008004180041118020110099100100800000800000100800000080017000168000001142105110116011800380800000108000080000801008004280042800428004280042
160204800416210001000230000180026160625240122801008002380000801008000080000435898637588224918772180022080041800415992435999924010002008000080000200160000800008004180041118020110099100100800000800001100800000080017000080000601419051102160118003818000013108000080000801008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 24 | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002580041643001011102200080026166725240028800108001980000800108000080000435842137629394918564000800228004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000014800120001480013611217005020001161180038180000998000080000800108004280042800428004280042
160024801756440010000030000800261661912524002580010800008000080010800008000043584333758824491849500080022800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000002380000000080013619170050200011611801441800002398000080000800108004280042800428004280042
160024800416430010000015600080026166025240025800108001980000800108000080000435842137588244924104000800228004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000014800090001880012601317005020001241180038180000998000080000800108017880042800428004280042
160024800416420000000018000800261667622400288001080015800008001080000800004358417375882449185360008002280041800416000322600212400102080144800002016000080144800418004111800211091010800008000001080000014800000001380087601317005040001161180038180000968000080000800108004280042800428004280042
16002480041643001000101900080026106625240010800108010980000800108000080135435841737588244924114000800228004180041599463600212400102080000800002016000080000800418017511800211091010800008000001080000019800130001380013611317005020003341180038180000068000080000800108004280174800428004280042
16002480041643001010113900080026166482524002880105800008000080010801448000043584213758824492385400080022800418017459946360021240010208014480000201600008014480041800411180021109101080000800000108008901480012000138009060120005020001163180038080000008000080000800108004280042800428017380042
160024800416420000100002640080294166425240304801038000080090801508000080000436177537588244916910000800228004180175600033600212400102082436822962016057680144803078017221800211091010800008000001080087017801012071141801026100005040001253180143081587998000080000800108031280177801748030880311
16002480174646001000122838800801611069460240575801048019680091801508000080000436527237630934923970010800228004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000019800120001380013011317005020001161180038180000968000080000800108004280042800428004280042
16002480041642001000101900080160166562240028800108001980000801508000080000435840137629464918536000800228004180041599463600212404352080144801402016000080000800418004111800211091010800008000001080000014800130103588001201017015020001161180038080000968000080000800108004280042800428004280042
160024800416420000000030000800261661225240029800108001980000800108000080000435839737588244918542000800228004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000014800130102468001061917205020001161180144180000968000080000800108004280080800428004280042