Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (post-index, 2D)

Test 1: uops

Code:

  ld1r { v0.2d }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.004

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.004

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6200529637238113023100003004621289730001745130031000100310001000100010005000500011957500226182921629467327300010001000200210002926429318116100110001000100120100301110002101101321494916940318114420766332738241346472866110001603413135143681000100010002914229557294492939229353
6200429519236010001100003004668290250001722730041000100310001000100010005000500011903500226182921129292310300010001000200010002928629420216100110001000100410100102110000131101319493026974317205220733338938191349462872710001629813239146081000100010002943929432293572942529423
6200429532237011011011003004776289231001743530011000100310001000100010005000504411933410226162930329376310300010001000200010002936729307116100110001000100212100403210013141101322593466968314314720795331638201545492869010001616713147142911000100010002942029448293622950829525
6200429361237012002100003004634290060001742230071000100510001000100010005000500011919600226552921729506327300010001000200010002938029267116100110001000100210100200210002221101323096266976315415120799343738201344472869810001598513085144401000100010002950529453293422939829524
6200429335235011101100011004667289840111733530011000100410001000100010005000500011942610226212931829371310300010011000200010002937529339116100110001000100100100311110002201001326693616977320314820771324538131248482874410001625813099145331000100010002940629443293562950329481
620042929523601100100001100474128962011174793003100010011000100010001000500050001194600022673291522931931030001001100020001000293512936011610011000100010017210020111000312110134169610696831791452070632203822448472870610001626713407143421000100010002949529433294942934229535
6200429527236011010100101004756288280011780730031000100310001000100010005000500011908000226312916929402121030001000100020001000292342933811610011000100010022410020145110002141301312193186984312204720701332838171953502868010001608713277143761000100010002936029514294072951429350
62004294222350110012101058804709290210011768630031001100410001000100010005000500111909500225932912829453710300010001000200010002922629130116100110001000100313100500110013221001318393726904311604520655324038201142422857010001614513131144211000100010002933429364293492929629412
62004294132350110210106680752804728289040011745530051001100310001000100210005000500011944700226512919729316310300010001000200210002922529176216100110001000100422100300110012131101339193006980318405220868331538132650472873610011592813206141801000100010002943029479294162935829458
62004292562360100121110013500470628898001173393003100010041000100010001000500050001194211022625292332935531030001000100020001000292822929311610011000100010031310020111001224010132079366700631490462072332733814842382861810001624113163146021000100010002924729332293262929429368

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.2d }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0059

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6020514011811261011011100201011400381395772580104501002000410000401002000010000124566453046011071067801014002901400561400541319703132435701003020010000200006020020000200001400531400531150201100991004010010000100001100100012110002000110000111103210001931113971850000966100001000050100140057140054140054140057140057
602041400621125101101000050000140026139575258010450100200041000040100200001000012465805304717107106780001400290140057140056131970313242270100302001000020000602002000020000140056140056115020110099100401001000010000010010002211000200026810000111103210002931113971750000669100001000050100140054140054140054140054140054
60204140053112510110011002300011400391395752580104501002000410000401002000010000124566453047531070738200014001701400531400531319703132437701003020010000200006020020000200001400531400531150201100991004010010000100000100100012110001000283100011101032100011001113971350000967100001000050100140057140057140054140054140148
60204140114112510100000005000114002613973925801045010020004100004010020000100001245673530460110710678000140017014005314004113196731324257010030200100002000060200200002000014005414005411502011009910040100100001000001001000121100020004661000101111321000293111420465000010106100001000050100140042140132140057140057140055
602041400681125101100000080001140041139577258010450100200041000040100200001000012456735304601107106780001400290140053140053131967313243570100302001000020000602002000020000140041140053115020110099100401001000010000010010001111000100028910000011103210001931113971750000986100001000050100140057140054140057140057140058
6020414010011251010000000500011400411395743980104501002000410000401002000010000124573653046011071627100014002901400571400531319953132434701003020010000200006020020000200001400411400411150201100991004010010000100000100100011110001010710000110103210001931113971350000960100001000050100140054140057140057140043140054
602041401191125101100000050010140026139574258010450100200041000140100200001000012456645304718107107540001400320140056140056131967313243770100302001000020000604522000020000140053140056115020110099100401001000010000010010001211000100328610000111103210001931113971950000966100001000050100140056140055140054140054140057
6020414009211251001010101230011140026139574508010450100200041000040100200001000012456915304601107106780001400320140053140055131968313243770100302001000020000602002000020000140053140056115020110099100401001000010000010010002211000100028610000110103235001931113971350000966100001000050100140054140057140057140054140057
602041401311125100000000050001140038139588258010250100200021000040100200001000012456825304720107106780001400320140055140057131970313247670100302001000020000604842000020000140056140053115020110099100401001000010000010010003111000100032201000011111321000193211397935001011611100001000050100140141140054140143140057140057
602041403251125102100102323413264001140321139871126801295013320013100034038220318101161252168530514910741927000140244014022414040213205434132521706173044810080201636068820240201631403201402394150201100991004010010000100000100100044110004003424100001111032100019311139717500006610100001000050100140042140057140054140054140057

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
600251400501085000010100000140035139654518002650010200021000040010200001000012457885307504107164791140023140050140047132056313245870010300201000020000600202000020000140052140047115002110910400101000010000010100000110000000100001100314004888513972150000908100001000050010140051140051140051140144140145
600241402251087000032274880000140200139770128800265003820006100024043320161100791250179531130510725238014015914014814042113208330513399470269303821004120331607442008020240140210140314315002110910400101000010000010100042110003009578100031100314008888513971850000969100001000050010140051140051140051140051140051
6002414003510860000001300000140020139639258001250010200021000040010200001000012457975307582107171090140026140053140050132035313245870010300201000020000600202000020000140050140049115002110910400101000010000110100000110000100100001100314008888813972150000969100001000050010140036140051140051140051140051
600241400501086000000400000140035139654258001250010200021000040010200001000012458065307504107171091140033140050140050132026313245870010300201000020000600202000020000140050140047115002110910400101000010000010100000110000003100001100314007885813971950000969100001000050010140051140051140036140036140051
6002414005010850000001300000140037139654258001250010200021000040010200001000012457885306922107171091140026140050140035132048313245870010300201000020000600202000020000140050140047115002110910400101000010000010100000110000103100001100314008887813972150009999100001000050010140036140036140051140051140053
6002414003510860000001000001400351396572580012500172000610000400102000010000124576153075041071734301400231400521400501320383132443700103002010000200006002020000200001400481400501150021109104001010000100000101000001100000031000011003140048811713972150000999100001000050010140482140036140036140137140051
60024140050108501000012400000140035139639258001250010200021000040010200001000012457885307504107171091140028140050140050131995313245870010300201000020000600202000020000140035140035115002110910400101000010000010100000110000003100001100314008887513972150000999100001000050010140051140054140052140051140054
600241400501086000000101000140035139654258001250010200021000040010200001000012457615307504107171090140026140050140035131998313245870010300201000020000600202000020000140051140053115002110910400101000010000010100000110000000100001100314004886613972150000999100001000050010140051140051140051140051140051
600241400501085010100100000140020139639258001250010200021000040010200001000012457885306922107171090140011140050140047131995313245870010300201000020000600202000020000140050140047115002110910400101000010000110100000110000000100001000314008885713971850000999100001000050010140052140051140036140051140223
60024140220108810113226517600011421171405185180025500202000910002402942015710116124787853130181072521901401661402111402371321951313254071052302651012220166607642016220243140222140269315002110910400101000010000010100032110002026345100021100314008885713970650000699100001000050010140051140051140051140052140051

Test 3: throughput

Count: 8

Code:

  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  ld1r { v0.2d }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602058004159910100000320001800260662502524013180100800328000080100800008000043589823758824491889418002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000110080006823800262026800196126237151102161280038080000998000080000801008004280042800428004280042
1602048004162110001100320000800260661025240131801008003280000801008000080000435901437588244918897180022800418004159924359999240100200800008000020016000080000800418004111802011009910010080000800001100800067080024002880018612606051121162180038080000998000080000801008004280042800428004280042
160204800416201010000044000180026160225240131801938003180000801008000080000435898237588244918899180022800418004159924359999240100200800008000020016000080000800418004111802011009910010080000800004100800086080027112680000612506151102162180038180000998000080000801008004280042800428004280042
16020480041621110100004300008002616610252401078010080031800008010080000800004358986375882449188961800228004180041599953599992401002008000080000200160000800008004180041118020110099100100800008000011008000682380026007800180125237151121161280038080000908000080000801008004280042800428004280042
16020480041621110010003100008015706010252401318010080031800008010080000800004358970375882549188941800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000041008000772680007102880019617236051122162180038180000998000080000801008004280042800428004280042
1602048004162111010000310001800261661425240132801008003080000821998386981625435898637588244918893180022800418004159978359999240100200800008000020016000080000800418004111802011009910010080000800004100800087238002500298001860607051122162180038180000998000080000801008004280042800428004280042
16020480041621100000003100018002616010252401328010080031800008010080000800004358954375882349190141800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000011008000772380026017800186025237051122162180038080000908000080000801008004280042800428004280042
160204800416201010000031000080026166102524013280100800318000080100800008000043589823758824492442218002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080006823800250026800196125060511221612800380800001298000080000801008004280042800428004280042
160204800416201010000017500018002606611252401328010080031800008010080000800004358982375882449188961800228004180041599873599992401002008014080000200160000800008004180041118020110099100100800008000011008000772380026102580109612507151122162280146080000998000080000801008004280042800428004280042
1602048004162011100000320000800261661225240131801008000780000801008000080000436226137588244918898180022800418004159924359999240100200800008000020016000080000800418004111802011009910010080000800001100800968238002501288001961262360511021622800381800000108000080000801008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ab | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600258004164310100110004200008002610615252400468001080036800008001080000800004358385375882049189120080022080041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800068288003200030800236133276005020216000228003808000013138000080000800108004280042800428004280042
160024800416431000000000350000800261663252400178001080035800008001080000800004358377375882449184240080022080041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800076278003001029800230070620502021600022800380800001308000080000800108004280042800428004280042
16002480041643100100010036000080026066925240046800108003580000800108000080000435837737588204918910008002208004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080008727800090007801100030277005020216000228003818000013138000080000800108004280042800428004280042
160024800416431001011100590001800260661125240045800108000780000800108000080000435843337588244918911008002208017780041599463600212400102080000800002016000080000800418004111800211091010800008000001080007628800072103080022617286005020216000228003818000013138000080000800108004280042800428004280042
1600248004164210100200004200018002610010252400168010480216800908001080000800004358377375882449189060080022080041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800086080029200780000612907005020216000228003808000013158000080000800108004280042800428004280042
160024800416431000011000190000800261661125240046800108003580000800108000080000435843337588244918424008002208004180041599463600212400102080000800002016000080000800418017511800211091010800008000001080008808002900829800236130327105020216000228014608000013138000080000800108004280042800428004280042
16002480041643100001010035010080026166142524004580010800358000080010800008000043583813758824491890600800220800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000772780030100308002360706005020216000228003808000013138000080000800108004280042800428004280042
1600248004164210100111107000080026066525240045800108003580000800108014080000435843337588254918911058002238004180041599462260021240010208000080000201600008000080041800411180021109101080000800000108000860800070102980023612907005020216000228003818000013138000080000800108004280042800428004280042
1600248004164310000000004700008002616614252400178001080035800008001080000800004358433375882349189140080022080041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800067308002900032800226162772050202160002280038080000008000080000800108004280042800428004280042
16002480041643101000000070000800261061252402988001080006800008001080000800004358377375882449189000080129080041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800086278002900029800230129276005020216000228003808009213138000080000800108004280042800428004280042