Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (post-index, 8H)

Test 1: uops

Code:

  ld1r { v0.8h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.003

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e0f18191e1f2223243a3f464951schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)5e5f6061696d6emap rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map simd uop (7e)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd load (98)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2c9cfd0d2l1i cache miss demand (d3)l1i tlb miss demand (d4)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
62005290802331241127111001010047072857800169453004100010041000100010001000500050001191001200225992888029008329300010001000200010002881828930116100110001000010021310060036100031313013079935668993148166720313323938192566692843610001573312606139881000100010002910729111289802901029069
620042916723312811331110013010046432849200169993001100010061000100010001000500050011193701000225882884128959310300010001000200010002887728996116100110001000010012210020011100121211013162935267913102146720320322338152161672838810001556712846140121000100010002897229041290432908529032
6200428887232129002410000400004696285900017120300610001001100010001000100050005000119310008226312893229052310300010001000200010002895528971116100110001000010012210010124100021311013152919768293089126520475326138091262642843810001593412973140641000100010002909028970290252895229068
62004290542331231131100001600004663286700017167300110001001100010001000100050005000119040800225962886128991310300010001000200010002901228997116100110001000110022210030114100021011013033940668933099176420600328438092161592845110001552912995141771000100010002908129048291702913729074
620042915823312211211000030100459028519001707530031000100110001000100010005000500011911240022632288282912131030001000100020001000289132899021610011000100001002131005000119110000221025013203921669243130146020497320438102367692847210011566212700140951000100010002916929140292612908029098
62004290682341322127200131358810046332884300170523004100010051000100010001000500050001193704192262829035292106283000100110012004100028954291213161001100010000101373100300311000020111013143936068713127136420411322638142067612844910001591512818143701000100010002927029301293552922829302
6200429100233131222620000600014699287290017266300110001001100010001000100050005000122440400226402868528836310300010001000200010002884128964116100110001000010033010020111100101311012923943469683182136420250318738142465602827910001597112719140481000100010002892728936288462888728940
6200428963233128003310000130000475228674001693230011000100110001000100010005000500011902051822583288552905631030001001100020001000288812878511610011000100001002101004031410000121125112997920368103079146520752325238161659612840610001553412657142851000100010002885428809287962880728822
62004288832341250127000001500004689283790016857300310001003100110001000100050005000119060008226552881628972329300010001000200010002883628758116100110001000010022210020014100001213013039930568423210146320258328738072660652830510001580512619136841000100010002889928940289612878828942
6200428886223129112510000100014686285350016824300310001003100010001000100050005000119380900225652869629002310300010001000200010002878728840116100110001000010012210020012100001011013248948768863138207120346319538171965692834910001560312770141251000100010002894628871288872878628884

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.8h }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0056

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f233a3f4d51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd load (98)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9aaacafb5bbbcl1d cache miss ld nonspec (bf)l1d cache miss st nonspec (c0)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
60205140053104911000000140001400431395772580118501002000410000401002000010000124768953046011071067801400320140056140056132438141324347010030200100002008260200200002000014005614013711502011009910040100100001000001001000021100021103196100001101010000321001931113971350000669100001000050100140150140057140057140055140042
602041400561085111000002000140045139577258010450100200041000140100200001000012456645304715107106781140029014005614005613197031324227010030200100002008360200200002000014005614005311502011009910040100100001000001001000221100021004100001101010000321001931113978750000009100001000050100140057140055140055140057140132
60204140056112510100000100114003813958725801045010020004100014010020000100001247785530448710710678014003101400531400581319703132437701003044510121200006020020080200801400471400541150201100991004010010000100000100100021110001100110000010100100032100117011139720500006710100001000050100140057140055140054140057140057
602041400531126111000002001140049139577258010450100200041000040100200001000012456915308580107106780140029014004114005313196731324927010030200100002000060200200002000014005614005321502011009910040100100001000001001000111100000000100001101010000321001931113972250000996100001000050100140061140057140153140054140057
602041400571125101100001000140132139617258010250100200041000040100200001000012456825304715107107560140029014005614005613197031324317010030200100002000060200200002000014005614005311502011009910040100100001000001001000111100011004100001101011000321002931213971150000669100001000050100140042140056140057140055140057
602041400471124100000002000140041139579258010450100200041000040100200001000012456735304601107106780140023014005314005313196731324227010030200100002000060200200002000014005614004711502011009910040100100001000001001000021100020101100001101010000321001932113971950000007100001000050100140057140057140057140054140054
602041400531125100200111000140041139671258011850100200041000040100200001000012456645304793107106780140030014005614015013197031324377036130322100002000060200200002000014004114005611502011009910040100100001000001001000511100010101100001101011000321001931113972050000609100001000050100140042140059140051140057140144
602041400561125100000022000140035139577518010250100200041000040100200001000012456465304715107106780140109014005614014513196831324377010030324100002000060200200002000014005614005311502021009910040100100001000001001000100100011001100001101003000321001931213971350000669100001000050100140057140053140057140057140042
602041400591125100000002000140038139577258010250100200041000040100200001000012462555304715107106780140023014005414005613197031324357010030200100002000060200200002008314005614005311502011009910040100100001000001001000211100020300100001100010000323411931113971750000669100001000050100140057140057140054140142140057
6020414005411251120000014000140038139574258010250100200021000040401200001000012453035309876107102100140169014014714012613197013132589708833143210080203206067820160202451404071402444150201100991004010010000100000100100044110004030127861000211010100003280011312214196550020769100001000050100140236140339140323140333140328

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0056

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e0f18191e1f233a3f4d51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)5f60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd load (98)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
6002514005410491010000000140001400411396602580014500102000410000400102000010000124581553078491071781101140032140056140056132004313246470010300201000020000600202008020000140056140053115002110910400101000010000010100021110002080191000011111031405885413972750000969100001000050010140057140057140057140057140061
600241400561086100000000011000140041139660258001450010200041000040010200001000012458425307732107175770114002614005614005613200431324647030830020100002000060020200002000014005614005311500211091040010100001000001010001211000206011000011110031403884413972750000969100001000050010140057140057140054140054140058
600241400561085100101000020001400411396592580014500102000410000400102000010000124584253077321071745400140030140057140054132004313246170010300201000020000600202000020000140056140053115002110910400101000010000010100012110001039011000011112031403885413972750000999100001000050010140057140057140057140057140054
600241400561086100000000023000140125139735258001450010200041000040010200001000012458425307732107209900114003214004114004113200531324647001030020100002000060020200002000014004114004111500211091040010100001000001010002211000109111000001110031403885513972750000960100001000050010140057140042140042140058140061
600241400531086100100000110001400261396602580014500502000710000400102000010000124584253075041071757710140017140056140055132011313246170010300201000020000602622000020000140056140056115002110910400101000010000010100012110001091110000111110314051064413972750000999100001000050010140057140051140042140057140079
600241400561085100100000020001400351396572580014500102000410000400102000010000124581553077321071757701140032140056140056132004313246470010300201000020000600202000020000140041140053115002110910400101000010000010100021110002051011000011111031405884413972750000069100001000050010140057140059140058140042140098
6002414005610851000000000290001400261396452580014500102000410000401612009410000124582153071101071789001140029140143140056132004313246670010301411000020000600202000020000140056140056115002110910400101000010000010100021110001082041000011111031404885513972950000960100001000050010140042140052140058140057140087
600241400561086100001000020001400261396452580014500102000410001400102000010000124584253071101071745401140032140056140053132004313246470010300201000020000600202000020080140056140042115002110910400101000010000010100021110003051011000001110031405884513971250000660100001000050010140054140042140057140054140068
600241401441085101101010020001400261396472580014500102000410000400102000010040124584253077321071757701140032140053140053132004131324647001030020100002000060020200002000014004114013911500211091040010100001000001010002201000107011000101010031403885413972750000606100001000050010140057140057140153140057140064
60024140041108510000000001000140119139635258001450050200041000040010200001000012458425307732107213420114018814005714005613200531324647027130020100002000060020200002000014005614005611500211091040010100001000001010002111000101111100001111003168311245139884502141169100001000050010140242140328140314140238140400

Test 3: throughput

Count: 8

Code:

  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  ld1r { v0.8h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk instruction (07)mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e0f18191e1f233a3f4346494f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)7bmap int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch call (8e)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd load (98)inst ldst (9b)9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0e7? int output thing (e9)eaeb? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020580041642101000000320008002616614252401318010080032800008010080000800004359002375882449188980800228004180041599243599992405240200800008000020016000080000800418004111802011000991001008000080000100800077238011311278001961252370051101161180038080095968000080000801008004280042800428004280042
1602048004164310000000032000800261668252401198010080031800008010080000800004358982376298849186770800228004180041599243599992401000200800008000020016000080000800418004111802011000991001008000080000100800070238002600980019612617600511011611801461800009108000080000801008004280042801768004280042
16020480041643101000000180008002616652524011980100800198000080100800008000043590063758824491889308002280041800415992435999924010002008000080000200160000800008004180041118020110009910010080000800001008000772380015011280019611323700511011611800381800001098000080000801008004280042800428004280042
16020480041643101010000310108002616682524013280100800218000080100800008000043589863758824491898508002280041800415992422599992401000200800008000020016000080000800418004111802011000991001008000080000100800076238002600268001961252371051101161180038180000968000080000801008004280042800428017680042
1602048004164310001000032010800261668252404058010080019800008010080000800004358986375882349188960801268004180041599243599992401000200800008014320016000080000800418004111802011000991001008000080000100800066238002500258001861262371051101161180038180000998000080000801008004280042800428017580042
16020480041643101010000310108002616610252404038010080031800008010080000800004358982375882349188960801258004180041599243599992401000200800008000020016000080000800418004111802011000991001008000080000100800077148002700268001961251770051101161180038180000998000080000801008004280042800428004280042
160204800416430010100004401080026166122524040480100800318000080100800008000043589863758825491889608012980041800415992435999924010002008000080000200160000800008004180041118020110009910010080000800001008000662380026113380019611023000511011611800381800009138000080000801008004280042800428004280042
1602048004164210001110016011800261668252401328010080031800008010080000800004358986375882449189041800228016180041599243599992401000200800008000020016000080000800418004111802011000991001008000080000100800998238002700128001961101701051101161180038080000998000080000801008004280042800428004280042
160204800416430000100004201180026166825240131801008001980000801008000080000435898237588234918898180022800418004159924359999240100020080143800002001600008000080041800411180201100099100100800008000010080008823800270328800196162370051103161180038080000998000080000801008004280042800428004280042
1602048017564310101000032010800261668252401318010080031800008010080000800004358986375882449188930800228004180041599243599992401000200800008000020016000080000800418004111802011000991001008000080000100800067238002501258001961252370051311341180144180092668000080000801008017680306803118017480177

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)rob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)91inst branch cond (94)inst int alu (97)inst simd load (98)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)dfe0e7? int output thing (e9)eaeb? ldst retires (ed)? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002580041643110000003400018002606692524001780010800318000080010800008000043584333758824491875818002280041800415994603600212400102080000800002016000080000800418004111800211090101080000800000108000672308000600025800196125237100502331644380038080000908000080000800108004280042800428004280042
1600248004164310011100901008002610082524001780010800328000080010800008000043583933758824491842108002280041800415994603600212400102080000800002016000080000800418004111800211090101080000800000108000672308002600027800196125237000502341634380038080000908000080000800108004280042800428004280042
1600248004164311010020370100800261601325240017800108003280000800108000080000435839337588234918766180022800418004159946036002124001020800008000020160000800008004180041118002110901010800008000001080007723080007000780019612507000502331642380038180000908000080000800108004280042800428004280042
160024800416431000000090100800260661252400418001080032800008001080000800004358397375882349187610800228004180041599460360021240010208000080000201600008000080041800411180021109010108000080000010800078008002501025800196125237000502351634380038080000998000080000800108004280042800428004280042
16002480041643110000003401018002616614252400168001080032800008001080000800004358401375882249184240800228004180041599460360021240010208000080000201600008000080041800411180021109010108000080000010800068008002501125800186062370005023316433800381800001298000080000800108004280042800428004280042
16002480041643100100007000080026100102524001780010800318000080010800008000043584253758824491841308002280041800415994603600212400102080000800002016028880000800418004111800211090101080000800000108009472308000700125801946125237000502341634380038080000008000080000800108004280042800428004280042
16002480041643100011001670100800260008252400168001080007800008001080000800004358389375882549187610800228004180041599460360021240010208014480000201600008000080041800411180021109010108000080000110800087260800260007800196126236000502331633380038180000098000080000800108004280042800428004280042
1600248004164311000000350100800261663252400428001080031800008001080000800004361827375882449187610800228004180041599460360021240010208000080000201600008000080041800411180021109010108000080000010800067230800250002480019612506000502331642380038180000998000080000800108004280042800428004280042
160024800416421000000055010080026166102524031180010800328000080010800008000043584013758824491875808002280041800415994603600212400102080000800002016000080000800418004111800211090101080000800000108000872308000801126801066026236000502321633380038180000908000080000800108004280042800428004280042
160024800416421101010035010180026066102524004180010800318000080010800008000043584013758824491875908002280041800415994603600212400102080000800002016000080000800418004111800211090101080000800000108000872308002500025800186126236100502331643380038080000908000080000800108004280042800428004280042