Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (1D)

Test 1: uops

Code:

  ld1r { v0.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.003

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire (01) | cycle (02) | 03 | 04 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c9 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
62005285352141011011102610502528055010160662003100410001000100050001194150227152832228534310200010001000100010002800828139116100110001000010000210000000100020000013904998373223327060198463445381410464427802142791227412816100010002808428188282242826928023
6200428187210001000006005215279640001639020001002100010001000500011924100227202838227991310200010001000100010002815928232116100110001000010000210000000100020000013149983371323301046198223246380912394027887146481259213495100010002816428256281812835128507
6200428147211011110003005095281330011643220031004100010001000500011943100227472807728398310200010001000100010002819228114116100110001000010021310030021100221012013524994672503418043196693437381317484027860138971237313821100010002823128116282202834028178
62004282522100111110030052672815600016366200310031000100010005000119391102265928148286543102000100010001000100028411281271161001100010000100323100400231000312110141581067172533484042196623325381014404027738140371221013106100010002845928315281782813328312
620042833521101101000200506327955000161782003100110001000100050001194290226912818828287310200010001000100010002836928158116100110001000010021210030011100223213013518100357105345304519619345538179443927834146931229112704100010002836828023280032839628152
6200427971213010010001105286279380001615220011001100010001000500011961802274228063282373102000100010001000100028061281121161001100010001100133100200031001210120140771041572803583040197393426381919434027832142371270513520100010002815928095281602830728566
6200428346211011111003005018279840011602620051003100010001000500011942110227482819128185310200010001000100010002810828030116100110001000010021210010001100022011014022104177135344704319506339138138484527757138971254913383100010002846028296282282811028062
620042837421301111100100513528119000161992004100110001000100050001194050226852814428210310200010001000100010002826428121116100110001000010033210010014410002131201350797717251339904019714320338179444227861142811263613887100010002812828379284672845328192
620042823321001101100300514728091001161042003100310001000100050001194730226972812728229310200010001000100010002838528157116100110001000010023010010011100001210013611991472763509143197583310381912394327894140591255212989100010002804228214281222810128042
6200428340210011010003105120279680001653620031003100010001000500011933140228442814228052310200010001000100010002803628124116100110001000110022210030001100002011014139104537108347015019524338738098394327780150981225813174100010002854428454280722837928546

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.1d }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0056

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400541049111110020100140041139429129353257010440100200041000030100200001000012639766693974143122290140030140041140053131799313239160100302001000020000602001000020000140056140041215020210099100401001000010000010010001111000100110000110100321041262213956240000669100001000040100140057140057140057140057140057
602041400531049110110010000140026139429129367257010440100200041000030100200001005612638146693974143151430140032140044140053131819313240260100302001000020000602001000020000140056140053115020110099100401001000010000010010001111000300110000111110321021142213955040000609100001000040100140057140042140057140057140042
602041400411049111110020101140089139430129367257010240100200041000030100200001000012639766693244143122290140032140056140041131802313240160100302001000020000602001000020000140056140053115020110099100401001000010000010010002211000100110000011110321021142213956240000069100001000040100140057140057140054140042140057
6020414005610491011000170001140038139429129367257010440100200021000030100200001000012639766693974143096670140017140056140056131802313239160427302001000020000602001000020000140056140053115020110099100401001000010000010010001111000101110000111100321021262213956240000069100001000040100140057140057140059140057140057
602041400561049100010010000140038139456129353257010440100200041000030100200001000012639766693974143122291140029140056140056131799313240160100302001000020000602001000020000140041140041115020110099100401001000010000110010001211000211410000111111321021392213956740000609100001000040100140042140057140057140058140057
602041400561049101010010101140041139429129367257010240100200041000030100200001000012639766693974143122290140017140056140056131802313239160100303611000020000602001000020000140111140053115020110099100401001000010000010010002111000341410000011100321011392213956740005009100001000040100140057140057140054140057140054
602041400561049101110020001140041139413129353257010440100200021000030100200001000012639766693244143122291140029140053140056131802313240160100302001000020000605781000020000140053140057115020110099100401001000010000010010002201000301110000111100321011262213956740000999100001000040100140042140057140042140057140057
60204140056104911101003200001400411394131293672570104401002000210000301002000010000126304866935321431279901400171400531400411318023132401601003020010061200006020010000200001400561400531150202100991004010010000100000100100011010002002110000111100321021392213955040000969100001000040100140057140057140042140042140057
602041400561049111110020000140038139456129364257010240100200041000030100200001000012639766693974143096670140029140053140041131828313239160100302001000020000602001000020000140041140054115020110099100401001000010000010010001101000101110000110110321021392213956740000906100001000040100140057140057140054140042140057
60204140056104911111002000014002613942912936425701024010020004100003010020000100001263976669397414312439014002914005314005613189531324016010030200100002000060200100002000014004114005311502011009910040100100001000011001000111100010011000001111032102172213956740000090100001000040100140057140057140057140042140042

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0047

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | cf | icache miss (d3) | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025140047104910011014002013939412935925700124001020002100003001020000100001264443669353814325829014002301400351400471318153132420600103002010000200006002010000200001400471400351150021109104001010000100001101000011000000100001031400311302213962340000666100001000040010140048140049140048140048140036
6002414004710490116521014003213939412935925700104001020002100023001020000100001264443669294714325829114002301400471400351318033132430600103002010000200006002010000200001400471400471150021109104001010000100000101000011000000100001131400211302213956640000660100001000040010140036140048140048140048140048
60024140035104900011014003213939412935925700124001020000100003001020000100001264443669353814325829114001101400471400471318823132430600103002010000200006002010000200001400471400471150021109104001010000100000101000001000000100000131400211302213956640000666100001000040010140036140048140048140048140048
60024140035104900011014003213939412935925700124001020002100003001020000100001264443669294714325829114001101400471400471318153132430600103002010000200006002010000200001400471400471150021109104001010000100000101000011000000100001131400311312213955440000066100001000040010140048140065140052140043140048
60024140047104900111014003213939412935925700104001020002100003001020000100001264443669353814325829114002301400471400351318153132430600103002010000200006002010000200001400471400471150021109104001010000100000101000011000000100001131400211302213956640000666100001000040010140036140036140036140048140049
60024140047104800110014003213939412935925700124001020002100003001020000100001264443669353814325829114002301400471400471318153132420600103002010000200006002010000200001400471400471150021109104001010000100000101000011000000100001131400213502213988140000066100001000040010140048140048140036140092140048
60024140047105000011014003213939412935925700124001020002100003001020000100001264443669294714325829114001101400351400351318033132420600103018010000200006002010000200001400541400471150021109104001010000100000101000011000000100021131400211302213956640000600100001000040010140048140036140036140036140048
60024140035104900011014003213939412934725700124001020002100003001020000100001264488669294714321442114002301400351400471318153132430600103002010000200006002010000200001400471400471150021109104001010000100000101000011000000100001031400211302213956640000066100001000040010140048140048140036140048140036
60024140047104900010014003213939412934725700124001020002100003001020000100001264443669353814325933114002301400471400471318153132430600103002010000200006002010000200001400471400471150021109104001010000100000101000001000000100001131400211302213956640000066100001000040010140048140036140048140036140048
60024140047104900110014003213939412935925700124001020002100003001020000100001264443669353814325829014002301400471400351318153132430600103021210000200006002010000200001400471401301150021109104001010000100001101000011000000100001031400211102213956640000666100001000040010140048140048140048140048140048

Test 3: throughput

Count: 8

Code:

  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  ld1r { v0.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0a | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020526730200100004500126693212082516014510080045800001008000080000500116913818873340267092670826728665036686160100200800008000020080000800002672826728118020110099100100800008000011008000043800393398003961394351101116101226705101080000800001002672926787268222674326734
1602042670820000000451012671321212182516014510080045800001008000080000500116875418835730266892673526728665036686160100200800008000020080000800002672826728118020110099100100800008000001008000043800391398003961394351101416101126725101080000800001002672926818268292674726880
160204267282000000045101267130121232516014510080045800001008000080000500117462818873870267092672826754672136686160100200800008000020080212800002670826708118020110099100100800008000001008000043800392080000613905110111681226705101080000800001002700626742267462673526709
16020426728200000004510126713212121925160100100800458000010080000800005001174628188733402670926728267286650366861601002008000080000200800008000026728267281180201100991001008000080000010080000438003910800396139435110101611112672510080000800001002672926824268272671426734
1602042672820000000451012669321204251601451008004580000100800008000050011746281887218026709267282672866503668616010020080000800002008000080000267282672811802011009910010080000800000100800004380039039800396139435110101611112670701080000800001002673026849267342674526729
16020426728201000000000267132121219251601451008004580000100800008000050011687541884011126689267082672866503668616010020080000800002008000080000267282672811802011009910010080000800000100800004380039242800390104351101116121126725101080000800001002672926803268152671526745
1602042672820010100451012671321212625160145100800458000010080000800005001168754188733412668926728267286650366661601002008000080000200800008000026708267281180201100991001008000080000010080000438003930800396104351101016111326725101080000800001002672926795268322673626733
160204267332000000045011267132121273251601451008004580000100800008000050011688801887343126689267112672866503668616010020080000800002008000080000267282672811802011009910010080000800000100800004380039142800390139435110111611726725101080000800001002672926780268222673626734
16020426728200000004501026713212121825160100100800458000010080000800005001174628188357312670926728267286650366861601002008000080000200800008000026728267281180201100991001008000080000010080000438000010800396139435110131611122670510080000800001002672926793268202673526734
16020426728200100000010267132121216251601451008004580000100800008000050011746281883573126709267282672866503668616010020080000800002008000080000267282672811802011009910010080000800001100800004380039139800396004351101116121226705101080000800001002697526731267382673126729

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002526735200110006510326718218181625160071108006280000108000080000501170529188375126714267152673366773671316001020800008000020800008000026733267331180021109101080000800000108001920428001911162800386057411910502031604426730908000080000102673426799268292673726735
16002426733200101006500326718018181725160072108001980000108000080000501170304187790926714267332673366773671316001020800008000020800008000026715267331180021109101080000800000108001921428005711059800386019421911502031603426730098000080000102673426796268092672126726
1600242671520011000660032671800181925160072108006280000108000080000501167371188808526714267332671566783671316001020800008000020800008000026733267331180021109101080000800000108001921428005702159800380057411910502041604426730098000080000102673426734267342673426722
160024267152001110065100267002180172516007210800628000010800008000050117309318880852670026733267156678366951600102080000800002080000800002673326733118002110910108000080000010800192142800191005980038615701900502041605426730908000080000102673426793268382672026743
1600242673320010100650022671821802125160073108001980000108000080000501171570188380126714267332673366773671316001020800008000020800008000026733267331180021109101080000800000108001919428005712021800386119421910502031604426730998000080000102673426773268292674326726
1600242673320010100651022671820019251600721080062800001080000800005011730931888085266962673326733667836713160010208000080000208000080000267332671511800211091010800008000001080019200800570102180038611901910502031614326730908000080000102671626868267892684026734
160024267332001110065102267182018212516007110800618000010800008000050116753018756192671426715267156678367131600102080000800002080000800002673326715118002110910108000080000010800192042800190015980038605801920502041603526730998000080000102671626869268012681926716
16002426715200110006510326718018181825160071108006180000108000080000501167530188511626714267332671566603671316001020800008000020800008000026733267331180021109101080000800000108001919428005811159800006157401910502041604326712998000080000102673426828268442673826720
1600242673920011100650002670021801625160071108006280000108000080000501164410187647826716267422673366603671516001020800008019220800008000026733267331180021109101080000800000108001921428005711059800000157421900502031604426730998000080000102673426825268132673926749
160024267332001010065103267182002225160073108006180000108000080000501169528187919426714267332673366603669516001020800008000020800008000026733267151180021109101080000800000108001921428005712160800000019421900502041603426730998000080000102673426780268392673926747