Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 2S)

Test 1: uops

Code:

  ld1 { v0.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
610052923122012110700017104470288400024213100010001000500010015948285862929731010001000100029120291221161001100010001100233100200111000213111294394206846314511552162731933818646422837116185134501575810002929229403294032925829240
6100429206220118011310011045262873400241901000100010005000300159432849729223310100010001000291002917411610011000100011003301002002110003131212757939769333033114621525312138171241472857016364137311572010002929229243292682924529294
6100429280219115111511031104600288010024308100010001000500040015967286182927731010001000100029040290601161001100010000100132100200011001223111324092226836307084921621314538111139502835016214137551577410002922329297292932923529323
610042931921911611150008004543288340024275100010001000500020015964285432927331010001000100029034291291161001100010000100322100201111000213111287693636807303610362163030903815342402852716219135771578210002926729356293392926929256
6100429220219113111011028104580286720024180100010001000500030515964286992930031010001000100029204291251161001100010000100333100300111000210111282492216832306713502166530763817847472833216166134911576010002921629329293022930029219
610042920621811811151001104726286980024178100010001000500020015945286342924431010001000100029014291071161001100010000100232100200011000310121280590326821304084021592312338221144352837216373136681511910002920929316291962919929237
610042929722011311121001310465328697002423210001000100050004001595128526292823101000100010002902929117116100110001000010033210020121100001310128459181687030676382159330463816841432840616175138961568710002924929283292292920629322
6100429325218112111610038104551287990024284100010001000500020015948286082926131010001000100029166290621161001100010000100220100200111000212131313694596839311110422150930803817542392842116373137101551810002929429302291902925829240
61004291712191201115000710456828770002419810001000100050006001594628541292633101000100010002911129086116100110001000010022310011011100032312132279460685930399422156230473817941422846015983136111575110002924929303293112921729219
610042924221911311161114004625287860024260100010001000500040015950286042922231010001000100029106290471161001100010001100223100100211100001311131459517694330019382159732083814838342837816386134001544410002910929217292132928429196

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.2s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 09 | 0a | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899100011001200361195101094642560103401021000110000301001000010000107900857362366136317112003201200511200511131413113671501003020010000100006020010000100001200511200511150201100991004010010000100000100100001100000010000110032372107221196584000201091000040100120036120055120055120055120055
50204120054899000071001200391195131094672560103401021000110000301001000010000107903557362366136317112003301200511200511131493113671501003020010000100006020010000100001200541200511150201100991004010010000100000100100001100000010000010032102107221196584000201391000040100120052120055120055120052120054
502041200518990000110012003611951010944925601034010210001100003010010000100001079008573623661363170120031012005412005111314931136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000001000006100001100321021012211964640002101091000040100120036120052120055120052120055
502041200548990000010012002011951010946725601034010210001100003010010000100001079035573638061344611120045012005412005111314631136685010030200100001000060200100001000012003512005111502011009910040100100001000001001000011000000100000100321021072211965840002101091000040100120052120055120052120055120052
502041200359000000100012003911951310946725601034010210001100003010010000100001079035573545561363171120056012005412005111314131136685010030200100001000060200100001000012005412005111502011009910040100100001000001001000011000000100001100321021072211964640002101091000040100120055120052120055120055120052
5020412005489900001000120039119513109464256010340102100011000030100100001000010790355736236613631711200330120054120051113149311366850100302001000010000602001000010063120051120051115020110099100401001000010000010010000110000001000011003210210722119661400021310121000040100120036120055120052120055120036
502041200518990000110012002011951310946725601034010210001100003010010000100001079008573638061363171120040012005112005111314131136715010030200100001000060200100001000012005412005111502011009910040100100001000001001000011000000100001100321021072211965840002101391000040100120055120052120055120052120052
502041200519000000110012003911951010944925601034010010001100003010010000100001079035573638061363171120032012005112005111315331136715010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000100001100321021072211966140002131391000040100120052120055120052120055120036
502041200548990000100012003911951310946725601034010210002100003010010000100001079035573545561363171120054012005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000001000010100001100321021072211965840002101091000040100120055120052120052120055120052
502041200548990000100012003911951310946425601034010210001100003010010000100001079035573545561344611120060012005112005411314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000050100001100321021072211965840002101091000040100120052120055120055120052120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 49 | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478991110112100012003211119513109461256001640012100011000030010100001000010794835736044613337401200191200431200431131733113682500103002010000100006002010000100001200471200431150021109104001010000100001101000001100070041000111100031401210778119665400026601000040010120056120056120044120056120048
50024120047900000100210001200321111951310946825600134001410001100003001010000100001079593573604461333730120019120055120135113173311368650010300201000010000600201000010000120055120055115002110910400101000010000010100010210001001100010110003140610778119658400026651000040010120056120059120427120056120044
500241200478990011002000012003200119501109521256001640012100011000030010100001000010795935736428613332201200311200551200471131613113682500103002010000100006002010000100001200471200471150021109104001010000100001101000002100070011000001000031408107139119658400026651000040010120054120435120056120044120044
5002412005990200100011000120028101195081094612560016400121000210000300101000010000107952357358486133499012002312004312004711316553113682500103002010000100006002010000100001200551200471150021109104001010000100000101000002100010012100011110003140810788119670400046651000040010120044120044120048120056120056
500241200489020011001310001200321011950510946825600164001210002100003001010000100001079483573604461334990120019120043120047113165311368650010305011000010000600201000010000120055120043115002110910400101000010000010100000110000001100010100003140810788119670400020651000040010120056120048120048120048120044
50024120043899001100110001200281111951310955325600164001410001100003001010000100001079483573604461338660120019120047120047113174311368250010300201000010000600201000010000120055120047115002110910400101000010000010100060010001000100011110003140810778119660400026601000040010120056120048120048120044120048
500241200579020011002100012002811119513109468256001340012100011000030010100001000010795235736188613386601200191200551200431131735411368750010300201000010000600201000010000120047120047115002110910400101000010000010100000210000001100001110003140810798119658400029651000040010120048120044120048120048120044
5002412004389900010041000120040101195051094612560013400141000210000300101000010000107952357364286133322012002312004712005511317317114064512223002010000100006002010215100001200491200551150021109104001010000100000101001122100000041000011100031409107109119658400040981000040010120161120044120048120048120048
500241200438990010001000012003510119508109457816001640012100011000030010100001000010795935735848613386611200231200431200471132153113689500103002010000100006002010000100001200431200921150021109104001010000100000101000000100011012100011010003140810778119658400040651000040010120044120048120048120048120048
50024120043899001000100001200280011950110946825600134001210002100003001010000100001079483573618861338661120023120047120043113161311368650010300201000010000603981000010000120052120152115002110910400101000010000110100000010001000100011100003140810788119670400026651000040010120048120056120044120048120049

Test 3: throughput

Count: 8

Code:

  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  ld1 { v0.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 05 | 08 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802052672320100004101012670721818122580100100800001008000050011678081267112672226722166453166808010020080000200800002670726722118020110099100100800008000011008000003908003500035800356135390051101161126719662800001002670826723267232672326723
802042672220000004100012670721818122580100100800001008000050011678081266972684126729166453166808010020080000200800002672226722118020110099100100800008000001008000003908003500035800356135390051101161126719662800001002672326723267232672326723
8020426707200000000101267120181812258010010080000100800005001167808126698267072672216645316680801002008000020080000267222672211802011009910010080000800000100800000390800350003580035613500051101161126719662800001002672326723267232672326723
80204267222000011410000267072181812258010010080000100800005001167808126698267222672216645316680801002008000020080000267222670711802011009910010080000800000100800000008003500038800356135390051101161126719602800001002672426728267232672326725
80204267222000000410101267083018025801001008000010080000500116652512669826722267221664531666580100200800002008000026722267221180201100991001008000080000010080000039080035000358000061353900511011611267041062800001002672326708267232672326723
8020426722200000042000026707201212258010010080000100800005001167808126697267222672216630316665801002008000020080000267332673311802011009910010080000800000100800201942080057101588003861574219151101161126729992800001002673326733267342673326733
802042671420011006501032671801818162580100100800001008000050011674011267072673226732166553166908010020080000200800002673326733118020110099100100800008000001008000003908003500035800356135390051101161126704662800001002672326723267232672326723
8020426722200001141000226707201812258010010080000100800005001167808126697267222672216645316680801002008000020080000267222672211802011009910010080000800000100800000390800350003580035613500051101161126719662800001002672826708267232672326723
80204267222000001410001267070180122580100100800001008000050011657891266972672226722166453166808010020080000200800002672226722118020110099100100800008000001008000003908003500035800356135390051101161126719662800001002672326723267232672326723
80204267222000000410001267072181802580100100800001008000050011665251266972672226707166453166808010020080000200800002670726722118020110099100100800008000001008000003908003500035800356135390051101161126719062800001002672326708267232672326723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 07 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80025267222000000004101002670720016258001010800001080000501167605126697267272672716652316702800102080000208000026708267221180021109101080000800000108000008003903980039610390050202162226719106480000102670926709267092672326729
8002426708200000000000022671221801625800101080000108000050116884312669726727267081666731670780010208000020800002672226722118002110910108000080000010800003980039039800356135390050202162226705106480000102670926709267092672926709
80024267272000011008301002671201201625800101080000108000050116675012670226708267271665231670880010208000020800002670826722118002110910108000080000010800003980000039800390139430050202162226719106480000102672926723267282672926728
8002426708200000100450102267122121816258001010800001080000501168843126702267082672716668316708800102080000208000026728267221180021109101080000800000108000039800390428003961354300502021622267251010280000102670926728267292672926728
8002426727199000000450001266930012025800101080000108000050116884312668326708267271666731670880010208000020800002672826708118002110910108000080000010800003980035039800396135000502021622267301010280000102672926709267282672926729
80024267272000000000000226889212016258001010800001080000501168843126702267272672716652316688800102080000208000026722267221180021109101080000800000108000008000000800350039000502021622267241010480000102670926729267232670926729
8002426727200000000450102267132121216258001010800001080000501166750126702267282672716652316708800102080000208000026708267221180021109101080000800000108000039800350398003961350005020216222670506080000102672926709267292672926728
80024267272000010001200102266932120162580010108000010800005011676050266832672726727166723167088001020800002080000267282670811800211091010800008000001080000398000003980000613943005020216222672460480000102672926728267092672826709
800242670820000000041000226693012121625800101080000108000050116688612669726728267281665231670780010208000020800002670826722118002110910108000080000010800003980039043800006104300502021622267241010480000102670926728267282672326709
800242670820000110000002266932121203580010108000010800005011668861267022670826727166723166888001020800002080000267272672211800211091010800008000001080000080039035800390135000502021622267051010080000102670926723267282672826729