Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 8B)

Test 1: uops

Code:

  ld1 { v0.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 07 | 0a | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
6100528451214171300010482327941111231071000100010005000615943027889282103101000100010002804828040116100110001000010002100201100222214118101557312337685920628349238221248492786115003124281319410002828128433282882826228324
6100428253210171800300509027925100232841000100010005000215973027921281073101000100010002823528075116100110001000010002100000100020214145105767279351154820505342438211547492792214333122891326710002822428235282312824828039
6100428430212151400300517328061000231331000100010005000515964027862282343101000100010002812628073116100110001000010002100001100020213448105937227351684820886353538081445532792414687122591331010002802828374288102816228268
61004281402101617002105249278641002317310001000100050006159770278752812331010001000100028153281271161001100010000100001000011000212142391008572243390135420628349238141354492783813895120771358810002816827893284152836128313
61004284592101719003004814279560002325110001000100050005159660278982815831010001000100028444282271161001100010001100521001101000202138221038372163501105020493356638231649492796214325119941374210002813928382284832848328092
6100428199210141500300518528087000232801000100010005000515986027950283763101000100010002831028234116100110001000010002100000100020214190104247269346875220916347838091358462789313993126651386110002814728342279842811028329
6100428121211191800200530027993000234391000100010005000515965027897283233101000100010002815528413116100110001000010002100090100021213292102957056341475220839340338161447462796313952116841337210002803628145281312815928483
61004284372111818002004851281180002310410001000100050003159710280522815531010001000100028095281711161001100010000100021000031000202139821032671893258104520573343538212055482787314567123801403710002808128068280682849928596
61004283062121215002105265279360002328110001000100050005159760279672853031010001000100028128281011161001100010000100021001001000202139131017672833422105120494350938171648482789413864122901398310002836928158283152831028321
610042847621019191020051282799000023588100010001000500041597802786228485310100010001000281262807911610011000100001000210010110001001377197697177335885120540333538262149542790214055121791396510002818428131281572851428314

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.8b }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005789910100020100120045119516109473256010340104100011000030100100001000010790975736524613662312001701200411200571131550311367450100302001000010000602001000010000120434120131415020110099100401001000010000010010001411000300110000111100321011071111966740004010121000040100120058120061120042120042120061
5020412006089911010020000120026119519109473256010340104100021000030100100001000010790625736668613677612001701200601200411131550311367750100302001000010000602001000010000120110120073115020110099100401001000010000010010002201000202110000011100321011071111966740002130121000040100120042120061120061120042120042
502041200418991000001000012004511951910947325601064011210002100003010010000100001079097573666861366231200170120041120060113137031136775010030200100001000060200100001000012012312005511502011009910040100100001000001001000120100020111000011111032101107111196644001110001000040100120042120094120060120042120042
50204120060899101000200001200451195191094732560106401021000210000301001000010000107907157365246136776120036012004112006011315503113677501003020010000100006020010000100001201011200801150201100991004010010000100000100100012010002001100000111003210110711119664400041010121000040100120042120058120058120058120042
5020412005789910100020100120045119513109455256010340102100021000030100100001000010790975735750613677612003601200411200571131520311367750100302001000010000602001000010000120080120060115020110099100401001000010000010010002101000220110000111100321011351111966740002130121000040100120061120042120042120042120061
50204120041899101000201001200421195191094702560106401041000210000301001000010000107908957366686133458120036012006012006011315503113771501003020010000100006020010000100001201471201561150201100991004010010000100000100100021110001011100001111003210110711119664400041313121000040100120061120061120042120061120042
50204120060900111100100001200421195161094552560103401041000210000301001000010000107906257366686133458120033312006012005711314003113677501003020010000100006020010000100001201051200601150201100991004010010000100000100100012110001111100001101103210110711119664400021313121000040100120058120061120061120042120042
5020412004189911000010000120026119513109473256010340104100021000030100100001000010790975736668613662312003601200601200411131520311367750100302001000010000602001000010000120109120093115020110099100401001000010000010010002111000100110000111100321011071111965440004131301000040100120042120042120061120042120042
502041200578991110001000012004511951910945525601064010210001100003010010000100001079071573652461366231200330120057120057113137031136305010030200100001000060200100001006412007812004211502011009910040100100001000001001000110100020111000011111032101107111196644000201001000040100120042120042120042120058120061
50204120057899100000200011200421195161094702560106401021000210000301001000010000107906257366686136776120017012006012004111313703113674501003020010053100006020010000100001201061200651150201100991004010010000100000100100021010002014100001111103210110711119664400041313121000040100120061120061120061120061120042

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 04 | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478990001100012009211950510946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000110100001100000010000110031410151071717119662400026651000040010120048120048120048120048120048
500251200478990006100012003211950510944625600134001210001100003001010000100001079523573604461334991120023120047120047113165311368750010303381000010000600201000010000120047120047115002110910400101000010000110100001100000010000110031410161071716119662400026651000040010120048120048120048120048120048
500241200358990011100012003211950510946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311418850010300201000010000600201000010000120047120047115002110910400101000010000010100031100000010000110031410171071517119662400026651000040010120093120051120441120153120048
500241206879000111100012003211950710946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000010100001100000010000110031430171071718119662400026651000040010120036120048120048120048120048
500241200478990001100012002011950510946125600134001210001100003001010000100001079517573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000110100001100000010000114031700151151717119650400026651000040010120048120048120125120048120154
5002412005093100141002120032119637110533135360013400161000110001300101000010000107953257360446133499012001112004712004711316631136865001030020100001000060020100001000012004712003511500211091040010100001000011010000110000001000011003147320107716119662400026651000040010120048120050120049120049120048
500241200478990001010012003511950510946125600134001210001100003001010000100001079523573604461334991120011120048120047113165311368650010300201021410000600201000010000120047120047115002110910400101000010000110100001100000310000112032050171161920119876400026601000040010120049120048120048120048120048
500241200478990010010012003511950810946125600134001210001100003001010000100001079523573545561334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000010100001100000310000110031470171071717119662400026651000040010120048120048120048120084120049
5002512004789910010010012003211950510946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000010100001100000010000110031470171071717119662400026651000040010120036120048120048120048120048
5002412004790000010000120032119505109461256001340012100011000030010100001000010795235736044613349911200231200471200471131653113686500103002010000100006002010000100001200471200471150021109104001010000100000101000011000003100001100314308107816119662400026651000040010120048120048120048120048120048

Test 3: throughput

Count: 8

Code:

  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80205267282000001000100440100326721307192580100100800001008000050011665190267112673626714166583166948010020080000200800002672726708118020110099010010080000800000100800000430800380003980039603944005110316112673400135800001002673726737267372673726715
8020426736200101000000021010002669201120258010010080000100800005001166525026706267312673116654316665801002008000020080000267142673611802011009901001008000080000010080020194308001910064800000159019051101161126704014144800001002673226732267322673226732
80204267312000000000000450000026712200025801001008000010080000500116712702670626731267311665431666580100200800002008038426741267301180201100990100100800008000011008000004308003802438800396139440051101161126733013135800001002673726737267372673726737
8020426736200100000000067000012671420119258010010080000100800005001167127026702267312673116654316689801002008000020080000267382674211802011009901001008000080000010080019200080019100218003860594319151101161126728010140800001002670826708267082673226732
802042673120000000000000010022672130721258010010080000100800005001167142026711267142673616659316672801002008000020080000267272672711802011009901001008000080000010080000000800380004180038613844005110116112673300135800001002673726737267152673726737
80204267362001010010000660100126716211162580100100800001008000050011683121267062673126731166303166898010020080000200800002671426737118020110099010010080000800000100800191945080058100618000061584319151101161126704014147800001002673226728267282670826708
8020426707200000000000044000032669927020258010010080000100800005001167231026702267272672716630316689801002008000020080000267312672711802011009901001008000080000010080000044080000000388003861394400511011611267330000800001002673726715267382673726715
802042673620010100100007001000267122001925801001008000010080000500116831202670226707267271665431668980100200807672008000026742267211180201100990100100800008000001008000000080000000080039010440051101161126724014140800001002673226708267082672826732
80204267272000000000100440000126692200192580100100800001008000050011683120267062673126712166543166898010020080000200800002670726727118020110099010010080000800000100800000430800380000800386039440051101161126728014104800001002672826713267282672826708
80204267072000000000000440000326721270202580100100800001008000050011667580267122673726714166383166948010020080000200800002673126707118020110099010010080000800000100800000430800380003880000603944005110116112670400140800001002672826732267082673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 0b | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | cf | d0 | d2 | d5 | d6 | d9 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002526722200111838801267072181815258001010800001080000501166993002691426728267081667131669180010208000020800002670826722118002110910108000080000010800003908003503580035603939050200051600312670566080000102672726709267232670926709
8002426722200011410112671100180258001010800001080000501166750002691926741267081666731670280010208000020800002673126722118002110910108000080000010800003908003503880000613539050200021600332670566280000102672326723267232670926709
800242672220001053010267070180025800101080000108000050116760500268962672326737166673167028001020800002080000267272670811800211091010800008000001080000390800350358003561360050200011600132671906080000102670926709267232672326727
80024267222000114100026693218012258001010800001080000501166772002669726708267081665231670280010208000020800002671326736118002110910108000080000010800003908003523580035613539050200031600312671906080000102672326709267232670926709
800242672220001017300026707218181225800101080000108000050116884300266982672726708166733166888001020800002080000267222671011800211091010800008000001080000390800350931800350000050200011600132672500280000102672326709267092670926723
80024267082000110001267070181802580010108000010800005011722400026930267412670816668316702800102080000208000026708267081180021109101080000800000108000000800000358000061039050200031601312671906080000102672326711267232670926723
80024267092000114100126693018181225800101080000128000050116675001266832670826722166523166888001020800002080000267082672211800211091010800008000001080000008000003580000613539050200031600312671960280000102672326723267232672326709
800242670820001041000266930000258001010800001080000501167605002669726708267221665231668880010208000020800002670826722118002110910108000080000010800003908000003580000003539050200031600322670500480000102672326723267232672826709
800242672220001000012670701818112580010108000010800005011688430026700267082672216652316688800102080000208000026722267081180021109101080000800000108000039080035008003561039050200031600312671966080000102672326723267092672326729
800242670820001141001267072180025800101080000108000050116675000272432672226845166773166888001020800002080000267272672211800211091010800008000011080000390800350358003561039050200051600442671966280000102670926709267092670926730