Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 8B)

Test 1: uops

Code:

  ld1 { v0.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
6100528451214171300010482327941111231071000100010005000615943027889282103101000100010002804828040116100110001000010002100201100222214118101557312337685920628349238221248492786115003124281319410002828128433282882826228324
6100428253210171800300509027925100232841000100010005000215973027921281073101000100010002823528075116100110001000010002100000100020214145105767279351154820505342438211547492792214333122891326710002822428235282312824828039
6100428430212151400300517328061000231331000100010005000515964027862282343101000100010002812628073116100110001000010002100001100020213448105937227351684820886353538081445532792414687122591331010002802828374288102816228268
61004281402101617002105249278641002317310001000100050006159770278752812331010001000100028153281271161001100010000100001000011000212142391008572243390135420628349238141354492783813895120771358810002816827893284152836128313
61004284592101719003004814279560002325110001000100050005159660278982815831010001000100028444282271161001100010001100521001101000202138221038372163501105020493356638231649492796214325119941374210002813928382284832848328092
6100428199210141500300518528087000232801000100010005000515986027950283763101000100010002831028234116100110001000010002100000100020214190104247269346875220916347838091358462789313993126651386110002814728342279842811028329
6100428121211191800200530027993000234391000100010005000515965027897283233101000100010002815528413116100110001000010002100090100021213292102957056341475220839340338161447462796313952116841337210002803628145281312815928483
61004284372111818002004851281180002310410001000100050003159710280522815531010001000100028095281711161001100010000100021000031000202139821032671893258104520573343538212055482787314567123801403710002808128068280682849928596
61004283062121215002105265279360002328110001000100050005159760279672853031010001000100028128281011161001100010000100021001001000202139131017672833422105120494350938171648482789413864122901398310002836928158283152831028321
610042847621019191020051282799000023588100010001000500041597802786228485310100010001000281262807911610011000100001000210010110001001377197697177335885120540333538262149542790214055121791396510002818428131281572851428314

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.8b }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005789910100020100120045119516109473256010340104100011000030100100001000010790975736524613662312001701200411200571131550311367450100302001000010000602001000010000120434120131415020110099100401001000010000010010001411000300110000111100321011071111966740004010121000040100120058120061120042120042120061
5020412006089911010020000120026119519109473256010340104100021000030100100001000010790625736668613677612001701200601200411131550311367750100302001000010000602001000010000120110120073115020110099100401001000010000010010002201000202110000011100321011071111966740002130121000040100120042120061120061120042120042
502041200418991000001000012004511951910947325601064011210002100003010010000100001079097573666861366231200170120041120060113137031136775010030200100001000060200100001000012012312005511502011009910040100100001000001001000120100020111000011111032101107111196644001110001000040100120042120094120060120042120042
50204120060899101000200001200451195191094732560106401021000210000301001000010000107907157365246136776120036012004112006011315503113677501003020010000100006020010000100001201011200801150201100991004010010000100000100100012010002001100000111003210110711119664400041010121000040100120042120058120058120058120042
5020412005789910100020100120045119513109455256010340102100021000030100100001000010790975735750613677612003601200411200571131520311367750100302001000010000602001000010000120080120060115020110099100401001000010000010010002101000220110000111100321011351111966740002130121000040100120061120042120042120042120061
50204120041899101000201001200421195191094702560106401041000210000301001000010000107908957366686133458120036012006012006011315503113771501003020010000100006020010000100001201471201561150201100991004010010000100000100100021110001011100001111003210110711119664400041313121000040100120061120061120042120061120042
50204120060900111100100001200421195161094552560103401041000210000301001000010000107906257366686133458120033312006012005711314003113677501003020010000100006020010000100001201051200601150201100991004010010000100000100100012110001111100001101103210110711119664400021313121000040100120058120061120061120042120042
5020412004189911000010000120026119513109473256010340104100021000030100100001000010790975736668613662312003601200601200411131520311367750100302001000010000602001000010000120109120093115020110099100401001000010000010010002111000100110000111100321011071111965440004131301000040100120042120042120061120042120042
502041200578991110001000012004511951910945525601064010210001100003010010000100001079071573652461366231200330120057120057113137031136305010030200100001000060200100001006412007812004211502011009910040100100001000001001000110100020111000011111032101107111196644000201001000040100120042120042120042120058120061
50204120057899100000200011200421195161094702560106401021000210000301001000010000107906257366686136776120017012006012004111313703113674501003020010053100006020010000100001201061200651150201100991004010010000100000100100021010002014100001111103210110711119664400041313121000040100120061120061120061120061120042

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200478990001100012009211950510946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000110100001100000010000110031410151071717119662400026651000040010120048120048120048120048120048
500251200478990006100012003211950510944625600134001210001100003001010000100001079523573604461334991120023120047120047113165311368750010303381000010000600201000010000120047120047115002110910400101000010000110100001100000010000110031410161071716119662400026651000040010120048120048120048120048120048
500241200358990011100012003211950510946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311418850010300201000010000600201000010000120047120047115002110910400101000010000010100031100000010000110031410171071517119662400026651000040010120093120051120441120153120048
500241206879000111100012003211950710946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000010100001100000010000110031430171071718119662400026651000040010120036120048120048120048120048
500241200478990001100012002011950510946125600134001210001100003001010000100001079517573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000110100001100000010000114031700151151717119650400026651000040010120048120048120125120048120154
5002412005093100141002120032119637110533135360013400161000110001300101000010000107953257360446133499012001112004712004711316631136865001030020100001000060020100001000012004712003511500211091040010100001000011010000110000001000011003147320107716119662400026651000040010120048120050120049120049120048
500241200478990001010012003511950510946125600134001210001100003001010000100001079523573604461334991120011120048120047113165311368650010300201021410000600201000010000120047120047115002110910400101000010000110100001100000310000112032050171161920119876400026601000040010120049120048120048120048120048
500241200478990010010012003511950810946125600134001210001100003001010000100001079523573545561334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000010100001100000310000110031470171071717119662400026651000040010120048120048120048120084120049
5002512004789910010010012003211950510946125600134001210001100003001010000100001079523573604461334991120023120047120047113165311368650010300201000010000600201000010000120047120047115002110910400101000010000010100001100000010000110031470171071717119662400026651000040010120036120048120048120048120048
5002412004790000010000120032119505109461256001340012100011000030010100001000010795235736044613349911200231200471200471131653113686500103002010000100006002010000100001200471200471150021109104001010000100000101000011000003100001100314308107816119662400026651000040010120048120048120048120048120048

Test 3: throughput

Count: 8

Code:

  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  ld1 { v0.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267282000001000100440100326721307192580100100800001008000050011665190267112673626714166583166948010020080000200800002672726708118020110099010010080000800000100800000430800380003980039603944005110316112673400135800001002673726737267372673726715
8020426736200101000000021010002669201120258010010080000100800005001166525026706267312673116654316665801002008000020080000267142673611802011009901001008000080000010080020194308001910064800000159019051101161126704014144800001002673226732267322673226732
80204267312000000000000450000026712200025801001008000010080000500116712702670626731267311665431666580100200800002008038426741267301180201100990100100800008000011008000004308003802438800396139440051101161126733013135800001002673726737267372673726737
8020426736200100000000067000012671420119258010010080000100800005001167127026702267312673116654316689801002008000020080000267382674211802011009901001008000080000010080019200080019100218003860594319151101161126728010140800001002670826708267082673226732
802042673120000000000000010022672130721258010010080000100800005001167142026711267142673616659316672801002008000020080000267272672711802011009901001008000080000010080000000800380004180038613844005110116112673300135800001002673726737267152673726737
80204267362001010010000660100126716211162580100100800001008000050011683121267062673126731166303166898010020080000200800002671426737118020110099010010080000800000100800191945080058100618000061584319151101161126704014147800001002673226728267282670826708
8020426707200000000000044000032669927020258010010080000100800005001167231026702267272672716630316689801002008000020080000267312672711802011009901001008000080000010080000044080000000388003861394400511011611267330000800001002673726715267382673726715
802042673620010100100007001000267122001925801001008000010080000500116831202670226707267271665431668980100200807672008000026742267211180201100990100100800008000001008000000080000000080039010440051101161126724014140800001002673226708267082672826732
80204267272000000000100440000126692200192580100100800001008000050011683120267062673126712166543166898010020080000200800002670726727118020110099010010080000800000100800000430800380000800386039440051101161126728014104800001002672826713267282672826708
80204267072000000000000440000326721270202580100100800001008000050011667580267122673726714166383166948010020080000200800002673126707118020110099010010080000800000100800000430800380003880000603944005110116112670400140800001002672826732267082673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526722200111838801267072181815258001010800001080000501166993002691426728267081667131669180010208000020800002670826722118002110910108000080000010800003908003503580035603939050200051600312670566080000102672726709267232670926709
8002426722200011410112671100180258001010800001080000501166750002691926741267081666731670280010208000020800002673126722118002110910108000080000010800003908003503880000613539050200021600332670566280000102672326723267232670926709
800242672220001053010267070180025800101080000108000050116760500268962672326737166673167028001020800002080000267272670811800211091010800008000001080000390800350358003561360050200011600132671906080000102670926709267232672326727
80024267222000114100026693218012258001010800001080000501166772002669726708267081665231670280010208000020800002671326736118002110910108000080000010800003908003523580035613539050200031600312671906080000102672326709267232670926709
800242672220001017300026707218181225800101080000108000050116884300266982672726708166733166888001020800002080000267222671011800211091010800008000001080000390800350931800350000050200011600132672500280000102672326709267092670926723
80024267082000110001267070181802580010108000010800005011722400026930267412670816668316702800102080000208000026708267081180021109101080000800000108000000800000358000061039050200031601312671906080000102672326711267232670926723
80024267092000114100126693018181225800101080000128000050116675001266832670826722166523166888001020800002080000267082672211800211091010800008000001080000008000003580000613539050200031600312671960280000102672326723267232672326709
800242670820001041000266930000258001010800001080000501167605002669726708267221665231668880010208000020800002670826722118002110910108000080000010800003908000003580000003539050200031600322670500480000102672326723267232672826709
800242672220001000012670701818112580010108000010800005011688430026700267082672216652316688800102080000208000026722267081180021109101080000800000108000039080035008003561039050200031600312671966080000102672326723267092672326729
800242670820001141001267072180025800101080000108000050116675000272432672226845166773166888001020800002080000267272672211800211091010800008000011080000390800350358003561039050200051600442671966280000102670926709267092670926730