fix(sitemap): filter all routes with robots meta containing noindex (#…
slorber committed Sep 1, 2022
1 parent 6f1d066 commit c458f28
Showing 4 changed files with 90 additions and 13 deletions.
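In short: the old implementation only excluded a route from the sitemap when a robots meta tag's content was exactly the string `noindex`; this fix switches to a case-insensitive substring check, so multi-directive values such as `noindex, nofollow` (in any casing) are now filtered too. A minimal standalone sketch of the behavioral difference (illustration only, not the plugin code itself):

```ts
// Hypothetical side-by-side of the two predicates, for illustration.
const oldMatch = (content: string): boolean => content === 'noindex';
const newMatch = (content: string): boolean =>
  content.toLowerCase().includes('noindex');

console.log(oldMatch('NoFolloW, NoiNDeX')); // false — route was wrongly kept in the sitemap
console.log(newMatch('NoFolloW, NoiNDeX')); // true  — route is now correctly excluded
```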
5 changes: 4 additions & 1 deletion packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts
@@ -158,7 +158,10 @@ describe('createSitemap', () => {
         meta: {
           // @ts-expect-error: bad lib def
           toComponent: () => [
-            React.createElement('meta', {name: 'robots', content: 'noindex'}),
+            React.createElement('meta', {
+              name: 'robots',
+              content: 'NoFolloW, NoiNDeX',
+            }),
           ],
         },
       },
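The updated fixture deliberately uses mixed casing and a multi-directive value, so the test exercises both relaxations at once. A hedged sketch of the assertion shape such a test typically pairs with (the variable names, routes, and hostname here are assumptions, not visible in this hunk):

```ts
// Hypothetical: the route carrying the noindex head entry should be
// absent from the generated XML, while normal routes remain present.
const sitemap = await createSitemap(siteConfig, ['/', '/noindex'], head, options);
expect(sitemap).toContain('<loc>https://example.com/</loc>'); // assumed hostname
expect(sitemap).not.toContain('/noindex');
```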
53 changes: 42 additions & 11 deletions packages/docusaurus-plugin-sitemap/src/createSitemap.ts
@@ -13,6 +13,40 @@ import type {DocusaurusConfig} from '@docusaurus/types';
 import type {HelmetServerState} from 'react-helmet-async';
 import type {PluginOptions} from './options';
 
+function isNoIndexMetaRoute({
+  head,
+  route,
+}: {
+  head: {[location: string]: HelmetServerState};
+  route: string;
+}) {
+  const isNoIndexMetaTag = ({
+    name,
+    content,
+  }: {
+    name?: string;
+    content?: string;
+  }): boolean => {
+    if (!name || !content) {
+      return false;
+    }
+    return (
+      // meta name is not case-sensitive
+      name.toLowerCase() === 'robots' &&
+      // Robots directives are not case-sensitive
+      content.toLowerCase().includes('noindex')
+    );
+  };
+
+  // https://github.com/staylor/react-helmet-async/pull/167
+  const meta = head[route]?.meta.toComponent() as unknown as
+    | ReactElement<{name?: string; content?: string}>[]
+    | undefined;
+  return meta?.some((tag) =>
+    isNoIndexMetaTag({name: tag.props.name, content: tag.props.content}),
+  );
+}
+
 export default async function createSitemap(
   siteConfig: DocusaurusConfig,
   routesPaths: string[],
@@ -27,18 +61,15 @@ export default async function createSitemap(
 
   const ignoreMatcher = createMatcher(ignorePatterns);
 
-  const includedRoutes = routesPaths.filter((route) => {
-    if (route.endsWith('404.html') || ignoreMatcher(route)) {
-      return false;
-    }
-    // https://github.com/staylor/react-helmet-async/pull/167
-    const meta = head[route]?.meta.toComponent() as unknown as
-      | ReactElement<{name?: string; content?: string}>[]
-      | undefined;
-    return !meta?.some(
-      (tag) => tag.props.name === 'robots' && tag.props.content === 'noindex',
+  function isRouteExcluded(route: string) {
+    return (
+      route.endsWith('404.html') ||
+      ignoreMatcher(route) ||
+      isNoIndexMetaRoute({head, route})
     );
-  });
+  }
+
+  const includedRoutes = routesPaths.filter((route) => !isRouteExcluded(route));
 
   if (includedRoutes.length === 0) {
     return null;
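With this refactor, route exclusion is a single predicate: 404 pages, ignore-pattern matches, and noindex-tagged routes are all handled in one place. The extracted tag check is also easy to reason about in isolation; here is a self-contained re-implementation mirroring the predicate added above, with sample inputs (illustrative only):

```ts
// Standalone mirror of the isNoIndexMetaTag check from the diff above.
function isNoIndexMetaTag({name, content}: {name?: string; content?: string}): boolean {
  if (!name || !content) {
    return false;
  }
  return (
    name.toLowerCase() === 'robots' && // meta name is not case-sensitive
    content.toLowerCase().includes('noindex') // directives are not case-sensitive
  );
}

console.log(isNoIndexMetaTag({name: 'robots', content: 'noindex'})); // true
console.log(isNoIndexMetaTag({name: 'ROBOTS', content: 'NoFolloW, NoiNDeX'})); // true
console.log(isNoIndexMetaTag({name: 'robots', content: 'nofollow'})); // false
console.log(isNoIndexMetaTag({name: 'description', content: 'noindex'})); // false
```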
20 changes: 19 additions & 1 deletion website/docs/seo.md
@@ -124,14 +124,32 @@ Read more about the robots file in [the Google documentation](https://developers
 
 :::caution
 
-**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. Use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata) to prevent it from appearing in search results entirely.
+**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed.
+
+To prevent your whole Docusaurus site from being indexed, use the [`noIndex`](./api/docusaurus.config.js.md#noIndex) site config. Some [hosting providers](./deployment.mdx) may also let you configure an `X-Robots-Tag: noindex` HTTP header (GitHub Pages does not support this).
+
+To prevent a single page from being indexed, use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata). Read more about the [robots meta tag](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag).
 
 :::
 
 ## Sitemap file {#sitemap-file}
 
 Docusaurus provides the [`@docusaurus/plugin-sitemap`](./api/plugins/plugin-sitemap.md) plugin, which is shipped with `preset-classic` by default. It autogenerates a `sitemap.xml` file which will be available at `https://example.com/[baseUrl]/sitemap.xml` after the production build. This sitemap metadata helps search engine crawlers crawl your site more accurately.
 
+:::tip
+
+The sitemap plugin automatically filters pages containing a `noindex` [robots meta directive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag).
+
+For example, [`/examples/noIndex`](/examples/noIndex) is not included in the [Docusaurus sitemap.xml file](pathname:///sitemap.xml) because it contains the following [page metadata](#single-page-metadata):
+
+```html
+<head>
+  <meta name="robots" content="noindex, nofollow" />
+</head>
+```
+
+:::
+
 ## Human readable links {#human-readable-links}
 
 Docusaurus uses your file names as links, but you can always change that using slugs, see this [tutorial](./guides/docs/docs-introduction.md#document-id) for more details.
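For reference, the site-wide `noIndex` option mentioned in the docs above is a one-liner in the site config. A minimal sketch (the title/url values are placeholders, not from this commit):

```js
// docusaurus.config.js — minimal sketch; field values are placeholders.
module.exports = {
  title: 'My Site',
  url: 'https://example.com',
  baseUrl: '/',
  // Emits a site-wide robots meta tag asking crawlers not to index any page.
  noIndex: true,
};
```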
25 changes: 25 additions & 0 deletions website/src/pages/examples/noIndex.md
@@ -0,0 +1,25 @@
+# No Index Page example
+
+<head>
+  <meta name="robots" content="nOiNdeX, NoFolLoW" />
+</head>
+
+This page will not be indexed by search engines because it contains the following [page metadata](/docs/seo#single-page-metadata) markup:
+
+```html
+<head>
+  <meta name="robots" content="noindex, nofollow" />
+</head>
+```
+
+:::tip
+
+The sitemap plugin filters out pages containing a `noindex` content value. This page doesn't appear in the Docusaurus [sitemap.xml](pathname:///sitemap.xml) file.
+
+:::
+
+:::note
+
+Robots directives are [case-insensitive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag#directives).
+
+:::
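The markdown `<head>` block in this new page has a React-page equivalent via the `@docusaurus/Head` component. A sketch (the page path and component name are assumptions for illustration):

```tsx
// src/pages/examples/noIndexReact.tsx — hypothetical path, for illustration.
import React from 'react';
import Head from '@docusaurus/Head';

export default function NoIndexReactPage(): JSX.Element {
  return (
    <>
      <Head>
        {/* Same directive as the markdown example; casing doesn't matter. */}
        <meta name="robots" content="noindex, nofollow" />
      </Head>
      <main>This page asks search engines not to index it.</main>
    </>
  );
}
```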
