From df588911265de18737ccdbbda17ac3708a6098b0 Mon Sep 17 00:00:00 2001 From: Fraser Thompson Date: Wed, 18 Jan 2023 01:42:41 +1300 Subject: [PATCH] feat(gatsby-source-filesystem): Only generate hashes when a file has changed, and add an option for skipping hashing (#37464) Co-authored-by: LekoArts --- packages/gatsby-source-filesystem/README.md | 76 +++--- .../gatsby-source-filesystem/package.json | 1 - .../src/__tests__/create-file-node.js | 236 ++++++++++++------ .../src/create-file-node.js | 22 +- .../src/gatsby-node.js | 12 +- 5 files changed, 236 insertions(+), 111 deletions(-) diff --git a/packages/gatsby-source-filesystem/README.md b/packages/gatsby-source-filesystem/README.md index 7b14776de2589..532092749740b 100644 --- a/packages/gatsby-source-filesystem/README.md +++ b/packages/gatsby-source-filesystem/README.md @@ -1,35 +1,28 @@ # gatsby-source-filesystem -A Gatsby source plugin for sourcing data into your Gatsby application -from your local filesystem. +A Gatsby source plugin for sourcing data into your Gatsby application from your local filesystem. -The plugin creates `File` nodes from files. The various "transformer" -plugins can transform `File` nodes into various other types of data e.g. -`gatsby-transformer-json` transforms JSON files into JSON data nodes and -`gatsby-transformer-remark` transforms markdown files into `MarkdownRemark` -nodes from which you can query an HTML representation of the markdown. +The plugin creates `File` nodes from files. The various "transformer" plugins can transform `File` nodes into various other types of data e.g. [`gatsby-transformer-json`](https://www.gatsbyjs.com/plugins/gatsby-transformer-json/) transforms JSON files into JSON data nodes and [`gatsby-transformer-remark`](https://www.gatsbyjs.com/plugins/gatsby-transformer-remark/) transforms markdown files into `MarkdownRemark` nodes from which you can query an HTML representation of the markdown. 
## Install -`npm install gatsby-source-filesystem` +```shell +npm install gatsby-source-filesystem +``` ## How to use -```javascript -// In your gatsby-config.js +You can have multiple instances of this plugin to read source nodes from different locations on your filesystem. Be sure to give each instance a unique `name`. + +```js:title=gatsby-config.js module.exports = { plugins: [ - // You can have multiple instances of this plugin - // to read source nodes from different locations on your - // filesystem. - // - // The following sets up the Jekyll pattern of having a - // "pages" directory for Markdown files and a "data" directory - // for `.json`, `.yaml`, `.csv`. { resolve: `gatsby-source-filesystem`, options: { + // The unique name for each instance name: `pages`, + // Path to the directory path: `${__dirname}/src/pages/`, }, }, @@ -38,7 +31,10 @@ module.exports = { options: { name: `data`, path: `${__dirname}/src/data/`, - ignore: [`**/\.*`], // ignore files starting with a dot + // Ignore files starting with a dot + ignore: [`**/\.*`], + // Use "mtime" and "inode" to fingerprint files (to check if file has changed) + fastHash: true, }, }, ], @@ -47,9 +43,23 @@ module.exports = { ## Options -In addition to the name and path parameters you may pass an optional `ignore` array of file globs to ignore. +### name + +**Required** + +A unique name for the `gatsby-source-filesystem` instance. This name will also be a key on the `File` node called `sourceInstanceName`. You can use this e.g. for filtering. + +### path + +**Required** + +Path to the folder that should be sourced. Ideally an absolute path. -They will be added to the following default list: +### ignore + +**Optional** + +Array of file globs to ignore. 
They will be added to the following default list: ```text **/*.un~ @@ -62,8 +72,24 @@ They will be added to the following default list: ../**/dist/** ``` +### fastHash + +**Optional** + +By default, `gatsby-source-filesystem` creates an MD5 hash of each file to determine if it has changed between sourcing. However, on sites with many large files this can lead to a significant slowdown. Thus you can enable the `fastHash` setting to use an alternative hashing mechanism. + +`fastHash` uses the `mtime` and `inode` to fingerprint the files. On a modern OS this can be considered a robust solution to determine if a file has changed, however on older systems it can be unreliable. Therefore it's not enabled by default. + +### Environment variables + To prevent concurrent requests overload of `processRemoteNode`, you can adjust the `200` default concurrent downloads, with `GATSBY_CONCURRENT_DOWNLOAD` environment variable. +If some remote files still fail to download because of a spotty network or slow connection, even after multiple retries and adjusting concurrent downloads, you can adjust the timeout and retry settings with these environment variables: + +- `GATSBY_STALL_RETRY_LIMIT`, default: `3` +- `GATSBY_STALL_TIMEOUT`, default: `30000` +- `GATSBY_CONNECTION_TIMEOUT`, default: `30000` + ## How to query You can query file nodes like the following: @@ -263,7 +289,7 @@ The `createFileNodeFromBuffer` helper accepts a `Buffer`, caches its contents to The name of the file can be passed to the `createFileNodeFromBuffer` helper. If no name is given, the content hash will be used to determine the name. 
-## Example usage +#### Example usage The following example is adapted from the source of [`gatsby-source-mysql`](https://github.com/malcolm-kee/gatsby-source-mysql): @@ -338,11 +364,3 @@ function createMySqlNodes({ name, __sql, idField, keys }, results, ctx) { module.exports = createMySqlNodes ``` - -## Troubleshooting - -In case that due to spotty network, or slow connection, some remote files fail to download. Even after multiple retries and adjusting concurrent downloads, you can adjust timeout and retry settings with these environment variables: - -- `GATSBY_STALL_RETRY_LIMIT`, default: `3` -- `GATSBY_STALL_TIMEOUT`, default: `30000` -- `GATSBY_CONNECTION_TIMEOUT`, default: `30000` diff --git a/packages/gatsby-source-filesystem/package.json b/packages/gatsby-source-filesystem/package.json index db89aa0450ad0..d6716f4d2d7b0 100644 --- a/packages/gatsby-source-filesystem/package.json +++ b/packages/gatsby-source-filesystem/package.json @@ -12,7 +12,6 @@ "file-type": "^16.5.4", "fs-extra": "^11.1.0", "gatsby-core-utils": "^4.5.0-next.0", - "md5-file": "^5.0.0", "mime": "^3.0.0", "pretty-bytes": "^5.6.0", "valid-url": "^1.0.9", diff --git a/packages/gatsby-source-filesystem/src/__tests__/create-file-node.js b/packages/gatsby-source-filesystem/src/__tests__/create-file-node.js index ac8149ddfbf49..19fa6e4eabb13 100644 --- a/packages/gatsby-source-filesystem/src/__tests__/create-file-node.js +++ b/packages/gatsby-source-filesystem/src/__tests__/create-file-node.js @@ -5,6 +5,95 @@ const fs = require(`fs-extra`) const fsStatBak = fs.stat +const createMockCache = (get = jest.fn()) => { + return { + get, + set: jest.fn(), + directory: __dirname, + } +} + +const createMockCreateNodeId = () => { + const createNodeId = jest.fn() + createNodeId.mockReturnValue(`uuid-from-gatsby`) + return createNodeId +} + +// MD5 hash of the file (if the mock below changes this should change) +const fileHash = `8d777f385d3dfec8815d20f7496026dc` + +// mtime + inode (if the mock below 
changes this should change) +const fileFastHash = `123456123456` + +function testNode(node, dname, fname, contentDigest) { + // Sanitize all filenames + Object.keys(node).forEach(key => { + if (typeof node[key] === `string`) { + node[key] = node[key].replace(new RegExp(dname, `g`), ``) + node[key] = node[key].replace(new RegExp(fname, `g`), ``) + } + }) + Object.keys(node.internal).forEach(key => { + if (typeof node.internal[key] === `string`) { + node.internal[key] = node.internal[key].replace( + new RegExp(dname, `g`), + `` + ) + node.internal[key] = node.internal[key].replace( + new RegExp(fname, `g`), + `` + ) + } + }) + + // Note: this snapshot should update if the mock below is changed + expect(node).toMatchInlineSnapshot(` + Object { + "absolutePath": "/f", + "accessTime": "1970-01-01T00:02:03.456Z", + "atime": "1970-01-01T00:02:03.456Z", + "atimeMs": 123456, + "base": "f", + "birthTime": "1970-01-01T00:02:03.456Z", + "birthtime": "1970-01-01T00:02:03.456Z", + "birthtimeMs": 123456, + "blksize": 123456, + "blocks": 123456, + "changeTime": "1970-01-01T00:02:03.456Z", + "children": Array [], + "ctime": "1970-01-01T00:02:03.456Z", + "ctimeMs": 123456, + "dev": 123456, + "dir": "", + "ext": "", + "extension": "", + "id": "uuid-from-gatsby", + "ino": 123456, + "internal": Object { + "contentDigest": "${contentDigest}", + "description": "File \\"/f\\"", + "mediaType": "application/octet-stream", + "type": "File", + }, + "mode": 123456, + "modifiedTime": "1970-01-01T00:02:03.456Z", + "mtime": "1970-01-01T00:02:03.456Z", + "mtimeMs": 123456, + "name": "f", + "nlink": 123456, + "parent": null, + "prettySize": "123 kB", + "rdev": 123456, + "relativeDirectory": "", + "relativePath": "/f", + "root": "", + "size": 123456, + "sourceInstanceName": "__PROGRAMMATIC__", + "uid": 123456, + } + `) +} + // FIXME: This test needs to not use snapshots because of file differences // and locations across users and CI systems describe(`create-file-node`, () => { @@ -43,12 +132,15 @@ 
describe(`create-file-node`, () => { }) it(`creates a file node`, async () => { - const createNodeId = jest.fn() - createNodeId.mockReturnValue(`uuid-from-gatsby`) + const createNodeId = createMockCreateNodeId() + + const cache = createMockCache() + return createFileNode( path.resolve(`${__dirname}/fixtures/file.json`), createNodeId, - {} + {}, + cache ) }) @@ -56,80 +148,74 @@ describe(`create-file-node`, () => { const dname = fs.mkdtempSync(`gatsby-create-file-node-test`).trim() try { const fname = path.join(dname, `f`) - console.log(dname, fname) fs.writeFileSync(fname, `data`) try { - const createNodeId = jest.fn() - createNodeId.mockReturnValue(`uuid-from-gatsby`) - - const node = await createFileNode(fname, createNodeId, {}) - - // Sanitize all filenames - Object.keys(node).forEach(key => { - if (typeof node[key] === `string`) { - node[key] = node[key].replace(new RegExp(dname, `g`), ``) - node[key] = node[key].replace(new RegExp(fname, `g`), ``) - } - }) - Object.keys(node.internal).forEach(key => { - if (typeof node.internal[key] === `string`) { - node.internal[key] = node.internal[key].replace( - new RegExp(dname, `g`), - `` - ) - node.internal[key] = node.internal[key].replace( - new RegExp(fname, `g`), - `` - ) - } - }) - - // Note: this snapshot should update if the mock above is changed - expect(node).toMatchInlineSnapshot(` - Object { - "absolutePath": "/f", - "accessTime": "1970-01-01T00:02:03.456Z", - "atime": "1970-01-01T00:02:03.456Z", - "atimeMs": 123456, - "base": "f", - "birthTime": "1970-01-01T00:02:03.456Z", - "birthtime": "1970-01-01T00:02:03.456Z", - "birthtimeMs": 123456, - "blksize": 123456, - "blocks": 123456, - "changeTime": "1970-01-01T00:02:03.456Z", - "children": Array [], - "ctime": "1970-01-01T00:02:03.456Z", - "ctimeMs": 123456, - "dev": 123456, - "dir": "", - "ext": "", - "extension": "", - "id": "uuid-from-gatsby", - "ino": 123456, - "internal": Object { - "contentDigest": "8d777f385d3dfec8815d20f7496026dc", - "description": 
"File \\"/f\\"", - "mediaType": "application/octet-stream", - "type": "File", - }, - "mode": 123456, - "modifiedTime": "1970-01-01T00:02:03.456Z", - "mtime": "1970-01-01T00:02:03.456Z", - "mtimeMs": 123456, - "name": "f", - "nlink": 123456, - "parent": null, - "prettySize": "123 kB", - "rdev": 123456, - "relativeDirectory": "", - "relativePath": "/f", - "root": "", - "size": 123456, - "sourceInstanceName": "__PROGRAMMATIC__", - "uid": 123456, - } - `) + const createNodeId = createMockCreateNodeId() + + const emptyCache = { + get: jest.fn(), + set: jest.fn(), + directory: __dirname, + } + + const node = await createFileNode(fname, createNodeId, {}, emptyCache) + + testNode(node, dname, fname, fileHash) + } finally { + fs.unlinkSync(fname) + } + } finally { + fs.rmdirSync(dname) + } + }) + + it(`records the shape of the node from cache`, async () => { + const dname = fs.mkdtempSync(`gatsby-create-file-node-test`).trim() + try { + const fname = path.join(dname, `f`) + fs.writeFileSync(fname, `data`) + try { + const createNodeId = createMockCreateNodeId() + + const getFromCache = jest.fn() + getFromCache.mockReturnValue(fileHash) + const cache = createMockCache(getFromCache) + + const nodeFromCache = await createFileNode( + fname, + createNodeId, + {}, + cache + ) + + testNode(nodeFromCache, dname, fname, fileHash) + } finally { + fs.unlinkSync(fname) + } + } finally { + fs.rmdirSync(dname) + } + }) + + it(`records the shape of the fast hashed node`, async () => { + const dname = fs.mkdtempSync(`gatsby-create-file-node-test`).trim() + try { + const fname = path.join(dname, `f`) + fs.writeFileSync(fname, `data`) + try { + const createNodeId = createMockCreateNodeId() + const cache = createMockCache() + + const nodeFastHash = await createFileNode( + fname, + createNodeId, + { + fastHash: true, + }, + cache + ) + + testNode(nodeFastHash, dname, fname, fileFastHash) } finally { fs.unlinkSync(fname) } diff --git a/packages/gatsby-source-filesystem/src/create-file-node.js 
b/packages/gatsby-source-filesystem/src/create-file-node.js index 3e9730fad6593..9877b8c751973 100644 --- a/packages/gatsby-source-filesystem/src/create-file-node.js +++ b/packages/gatsby-source-filesystem/src/create-file-node.js @@ -3,13 +3,13 @@ const fs = require(`fs-extra`) const mime = require(`mime`) const prettyBytes = require(`pretty-bytes`) -const md5File = require(`md5-file`) -const { createContentDigest, slash } = require(`gatsby-core-utils`) +const { createContentDigest, slash, md5File } = require(`gatsby-core-utils`) exports.createFileNode = async ( pathToFile, createNodeId, - pluginOptions = {} + pluginOptions = {}, + cache = null ) => { const slashed = slash(pathToFile) const parsedSlashed = path.parse(slashed) @@ -35,7 +35,21 @@ exports.createFileNode = async ( description: `Directory "${path.relative(process.cwd(), slashed)}"`, } } else { - const contentDigest = await md5File(slashedFile.absolutePath) + const key = stats.mtimeMs.toString() + stats.ino.toString() + let contentDigest + + if (pluginOptions.fastHash) { + // Skip hashing. + contentDigest = key + } else { + // Generate a hash, but only if the file has changed. + contentDigest = cache && (await cache.get(key)) + if (!contentDigest) { + contentDigest = await md5File(slashedFile.absolutePath) + if (cache) await cache.set(key, contentDigest) + } + } + const mediaType = mime.getType(slashedFile.ext) internal = { contentDigest, diff --git a/packages/gatsby-source-filesystem/src/gatsby-node.js b/packages/gatsby-source-filesystem/src/gatsby-node.js index 15ddd7e514033..5d629ca748d96 100644 --- a/packages/gatsby-source-filesystem/src/gatsby-node.js +++ b/packages/gatsby-source-filesystem/src/gatsby-node.js @@ -16,14 +16,21 @@ exports.onPreInit = ({ reporter }) => { * Create a state machine to manage Chokidar's not-ready/ready states. 
*/ const createFSMachine = ( - { actions: { createNode, deleteNode }, getNode, createNodeId, reporter }, + { + actions: { createNode, deleteNode }, + getNode, + createNodeId, + reporter, + cache, + }, pluginOptions ) => { const createAndProcessNode = path => { const fileNodePromise = createFileNode( path, createNodeId, - pluginOptions + pluginOptions, + cache ).then(fileNode => { createNode(fileNode) return null @@ -162,6 +169,7 @@ exports.pluginOptionsSchema = ({ Joi }) => Joi.object({ name: Joi.string(), path: Joi.string(), + fastHash: Joi.boolean().default(false), ignore: Joi.array().items( Joi.string(), Joi.object().regex(),