From e9cd536c543a0b722cb3acdee2cf2c7f881a9016 Mon Sep 17 00:00:00 2001 From: isaacs Date: Mon, 10 Jun 2019 14:47:25 -0700 Subject: [PATCH] Use custom cachable fs.realpath implementation In this use case, we don't care much about a lot of the stuff that fs.realpath can (and should!) do. The only thing that's relevant to reading a package tree is whether package folders are symbolic links, and if so, where they point. Additionally, we don't need to re-start the fs.lstat party every time we walk to a new directory. While it makes sense for fs.realpath to do this in the general case, it's not required when reading a package tree, and results in a geometric explosion of lstat syscalls. For example, if a project is in /Users/hyooman/projects/company/website, and it has 1000 dependencies in node_modules, then a whopping 6,000 lstat calls will be made just to repeatedly verify that /Users/hyooman/projects/company/website/node_modules has not moved! In this implementation, every realpath call is cached, as is every lstat. Additionally, process.cwd() is assumed to be "real enough", and added to the cache initially, which means almost never having to walk all the way up to the root directory. In the npm cli project, this drops the lstat count from 14885 to 3054 for a single call to read-package-tree on my system. Larger projects, or projects deeper in a folder tree, will have even larger reductions. This does not account, itself, for a particularly large speed-up, since lstat calls do tend to be fairly fast, and the repetitiveness means that there are a lot of hits in the file system's stat cache. But it does make read-package-tree 10-30% faster in common use cases. --- package.json | 3 +- realpath.js | 92 ++++++++++++++++++++ rpt.js | 108 ++++++++++++++++++------ tap-snapshots/test-basic.js-TAP.test.js | 30 +++++++ test/basic.js | 33 ++++++++ 5 files changed, 238 insertions(+), 28 deletions(-) create mode 100644 realpath.js diff --git a/package.json b/package.json index 3ce66fd..857ea7f 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,8 @@ }, "homepage": "https://github.com/npm/read-package-tree", "files": [ - "rpt.js" + "rpt.js", + "realpath.js" ], "tap": { "100": true diff --git a/realpath.js b/realpath.js new file mode 100644 index 0000000..814e560 --- /dev/null +++ b/realpath.js @@ -0,0 +1,92 @@ +// look up the realpath, but cache stats to minimize overhead +// If the parent folder is in the realpath cache, then we just +// lstat the child, since there's no need to do a full realpath +// This is not a separate module, and is much simpler than Node's +// built-in fs.realpath, because we only care about symbolic links, +// so we can handle many fewer edge cases. + +const fs = require('fs') +const { promisify } = require('util') +const readlink = promisify(fs.readlink) +const lstat = promisify(fs.lstat) +const { resolve, basename, dirname } = require('path') + +const realpathCached = (path, rpcache, stcache, depth) => { + // just a safety against extremely deep eloops + /* istanbul ignore next */ + if (depth > 2000) + throw eloop(path) + + if (rpcache.has(path)) + return Promise.resolve(rpcache.get(path)) + + const dir = dirname(path) + const base = basename(path) + + if (base && rpcache.has(dir)) + return realpathChild(dir, base, rpcache, stcache, depth) + + // if it's the root, then we know it's real + if (!base) { + rpcache.set(dir, dir) + return Promise.resolve(dir) + } + + // the parent, what is that? + // find out, and then come back. + return realpathCached(dir, rpcache, stcache, depth + 1).then(() => + realpathCached(path, rpcache, stcache, depth + 1)) +} + +const lstatCached = (path, stcache) => { + if (stcache.has(path)) + return Promise.resolve(stcache.get(path)) + + const p = lstat(path).then(st => { + stcache.set(path, st) + return st + }) + stcache.set(path, p) + return p +} + +// This is a slight fib, as it doesn't actually occur during a stat syscall. +// But file systems are giant piles of lies, so whatever. +const eloop = path => + Object.assign(new Error( + `ELOOP: too many symbolic links encountered, stat '${path}'`), { + errno: -62, + syscall: 'stat', + code: 'ELOOP', + path: path, + }) + +const realpathChild = (dir, base, rpcache, stcache, depth) => { + const realdir = rpcache.get(dir) + // that unpossible + /* istanbul ignore next */ + if (typeof realdir === 'undefined') + throw new Error('in realpathChild without parent being in realpath cache') + + const realish = resolve(realdir, base) + return lstatCached(realish, stcache).then(st => { + if (!st.isSymbolicLink()) { + rpcache.set(resolve(dir, base), realish) + return realish + } + + let res + return readlink(realish).then(target => { + const resolved = res = resolve(realdir, target) + if (realish === resolved) + throw eloop(realish) + + return realpathCached(resolved, rpcache, stcache, depth + 1) + }).then(real => { + rpcache.set(resolve(dir, base), real) + return real + }) + }) +} + +module.exports = realpathCached diff --git a/rpt.js b/rpt.js index 046ad54..8e7ae95 100644 --- a/rpt.js +++ b/rpt.js @@ -1,16 +1,17 @@ const fs = require('fs') const { promisify } = require('util') -const realpath = promisify(fs.realpath) -const { basename, dirname, join } = require('path') +const { resolve, basename, dirname, join } = require('path') const rpj = promisify(require('read-package-json')) const readdir = promisify(require('readdir-scoped-modules')) +const realpath = require('./realpath.js') let ID = 0 class Node { constructor (pkg, logical, physical, er, cache) { // should be impossible. + const cached = cache.get(physical) /* istanbul ignore next */ - if (cache.get(physical)) + if (cached && !cached.then) throw new Error('re-creating already instantiated node') cache.set(physical, this) @@ -34,37 +35,84 @@ class Node { class Link extends Node { constructor (pkg, logical, physical, realpath, er, cache) { super(pkg, logical, physical, er, cache) + + // if the target has started, but not completed, then + // a Promise will be in the cache to indicate this. const cachedTarget = cache.get(realpath) + if (cachedTarget && cachedTarget.then) + cachedTarget.then(node => this.target = node) + this.target = cachedTarget || new Node(pkg, logical, realpath, er, cache) this.realpath = realpath this.isLink = true - this.children = this.target.children this.error = er + // convenience method only + /* istanbul ignore next */ + Object.defineProperty(this, 'children', { + get () { + return this.target.children + }, + set (c) { + this.target.children = c + }, + enumerable: true + }) } } -const loadNode = (logical, physical, cache) => new Promise((res, rej) => { - res(cache.get(physical) || realpath(physical) - .then(real => - rpj(join(real, 'package.json')) - .then(pkg => [real, pkg, null], er => [real, null, er]) - .then(([real, pkg, er]) => - physical === real ? new Node(pkg, logical, physical, er, cache) - : new Link(pkg, logical, physical, real, er, cache) - ), - // if the realpath fails, don't bother with the rest - er => new Node(null, logical, physical, er, cache)) - ) -}) - -const loadChildren = (node, cache, filterWith) => { +// this is the way it is to expose a timing issue which is difficult to +// test otherwise. The creation of a Node may take slightly longer than +// the creation of a Link that targets it. If the Node has _begun_ its +// creation phase (and put a Promise in the cache) then the Link will +// get a Promise as its cachedTarget instead of an actual Node object. +// This is not a problem, because it gets resolved prior to returning +// the tree or attempting to load children. However, it IS remarkably +// difficult to get to happen in a test environment to verify reliably. +// Hence this kludge. +const newNode = (pkg, logical, physical, er, cache) => + process.env._TEST_RPT_SLOW_LINK_TARGET_ === '1' + ? new Promise(res => setTimeout(() => + res(new Node(pkg, logical, physical, er, cache)), 10)) + : new Node(pkg, logical, physical, er, cache) + +const loadNode = (logical, physical, cache, rpcache, stcache) => { + // cache temporarily holds a promise placeholder so we + // don't try to create the same node multiple times. + // this is very rare to encounter, given the aggressive + // caching on fs.realpath and fs.lstat calls, but + // it can happen in theory. + const cached = cache.get(physical) + /* istanbul ignore next */ + if (cached) + return Promise.resolve(cached) + + const p = realpath(physical, rpcache, stcache, 0).then(real => + rpj(join(real, 'package.json')) + .then(pkg => [pkg, null], er => [null, er]) + .then(([pkg, er]) => + physical === real ? newNode(pkg, logical, physical, er, cache) + : new Link(pkg, logical, physical, real, er, cache) + ), + // if the realpath fails, don't bother with the rest + er => new Node(null, logical, physical, er, cache)) + + cache.set(physical, p) + return p +} + +const loadChildren = (node, cache, filterWith, rpcache, stcache) => { + // if a Link target has started, but not completed, then + // a Promise will be in the cache to indicate this. + if (node.then) + return node.then(node => loadChildren(node, cache, filterWith, rpcache, stcache)) + const nm = join(node.path, 'node_modules') - return realpath(nm) + return realpath(nm, rpcache, stcache, 0) .then(rm => readdir(rm).then(kids => [rm, kids])) .then(([rm, kids]) => Promise.all( kids.filter(kid => kid.charAt(0) !== '.' && (!filterWith || filterWith(node, kid))) - .map(kid => loadNode(join(nm, kid), join(rm, kid), cache))) + .map(kid => loadNode(join(nm, kid), join(rm, kid), cache, rpcache, stcache))) ).then(kidNodes => { kidNodes.forEach(k => k.parent = node) node.children = kidNodes.sort((a, b) => @@ -77,7 +125,7 @@ const loadChildren = (node, cache, filterWith) => { .catch(() => node) } -const loadTree = (node, did, cache, filterWith) => { +const loadTree = (node, did, cache, filterWith, rpcache, stcache) => { // impossible except in pathological ELOOP cases /* istanbul ignore next */ if (did.has(node.realpath)) @@ -85,11 +133,12 @@ const loadTree = (node, did, cache, filterWith) => { did.add(node.realpath) - return loadChildren(node, cache, filterWith) + // load children on the target, not the link + return loadChildren(node.target || node, cache, filterWith, rpcache, stcache) .then(node => Promise.all( node.children .filter(kid => !did.has(kid.realpath)) - .map(kid => loadTree(kid, did, cache, filterWith)) + .map(kid => loadTree(kid, did, cache, filterWith, rpcache, stcache)) )).then(() => node) } @@ -100,10 +149,15 @@ const rpt = (root, filterWith, cb) => { filterWith = null } + root = resolve(root) const cache = new Map() - const p = realpath(root) - .then(realRoot => loadNode(root, realRoot, cache)) - .then(node => loadTree(node, new Set(), cache, filterWith)) + // we can assume that the cwd is real enough + const cwd = process.cwd() + const rpcache = new Map([[ cwd, cwd ]]) + const stcache = new Map() + const p = realpath(root, rpcache, stcache, 0) + .then(realRoot => loadNode(root, realRoot, cache, rpcache, stcache)) + .then(node => loadTree(node, new Set(), cache, filterWith, rpcache, stcache)) if (typeof cb === 'function') p.then(tree => cb(null, tree), cb) diff --git a/tap-snapshots/test-basic.js-TAP.test.js b/tap-snapshots/test-basic.js-TAP.test.js index 9100530..46b4dc6 100644 --- a/tap-snapshots/test-basic.js-TAP.test.js +++ b/tap-snapshots/test-basic.js-TAP.test.js @@ -40,6 +40,20 @@ root@1.2.3 test/fixtures/linkedroot └── foo@1.2.3 test/fixtures/linkedroot/node_modules/foo ` +exports[`test/basic.js TAP looking outside of cwd > must match snapshot 1`] = ` +root@1.2.3 test/fixtures/root +├─┬ @scope/x@1.2.3 test/fixtures/root/node_modules/@scope/x +│ └─┬ glob@4.0.5 test/fixtures/root/node_modules/@scope/x/node_modules/glob +│ ├── graceful-fs@3.0.2 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/graceful-fs +│ ├── inherits@2.0.1 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/inherits +│ ├─┬ minimatch@1.0.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/minimatch +│ │ ├── lru-cache@2.5.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/minimatch/node_modules/lru-cache +│ │ └── sigmund@1.0.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/minimatch/node_modules/sigmund +│ └── once@1.3.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/once +├── @scope/y@1.2.3 test/fixtures/root/node_modules/@scope/y +└── foo@1.2.3 test/fixtures/root/node_modules/foo +` + exports[`test/basic.js TAP noname > noname tree 1`] = ` test/fixtures/noname └── test/fixtures/noname/node_modules/foo @@ -79,3 +93,19 @@ selflink@1.2.3 test/fixtures/selflink │ └── once@1.3.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/once └── selflink@1.2.3 test/fixtures/selflink (symlink) ` + +exports[`test/basic.js TAP shake out Link target timing issue > must match snapshot 1`] = ` +selflink@1.2.3 test/fixtures/selflink +├── @scope/y@1.2.3 test/fixtures/selflink/node_modules/@scope/y +├─┬ @scope/z@1.2.3 test/fixtures/selflink/node_modules/@scope/z +│ └── glob@4.0.5 test/fixtures/selflink/node_modules/foo/node_modules/glob (symlink) +└─┬ foo@1.2.3 test/fixtures/selflink/node_modules/foo + ├─┬ glob@4.0.5 test/fixtures/selflink/node_modules/foo/node_modules/glob + │ ├── graceful-fs@3.0.2 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/graceful-fs + │ ├── inherits@2.0.1 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/inherits + │ ├─┬ minimatch@1.0.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/minimatch + │ │ ├── lru-cache@2.5.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/minimatch/node_modules/lru-cache + │ │ └── sigmund@1.0.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/minimatch/node_modules/sigmund + │ └── once@1.3.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/once + └── selflink@1.2.3 test/fixtures/selflink (symlink) +` diff --git a/test/basic.js b/test/basic.js index 093f315..128002a 100644 --- a/test/basic.js +++ b/test/basic.js @@ -85,6 +85,22 @@ test('filterWith', t => ).then(d => t.matchSnapshot(archy(archyize(d)).trim()), 'only 1 level deep') ) +test('looking outside of cwd', t => { + const cwd = process.cwd() + t.teardown(() => process.chdir(cwd)) + process.chdir('test/fixtures/selflink') + return rpt('../root').then(d => + t.matchSnapshot(archy(archyize(d)).trim())) +}) + +test('shake out Link target timing issue', t => { + process.env._TEST_RPT_SLOW_LINK_TARGET_ = '1' + const cwd = process.cwd() + t.teardown(() => process.env._TEST_RPT_SLOW_LINK_TARGET_ = '') + return rpt(path.resolve(fixtures, 'selflink')).then(d => + t.matchSnapshot(archy(archyize(d)).trim())) +}) + test('broken json', function (t) { rpt(path.resolve(fixtures, 'bad'), function (er, d) { t.ok(d.error, 'Got an error object') @@ -152,6 +168,23 @@ function archyize (d, seen) { } } +test('realpath gutchecks', t => { + const d = path.resolve(cwd, 'test/fixtures') + const realpath = require('../realpath.js') + const {realpathSync} = fs + Object.keys(symlinks).map(link => t.test(link, t => + realpath( + path.resolve(d, link), + new Map(), + new Map(), + 0 + ).then( + real => t.equal(real, realpathSync(path.resolve(d, link))), + er => t.throws(()=> realpathSync(path.resolve(d, link))) + ))) + t.end() +}) + test('cleanup', function (t) { cleanup() t.end()