Skip to content
This repository has been archived by the owner on Jan 7, 2022. It is now read-only.

Commit

Permalink
Use custom cachable fs.realpath implementation
Browse files Browse the repository at this point in the history
In this use case, we don't care much about a lot of the stuff that
fs.realpath can (and should!) do.  The only thing that's relevant to
reading a package tree is whether package folders are symbolic links,
and if so, where they point.

Additionally, we don't need to re-start the fs.lstat party every time we
walk to a new directory.  While it makes sense for fs.realpath to do
this in the general case, it's not required when reading a package tree,
and results in a geometric explosion of lstat syscalls.  For example, if
a project is in /Users/hyooman/projects/company/website, and it has 1000
dependencies in node_modules, then a whopping 6,000 lstat calls will be
made just to repeatedly verify that
/Users/hyooman/projects/company/website/node_modules has not moved!

In this implementation, every realpath call is cached, as is every
lstat.  Additionally, process.cwd() is assumed to be "real enough", and
added to the cache initially, which means almost never having to walk
all the way up to the root directory.

In the npm cli project, this drops the lstat count from 14885 to 3054
for a single call to read-package-tree on my system.  Larger projects,
or projects deeper in a folder tree, will have even larger reductions.

This does not account, itself, for a particularly large speed-up, since
lstat calls do tend to be fairly fast, and the repetitiveness means that
there are a lot of hits in the file system's stat cache.  But it does
make read-package-tree 10-30% faster in common use cases.
  • Loading branch information
isaacs committed Jun 11, 2019
1 parent 4eed760 commit e9cd536
Show file tree
Hide file tree
Showing 5 changed files with 238 additions and 28 deletions.
3 changes: 2 additions & 1 deletion package.json
Expand Up @@ -31,7 +31,8 @@
},
"homepage": "https://github.com/npm/read-package-tree",
"files": [
"rpt.js"
"rpt.js",
"realpath.js"
],
"tap": {
"100": true
Expand Down
92 changes: 92 additions & 0 deletions realpath.js
@@ -0,0 +1,92 @@
// look up the realpath, but cache stats to minimize overhead
// If the parent folder is in the realpath cache, then we just
// lstat the child, since there's no need to do a full realpath
// This is not a separate module, and is much simpler than Node's
// built-in fs.realpath, because we only care about symbolic links,
// so we can handle many fewer edge cases.

const fs = require('fs')
const { promisify } = require('util')
const readlink = promisify(fs.readlink)
const lstat = promisify(fs.lstat)
const { resolve, basename, dirname } = require('path')

const realpathCached = (path, rpcache, stcache, depth) => {
// just a safety against extremely deep eloops
/* istanbul ignore next */
if (depth > 2000)
throw eloop(path)

if (rpcache.has(path))
return Promise.resolve(rpcache.get(path))

const dir = dirname(path)
const base = basename(path)

if (base && rpcache.has(dir))
return realpathChild(dir, base, rpcache, stcache, depth)

// if it's the root, then we know it's real
if (!base) {
rpcache.set(dir, dir)
return Promise.resolve(dir)
}

// the parent, what is that?
// find out, and then come back.
return realpathCached(dir, rpcache, stcache, depth + 1).then(() =>
realpathCached(path, rpcache, stcache, depth + 1))
}

const lstatCached = (path, stcache) => {
if (stcache.has(path))
return Promise.resolve(stcache.get(path))

const p = lstat(path).then(st => {
stcache.set(path, st)
return st
})
stcache.set(path, p)
return p
}

// This is a slight fib, as it doesn't actually occur during a stat syscall.
// But file systems are giant piles of lies, so whatever.
const eloop = path =>
Object.assign(new Error(
`ELOOP: too many symbolic links encountered, stat '${path}'`), {
errno: -62,
syscall: 'stat',
code: 'ELOOP',
path: path,
})

const realpathChild = (dir, base, rpcache, stcache, depth) => {
const realdir = rpcache.get(dir)
// that unpossible
/* istanbul ignore next */
if (typeof realdir === 'undefined')
throw new Error('in realpathChild without parent being in realpath cache')

const realish = resolve(realdir, base)
return lstatCached(realish, stcache).then(st => {
if (!st.isSymbolicLink()) {
rpcache.set(resolve(dir, base), realish)
return realish
}

let res
return readlink(realish).then(target => {
const resolved = res = resolve(realdir, target)
if (realish === resolved)
throw eloop(realish)

return realpathCached(resolved, rpcache, stcache, depth + 1)
}).then(real => {
rpcache.set(resolve(dir, base), real)
return real
})
})
}

module.exports = realpathCached
108 changes: 81 additions & 27 deletions rpt.js
@@ -1,16 +1,17 @@
const fs = require('fs')
const { promisify } = require('util')
const realpath = promisify(fs.realpath)
const { basename, dirname, join } = require('path')
const { resolve, basename, dirname, join } = require('path')
const rpj = promisify(require('read-package-json'))
const readdir = promisify(require('readdir-scoped-modules'))
const realpath = require('./realpath.js')

let ID = 0
class Node {
constructor (pkg, logical, physical, er, cache) {
// should be impossible.
const cached = cache.get(physical)
/* istanbul ignore next */
if (cache.get(physical))
if (cached && !cached.then)
throw new Error('re-creating already instantiated node')

cache.set(physical, this)
Expand All @@ -34,37 +35,84 @@ class Node {
class Link extends Node {
constructor (pkg, logical, physical, realpath, er, cache) {
super(pkg, logical, physical, er, cache)

// if the target has started, but not completed, then
// a Promise will be in the cache to indicate this.
const cachedTarget = cache.get(realpath)
if (cachedTarget && cachedTarget.then)
cachedTarget.then(node => this.target = node)

this.target = cachedTarget || new Node(pkg, logical, realpath, er, cache)
this.realpath = realpath
this.isLink = true
this.children = this.target.children
this.error = er
// convenience method only
/* istanbul ignore next */
Object.defineProperty(this, 'children', {
get () {
return this.target.children
},
set (c) {
this.target.children = c
},
enumerable: true
})
}
}

const loadNode = (logical, physical, cache) => new Promise((res, rej) => {
res(cache.get(physical) || realpath(physical)
.then(real =>
rpj(join(real, 'package.json'))
.then(pkg => [real, pkg, null], er => [real, null, er])
.then(([real, pkg, er]) =>
physical === real ? new Node(pkg, logical, physical, er, cache)
: new Link(pkg, logical, physical, real, er, cache)
),
// if the realpath fails, don't bother with the rest
er => new Node(null, logical, physical, er, cache))
)
})

const loadChildren = (node, cache, filterWith) => {
// this is the way it is to expose a timing issue which is difficult to
// test otherwise. The creation of a Node may take slightly longer than
// the creation of a Link that targets it. If the Node has _begun_ its
// creation phase (and put a Promise in the cache) then the Link will
// get a Promise as its cachedTarget instead of an actual Node object.
// This is not a problem, because it gets resolved prior to returning
// the tree or attempting to load children. However, it IS remarkably
// difficult to get to happen in a test environment to verify reliably.
// Hence this kludge.
const newNode = (pkg, logical, physical, er, cache) =>
process.env._TEST_RPT_SLOW_LINK_TARGET_ === '1'
? new Promise(res => setTimeout(() =>
res(new Node(pkg, logical, physical, er, cache)), 10))
: new Node(pkg, logical, physical, er, cache)

const loadNode = (logical, physical, cache, rpcache, stcache) => {
// cache temporarily holds a promise placeholder so we
// don't try to create the same node multiple times.
// this is very rare to encounter, given the aggressive
// caching on fs.realpath and fs.lstat calls, but
// it can happen in theory.
const cached = cache.get(physical)
/* istanbul ignore next */
if (cached)
return Promise.resolve(cached)

const p = realpath(physical, rpcache, stcache, 0).then(real =>
rpj(join(real, 'package.json'))
.then(pkg => [pkg, null], er => [null, er])
.then(([pkg, er]) =>
physical === real ? newNode(pkg, logical, physical, er, cache)
: new Link(pkg, logical, physical, real, er, cache)
),
// if the realpath fails, don't bother with the rest
er => new Node(null, logical, physical, er, cache))

cache.set(physical, p)
return p
}

const loadChildren = (node, cache, filterWith, rpcache, stcache) => {
// if a Link target has started, but not completed, then
// a Promise will be in the cache to indicate this.
if (node.then)
return node.then(node => loadChildren(node, cache, filterWith, rpcache, stcache))

const nm = join(node.path, 'node_modules')
return realpath(nm)
return realpath(nm, rpcache, stcache, 0)
.then(rm => readdir(rm).then(kids => [rm, kids]))
.then(([rm, kids]) => Promise.all(
kids.filter(kid =>
kid.charAt(0) !== '.' && (!filterWith || filterWith(node, kid)))
.map(kid => loadNode(join(nm, kid), join(rm, kid), cache)))
.map(kid => loadNode(join(nm, kid), join(rm, kid), cache, rpcache, stcache)))
).then(kidNodes => {
kidNodes.forEach(k => k.parent = node)
node.children = kidNodes.sort((a, b) =>
Expand All @@ -77,19 +125,20 @@ const loadChildren = (node, cache, filterWith) => {
.catch(() => node)
}

const loadTree = (node, did, cache, filterWith) => {
const loadTree = (node, did, cache, filterWith, rpcache, stcache) => {
// impossible except in pathological ELOOP cases
/* istanbul ignore next */
if (did.has(node.realpath))
return Promise.resolve(node)

did.add(node.realpath)

return loadChildren(node, cache, filterWith)
// load children on the target, not the link
return loadChildren(node.target || node, cache, filterWith, rpcache, stcache)
.then(node => Promise.all(
node.children
.filter(kid => !did.has(kid.realpath))
.map(kid => loadTree(kid, did, cache, filterWith))
.map(kid => loadTree(kid, did, cache, filterWith, rpcache, stcache))
)).then(() => node)
}

Expand All @@ -100,10 +149,15 @@ const rpt = (root, filterWith, cb) => {
filterWith = null
}

root = resolve(root)
const cache = new Map()
const p = realpath(root)
.then(realRoot => loadNode(root, realRoot, cache))
.then(node => loadTree(node, new Set(), cache, filterWith))
// we can assume that the cwd is real enough
const cwd = process.cwd()
const rpcache = new Map([[ cwd, cwd ]])
const stcache = new Map()
const p = realpath(root, rpcache, stcache, 0)
.then(realRoot => loadNode(root, realRoot, cache, rpcache, stcache))
.then(node => loadTree(node, new Set(), cache, filterWith, rpcache, stcache))

if (typeof cb === 'function')
p.then(tree => cb(null, tree), cb)
Expand Down
30 changes: 30 additions & 0 deletions tap-snapshots/test-basic.js-TAP.test.js
Expand Up @@ -40,6 +40,20 @@ root@1.2.3 test/fixtures/linkedroot
└── foo@1.2.3 test/fixtures/linkedroot/node_modules/foo
`

exports[`test/basic.js TAP looking outside of cwd > must match snapshot 1`] = `
root@1.2.3 test/fixtures/root
├─┬ @scope/x@1.2.3 test/fixtures/root/node_modules/@scope/x
│ └─┬ glob@4.0.5 test/fixtures/root/node_modules/@scope/x/node_modules/glob
│ ├── graceful-fs@3.0.2 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/graceful-fs
│ ├── inherits@2.0.1 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/inherits
│ ├─┬ minimatch@1.0.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/minimatch
│ │ ├── lru-cache@2.5.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/minimatch/node_modules/lru-cache
│ │ └── sigmund@1.0.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/minimatch/node_modules/sigmund
│ └── once@1.3.0 test/fixtures/root/node_modules/@scope/x/node_modules/glob/node_modules/once
├── @scope/y@1.2.3 test/fixtures/root/node_modules/@scope/y
└── foo@1.2.3 test/fixtures/root/node_modules/foo
`

exports[`test/basic.js TAP noname > noname tree 1`] = `
test/fixtures/noname
└── test/fixtures/noname/node_modules/foo
Expand Down Expand Up @@ -79,3 +93,19 @@ selflink@1.2.3 test/fixtures/selflink
│ └── once@1.3.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/once
└── selflink@1.2.3 test/fixtures/selflink (symlink)
`

exports[`test/basic.js TAP shake out Link target timing issue > must match snapshot 1`] = `
selflink@1.2.3 test/fixtures/selflink
├── @scope/y@1.2.3 test/fixtures/selflink/node_modules/@scope/y
├─┬ @scope/z@1.2.3 test/fixtures/selflink/node_modules/@scope/z
│ └── glob@4.0.5 test/fixtures/selflink/node_modules/foo/node_modules/glob (symlink)
└─┬ foo@1.2.3 test/fixtures/selflink/node_modules/foo
├─┬ glob@4.0.5 test/fixtures/selflink/node_modules/foo/node_modules/glob
│ ├── graceful-fs@3.0.2 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/graceful-fs
│ ├── inherits@2.0.1 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/inherits
│ ├─┬ minimatch@1.0.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/minimatch
│ │ ├── lru-cache@2.5.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/minimatch/node_modules/lru-cache
│ │ └── sigmund@1.0.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/minimatch/node_modules/sigmund
│ └── once@1.3.0 test/fixtures/selflink/node_modules/foo/node_modules/glob/node_modules/once
└── selflink@1.2.3 test/fixtures/selflink (symlink)
`
33 changes: 33 additions & 0 deletions test/basic.js
Expand Up @@ -85,6 +85,22 @@ test('filterWith', t =>
).then(d => t.matchSnapshot(archy(archyize(d)).trim()), 'only 1 level deep')
)

test('looking outside of cwd', t => {
const cwd = process.cwd()
t.teardown(() => process.chdir(cwd))
process.chdir('test/fixtures/selflink')
return rpt('../root').then(d =>
t.matchSnapshot(archy(archyize(d)).trim()))
})

test('shake out Link target timing issue', t => {
process.env._TEST_RPT_SLOW_LINK_TARGET_ = '1'
const cwd = process.cwd()
t.teardown(() => process.env._TEST_RPT_SLOW_LINK_TARGET_ = '')
return rpt(path.resolve(fixtures, 'selflink')).then(d =>
t.matchSnapshot(archy(archyize(d)).trim()))
})

test('broken json', function (t) {
rpt(path.resolve(fixtures, 'bad'), function (er, d) {
t.ok(d.error, 'Got an error object')
Expand Down Expand Up @@ -152,6 +168,23 @@ function archyize (d, seen) {
}
}

test('realpath gutchecks', t => {
const d = path.resolve(cwd, 'test/fixtures')
const realpath = require('../realpath.js')
const {realpathSync} = fs
Object.keys(symlinks).map(link => t.test(link, t =>
realpath(
path.resolve(d, link),
new Map(),
new Map(),
0
).then(
real => t.equal(real, realpathSync(path.resolve(d, link))),
er => t.throws(()=> realpathSync(path.resolve(d, link)))
)))
t.end()
})

test('cleanup', function (t) {
cleanup()
t.end()
Expand Down

0 comments on commit e9cd536

Please sign in to comment.