perf: improving performance #528

H4ad · 2023-03-31T00:38:20Z

I saw this tweet by @jakebailey, describing the time spent just parsing the options.

parsing-options.js

So I did a simple refactoring removing two loops from the parsing options, before:

includePrerelease x 14,044,087 ops/sec ±3.19% (87 runs sampled)
includePrerelease + loose x 7,062,745 ops/sec ±1.14% (86 runs sampled)
includePrerelease + loose + rtl x 7,210,500 ops/sec ±1.41% (89 runs sampled)

After:

includePrerelease x 1,107,553,453 ops/sec ±1.07% (93 runs sampled)
includePrerelease + loose x 1,119,018,905 ops/sec ±0.16% (88 runs sampled)
includePrerelease + loose + rtl x 1,126,801,317 ops/sec ±0.15% (89 runs sampled)

benchmark.js

// Micro-benchmark for parseOptions: the same call is timed with option bags
// that carry one, two and three keys.
const Benchmark = require('benchmark')
const parseOptions = require('./internal/parse-options');
const suite = new Benchmark.Suite;

// Case 1: a single option set.
const options1 = {
  includePrerelease: true,
};

// Case 2: two options set.
const options2 = {
  includePrerelease: true,
  loose: true,
};

// Case 3: three keys present (note that rtl is explicitly false here).
const options3 = {
  includePrerelease: true,
  loose: true,
  rtl: false,
};

// Register one benchmark per option bag; the option objects are created once
// above so only the parseOptions call itself is inside the timed function.
suite
.add('includePrerelease', function () {
  parseOptions(options1);
})
.add('includePrerelease + loose', function () {
  parseOptions(options2);
})
.add('includePrerelease + loose + rtl', function () {
  parseOptions(options3);
})
// Print the stringified result of each benchmark as its cycle completes.
.on('cycle', function(event) {
  console.log(String(event.target))
})
.run({ 'async': false });

range.js

Inside range.js, I saw this piece of code:

node-semver/classes/range.js

Line 84 in da08e01

const memoOpts = Object.keys(this.options).join(',')

Object.keys is not the fastest method, so I created a function that simply returns a key depending on the options.

Before:

Object.keys({"includePrelease":true}).join(',') x 13,643,510 ops/sec ±3.43% (81 runs sampled)
Object.keys({"includePrelease":true,"loose":true}).join(',') x 7,877,215 ops/sec ±1.57% (92 runs sampled)
Object.keys({"includePrelease":true,"loose":true,"rtl":true}).join(',') x 5,786,808 ops/sec ±1.09% (94 runs sampled)

After:

buildMemoKeyFromOptions({"includePrelease":true}) x 1,128,743,301 ops/sec ±0.03% (96 runs sampled)
buildMemoKeyFromOptions({"includePrelease":true,"loose":true}) x 1,115,671,401 ops/sec ±0.87% (95 runs sampled)
buildMemoKeyFromOptions({"includePrelease":true,"loose":true,"rtl":true}) x 1,101,585,690 ops/sec ±1.59% (90 runs sampled)

benchmark.js

const Benchmark = require('benchmark');
const suite = new Benchmark.Suite();

// Option bags shared by the Object.keys(...).join(',') cases and the
// buildMemoKeyFromOptions cases registered further down.
// NOTE(review): the first key is spelled `includePrelease`, while
// buildMemoKeyFromOptions tests `options.includePrerelease`, so that function
// never sees the prerelease flag as set for these inputs — confirm whether
// this spelling was intended.
const option1 = { includePrelease: true };
const option2 = { includePrelease: true, loose: true };
const option3 = { includePrelease: true, loose: true, rtl: true };

/**
 * Maps the three boolean options to a one-character memoization key.
 *
 * A flag only counts as set when it is strictly `true`; any other value
 * (including other truthy values) is treated as unset.
 *
 * @param {object} options - bag that may carry `includePrerelease`, `loose`, `rtl`.
 * @returns {string} one of '1'..'8', unique per combination of the three flags.
 */
function buildMemoKeyFromOptions(options) {
  const hasPrerelease = options.includePrerelease === true;
  const isLoose = options.loose === true;
  const isRtl = options.rtl === true;

  if (hasPrerelease) {
    if (isLoose) {
      return isRtl ? '1' : '2';
    }

    return isRtl ? '3' : '4';
  }

  if (isLoose) {
    return isRtl ? '5' : '6';
  }

  return isRtl ? '7' : '8';
}

// Baseline cases: build the key with Object.keys(...).join(',').
// Each case is named after the exact expression and option bag it measures.
suite
  .add(`Object.keys(${JSON.stringify(option1)}).join(',')`, function () {
    Object.keys(option1).join(',');
  })
  .add(`Object.keys(${JSON.stringify(option2)}).join(',')`, function () {
    Object.keys(option2).join(',');
  })
  .add(`Object.keys(${JSON.stringify(option3)}).join(',')`, function () {
    Object.keys(option3).join(',');
  });

// Candidate cases: build the key with buildMemoKeyFromOptions on the same bags.
suite
  .add(`buildMemoKeyFromOptions(${JSON.stringify(option1)})`, function () {
    buildMemoKeyFromOptions(option1);
  })
  .add(`buildMemoKeyFromOptions(${JSON.stringify(option2)})`, function () {
    buildMemoKeyFromOptions(option2);
  })
  .add(`buildMemoKeyFromOptions(${JSON.stringify(option3)})`, function () {
    buildMemoKeyFromOptions(option3);
  });

// Print the stringified result of each benchmark as its cycle completes.
suite
  .on('cycle', function (event) {
    console.log(String(event.target));
  })
  .run({ async: false });

When we compare the performance improvements on satisfies function:

Before:

satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 219,080 ops/sec ±0.98% (94 runs sampled)
satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 226,188 ops/sec ±1.00% (91 runs sampled)
satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 236,135 ops/sec ±1.10% (90 runs sampled)

After:

satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 382,478 ops/sec ±0.80% (95 runs sampled)
satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 404,203 ops/sec ±0.85% (97 runs sampled)
satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 429,043 ops/sec ±0.78% (91 runs sampled)

benchmark.js

// Benchmark for satisfies: every range below is checked against one fixed
// version, first without options and then with each of the three option bags.
const Benchmark = require('benchmark');
const satisfies = require('./functions/satisfies');
const suite = new Benchmark.Suite();

// Despite the name, `versions` holds the range expressions passed as the
// second argument; `versionToCompare` is the version passed as the first.
const versions = ['1.0.3||^2.0.0', '2.2.2||~3.0.0', '2.3.0||<4.0.0'];
const versionToCompare = '1.0.6';
// NOTE(review): the key is spelled `includePrelease`; the option name used
// elsewhere in this thread is `includePrerelease` — confirm whether the
// prerelease flag was actually exercised by these runs.
const option1 = { includePrelease: true };
const option2 = { includePrelease: true, loose: true };
const option3 = { includePrelease: true, loose: true, rtl: true };

// No options argument.
for (const version of versions) {
  suite.add(`satisfies(${versionToCompare}, ${version})`, function () {
    satisfies(versionToCompare, version);
  });
}

// One key in the option bag.
for (const version of versions) {
  suite.add(`satisfies(${versionToCompare}, ${version}, ${JSON.stringify(option1)})`, function () {
    satisfies(versionToCompare, version, option1);
  });
}

// Two keys in the option bag.
for (const version of versions) {
  suite.add(`satisfies(${versionToCompare}, ${version}, ${JSON.stringify(option2)})`, function () {
    satisfies(versionToCompare, version, option2);
  });
}

// Three keys in the option bag.
for (const version of versions) {
  suite.add(`satisfies(${versionToCompare}, ${version}, ${JSON.stringify(option3)})`, function () {
    satisfies(versionToCompare, version, option3);
  });
}
// Print the stringified result of each benchmark as its cycle completes.
suite
  .on('cycle', function (event) {
    console.log(String(event.target));
  })
  .run({ async: false });

I will keep as draft for now because I want to see if I can find more optimizations.

jakebailey · 2023-03-31T01:47:32Z

classes/range.js

@@ -190,6 +190,35 @@ class Range {
    return false
  }
 }
+
+function buildMemoKeyFromOptions(options) {


You may also want to try a handwritten key such as:

`${options.includePrerelease ? 1 : 0},${options.loose ? 1 : 0}...

Which is what we do in the TypeScript compiler. Not sure if it's faster than here but could be interesting to test.

It doesn't hurt the performance:

buildMemoKeyFromOptions({"includePrelease":true}) x 1,121,468,719 ops/sec ±0.63% (96 runs sampled) buildMemoKeyFromOptions({"includePrelease":true,"loose":true}) x 1,132,354,133 ops/sec ±0.67% (95 runs sampled) buildMemoKeyFromOptions({"includePrelease":true,"loose":true,"rtl":true}) x 1,122,788,698 ops/sec ±0.53% (96 runs sampled)

After 1B ops/s, it almost has no effect choosing one or another.
The only drawback is increasing the lru-cache key by 6 characters instead of having it as 1.

If NPM team prefer readability over a tiny cache key, I can change the implementation.

This is actually a good use case for a bit flag. It's a bunch of booleans, and bitwise operations are super fast. And while bit ops are often frowned upon as being dense or obscure, if organized properly, they're much more maintainable than a series of if/else statements and hoping that you captured each case.

// Give every boolean option its own bit. `s` is a running shift register:
// each flag takes the current value, then `s` moves one bit to the left.
let s = 1
const FLAG_includePrerelease = s
s <<= 1
const FLAG_loose = s
s <<= 1
const FLAG_rtl = s
s <<= 1
// ...

/**
 * Builds a numeric memoization key from the options bag.
 *
 * @param {object} options - bag that may carry `includePrerelease`, `loose`, `rtl`.
 * @returns {number} bitwise OR of the flags whose option is truthy (0 when none are).
 */
function buildMemoKeyFromOptions(options) {
  // now just bitwise-OR them all together as appropriate
  return (
    ( options.loose ? FLAG_loose : 0 ) |
    ( options.rtl ? FLAG_rtl : 0 ) |
    ( options.includePrerelease ? FLAG_includePrerelease : 0 )
  )
}

I don't think the suggestion is to change Options, but to use a different way of constructing a key.

That being said, I think it should be possible to internally store flags for these and then reconstruct the options bag when asked on whichever methods that request it. Then, internally, things can use the fast check and pass number around. Spitballing, though.

Yes, I realized a few seconds after writing that, and yes, using the option internally and then exposing makes sense, I'll try using the getter to compute the options and internally using just the bit flag.

Thinking further, the user already has to pass the object in, so you probably can just keep that one and not reconstruct it.

@jakebailey I tried, but it would break a lot of tests if I did.

I put the version of parseOptions with bit flags here: h4ad-forks@6c7a68a

@isaacs Take a look and see if this PR is worth pushing, I made a lot of changes.

Also, I introduce 1 small breaking change, now the numbers passed to parseOptions are considered valid options, so instead of re-parsing it by passing the object, I use many of the options as bit flags to construct new objects that are inside the functions.

About performance, no performance penalties and I assume it got a bit faster because I now parse the options as an object once and then just parse the options again as a number which saves some comparisons.

The only downside I noticed when using a bitmask is that the memory usage (ref: #528 (comment)) is the same as allocating the objects at runtime (8.7mb).

I don't know why this behavior is happening, but I just want to point out that it's not as memory efficient as Object.freeze.

ljharb · 2023-03-31T17:16:15Z

internal/parse-options.js

+const var1 = Object.freeze({ includePrerelease: true, loose: true, rtl: true });
+const var2 = Object.freeze({ includePrerelease: true, loose: true });
+const var3 = Object.freeze({ includePrerelease: true, rtl: true });
+const var4 = Object.freeze({ includePrerelease: true });
+const var5 = Object.freeze({ loose: true, rtl: true });
+const var6 = Object.freeze({ loose: true });
+const var7 = Object.freeze({ rtl: true });
+const emptyOpts = Object.freeze({});


typically freezing objects makes them much slower, so it's worth benchmarking this.

Also, all of these should be __proto__: null so lookups can be faster.

+1 on trying out Object.create(null), but I wonder if trying out making this a Map, or maybe even using Bitmasks could speed it up further?

Yeah, freezing has never changed performance in anything I've ever tried, unfortunately.

Honestly, I think that the way forward in optimizing things is not going to be down to tricks like freezing or looking for the fastest LRU cache; it's going to be overall algorithmic changes to the code.

The idea of freezing is just to prevent modification in the object, also, is not to gain performance but to save memory allocation.

With the older version, I did a test that ended with 8.7mb of memory usage and with this freeze version, it was consistent at 6.7mb.

I think @ljharb 's point was more about using Object.freeze (the fn itself) makes the code slower, not faster (because the freezing itself is an operation).

@kurtextrem like #528 (comment), if we do it inline it can be slower, so as not to break the tests, I think it's better not to include __proto__: null.

It would be fine to change the tests here, to be clear. I'm very surprised doing it inline is slower - in which engines is that true?

Note that we're not just concerned with node here, this package is used in browsers as well.

@ljharb The benchmark site I've used (from the other thread) doesn't work for me in Firefox, so I can only say it's definitely much slower in Chromium, Safari and Node (that caught me by surprise too, I assume __proto__ has a special treatment during creation if not done with Object.create(null), thus making it slower than the standard object). It's even slower than Object.freeze/seal.
Just from my microbenchmark, I'd stick to a regular object creation (without freezing), but if we can benchmark all browser vendors + node and see that Object.freeze is not slower than without, we can as well pick up the small memory reduction.

screenshot, because maybe not everyone has access to a Mac following this discussion (higher is better):

@kurtextrem The main problem is if we were returning the object but since we run it once, the op/s is not a problem.

It will be a problem if it slows down the property access but I don't think that is the case.

ljharb · 2023-03-31T17:16:31Z

internal/parse-options.js

+const emptyOpts = Object.freeze({});
+
+const parseOptions = options => {
+  if (!options) return emptyOpts;


there's no way this is faster than return {}, have you benchmarked this?

why shouldn't it? object allocation (and cleanup later) isn't free

Yeah, the idea is to reduce memory allocation.

Have you benchmarked it? There are a great many optimizations people attempt to make in JS that are counterintuitively slower than the idiomatic approach.

@ljharb Yes,

benchmark-memory.js

const satisfies = require('./functions/satisfies');

// Allocation workload: repeatedly calls satisfies with no options and with
// each of three option bags, so the process's GC behaviour can be observed.
(() => {
  const versions = ['1.0.3', '2.2.2', '2.3.0'];
  const versionToCompare = '>=1.0.2';
  // NOTE(review): the key is spelled `includePrelease`; the option name used
  // elsewhere in this thread is `includePrerelease` — confirm.
  const option1 = { includePrelease: true };
  const option2 = { includePrelease: true, loose: true };
  const option3 = { includePrelease: true, loose: true, rtl: true };

  // NOTE(review): here the first argument is a range string and the second a
  // plain version, the reverse of the satisfies benchmark earlier in this
  // thread — confirm the argument order is intended.
  for (let round = 0; round < 1e5; round++) {
    for (const version of versions) {
      satisfies(versionToCompare, version);
      satisfies(versionToCompare, version, option1);
      satisfies(versionToCompare, version, option2);
      satisfies(versionToCompare, version, option3);
    }
  }
})();

Run the benchmark with the following command: node --trace-gc --trace-deopt benchmark-memory.js.

Without object freeze, this is the output: without-freeze-object.txt

With object freeze: with-freeze-object.txt

The difference shows up at the end of the text file: without object freeze we ended up with 8.7mb of memory, and with object freeze we ended up with 6.7mb.

About the pure performance of satisfies, I didn't notice any drawback:

Without Object.freeze:

satisfies(1.0.6, 1.0.3||^2.0.0) x 420,051 ops/sec ±1.14% (94 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0) x 429,208 ops/sec ±0.59% (94 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0) x 459,359 ops/sec ±0.67% (88 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true}) x 409,670 ops/sec ±1.02% (94 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true}) x 428,150 ops/sec ±0.91% (94 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true}) x 464,502 ops/sec ±0.78% (97 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true}) x 406,053 ops/sec ±0.83% (96 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true}) x 429,494 ops/sec ±0.58% (96 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true}) x 433,164 ops/sec ±1.11% (95 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 385,354 ops/sec ±0.87% (95 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 404,200 ops/sec ±0.72% (94 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 434,975 ops/sec ±1.02% (97 runs sampled)

With Object.freeze

satisfies(1.0.6, 1.0.3||^2.0.0) x 419,615 ops/sec ±1.38% (95 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0) x 438,583 ops/sec ±0.48% (96 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0) x 447,261 ops/sec ±0.92% (93 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true}) x 415,861 ops/sec ±1.10% (95 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true}) x 433,068 ops/sec ±0.98% (96 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true}) x 461,636 ops/sec ±0.92% (92 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true}) x 412,220 ops/sec ±0.06% (96 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true}) x 428,966 ops/sec ±0.65% (97 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true}) x 448,587 ops/sec ±0.29% (95 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 388,662 ops/sec ±0.48% (94 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 412,242 ops/sec ±0.16% (93 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 432,172 ops/sec ±1.03% (96 runs sampled)

@ljharb Like @kurtextrem mentioned in the other comment, inline static null proto is slower and we ended up with 8.7mb.

Code:

/**
 * Experimental parseOptions variant that always returns a null-prototype object.
 *
 * @param {*} options - user-supplied options; may be falsy, an object, or any other value.
 * @returns {object} a new null-prototype object holding only the flags that
 *   are set, each normalised to `true`.
 */
const parseOptions = options => {
  // Nothing supplied: a fresh, empty null-prototype object.
  if (!options) {
    return Object.create(null);
  }

  // Any truthy value that is not an object is treated as "loose".
  if (typeof options !== 'object') {
    return { __proto__: null, loose: true };
  }

  // Copy over only the recognised flags whose value is truthy.
  const parsed = Object.create(null);
  if (options.includePrerelease) {
    parsed.includePrerelease = true;
  }
  if (options.loose) {
    parsed.loose = true;
  }
  if (options.rtl) {
    parsed.rtl = true;
  }
  return parsed;
};

Perf:

satisfies(1.0.6, 1.0.3||^2.0.0) x 341,924 ops/sec ±0.42% (93 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0) x 357,034 ops/sec ±0.13% (97 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0) x 384,200 ops/sec ±0.54% (92 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true}) x 328,873 ops/sec ±0.98% (96 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true}) x 345,029 ops/sec ±0.67% (96 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true}) x 367,659 ops/sec ±0.37% (95 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true}) x 316,462 ops/sec ±0.43% (94 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true}) x 323,678 ops/sec ±0.90% (93 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true}) x 339,436 ops/sec ±0.94% (96 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 287,875 ops/sec ±4.03% (88 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 314,779 ops/sec ±1.12% (94 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 332,332 ops/sec ±0.25% (96 runs sampled)

Thanks for clarifying. What i meant was, adding in to your static reference frozen objects a proto:null inline to those declarations.

https://esbench.com/bench/64271fec6c89f600a57022e8 as mentioned, does not appear to be faster to initialize

hm, i can’t seem to run the benchmarks on iOS Safari. Have they been run on every browser, not just chrome? what about in node, where perf differs from chrome at times?

@ljharb About freeze with proto null:

satisfies(1.0.6, 1.0.3||^2.0.0) x 21,856 ops/sec ±0.60% (90 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0) x 21,716 ops/sec ±0.79% (95 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0) x 21,643 ops/sec ±0.76% (95 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true}) x 21,216 ops/sec ±0.25% (93 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true}) x 21,171 ops/sec ±0.41% (86 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true}) x 21,113 ops/sec ±0.38% (92 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true}) x 310,400 ops/sec ±4.69% (95 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true}) x 361,617 ops/sec ±0.58% (89 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true}) x 381,054 ops/sec ±0.07% (97 runs sampled) satisfies(1.0.6, 1.0.3||^2.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 335,386 ops/sec ±0.44% (96 runs sampled) satisfies(1.0.6, 2.2.2||~3.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 350,006 ops/sec ±0.62% (97 runs sampled) satisfies(1.0.6, 2.3.0||<4.0.0, {"includePrelease":true,"loose":true,"rtl":true}) x 370,139 ops/sec ±0.09% (94 runs sampled)

Soooo slow, I don't even know why.

kurtextrem · 2023-03-31T17:31:06Z

Just so it doesn't get lost, someone from Tencent wrote this one: https://github.com/Zzzen/fast-semver (https://twitter.com/callmetsing/status/1641810237404614660), which is much faster (original semver is 77% slower).

jakebailey · 2023-03-31T17:34:20Z

Just so it doesn't get lost, someone from Tencent wrote this one: Zzzen/fast-semver (twitter.com/callmetsing/status/1641810237404614660), which is much faster (original semver is 77% slower).

I mentioned this on the Twitter thread, but given it's "only" 4x faster, I really think that an optimized version can be written in JS and be very close. It will just take some benchmarks and profiling to get to the bottom of things. Things like #458 and so on are related here.

Given all of the users downstream appear to usually compare strings, that's the worst case path in the current implementation; it allocates loads of temporary objects to perform the comparisons and then they are thrown away. yarn berry already caches these objects to try and help with this, and I'm considering sending a PR to pnpm to do something similar as it results in a 33% speedup for the DT monorepo.

If there were a fast path for string-ly comparison, that might end up being good, though that may be challenging. But, it's pretty straightforward to fuzz test two implementations for differences.

H4ad · 2023-03-31T18:27:11Z

I tried to find more optimizations, the general ideas are:

reduce the number of loops: instead call filter, split, map, etc... do all at once.
reduce the usage of RegExps: this lib heavily depends on it and every time I see this pattern, the JS parser to do the same thing is usually faster.
reduce the number of calls to trim: I found this one very confusing; it splits on spaces, rewrites spaces, trims, and does this again and again. I'm not sure why, but I think we can remove some calls and gain some perf.
as mentioned by @jakebailey, use more lru-cache: especially for versions and semver obj like yarn did.

Any of these ideas will take more time than I currently have, so I'll stop for now and do some more research another time.

@jakebailey Can you do monkey-patching on semver used by pnpm and see if some of these optimizations helped? Both in terms of memory and speed.

jakebailey · 2023-03-31T19:08:10Z

I have a PR I'm going to send later today to pnpm which simply caches new Range and then passes in new SemVer to test, which eliminates the slowdown with minimal memory overhead.

But, I can try and pull this branch in and see if anything changes.

jakebailey · 2023-03-31T21:51:54Z

Alright, so on pnpm main, this PR actually appears to be quite good!

Comparing to pnpm/pnpm#6336:

Done in 1m 53.8s
total time:  114.05s
user time:   132.65s
system time: 30.62s
CPU percent: 143%
max memory:  1661 MB

Let me try the combo of both.

jakebailey · 2023-03-31T21:55:30Z

The combo of caching + this PR seems to be the fastest yet:

Done in 1m 44s
total time:  104.31s
user time:   118.27s
system time: 30.84s
CPU percent: 142%
max memory:  1771 MB

So, this PR is a pretty good help for the current code.

jakebailey · 2023-03-31T21:58:57Z

Of course, it's not as fast as blindly caching the world, but that's not the best idea either.

isaacs · 2023-04-02T17:17:06Z

Also, if you're looking to improve performance, and relying on caching in hot paths, probably a good idea to upgrade to the latest version of lru-cache.

I haven't tried it in this particular use case, but that might pretty dramatically cut down the gc overhead, since v6 of lru-cache creates a ton of tiny objects that live just long enough to fall out of the short-lived gc generation.

jakebailey · 2023-04-02T17:24:07Z

My testing in general is pointing to the recreation of semver/range objects as being "the thing" that is making things slow down. I had tried using an LRU cache, but found it to not help because the repeated comparisons happened so far apart that their entries ended up being evicted. Caching the Range object (forever) is what pnpm and yarn now do, as those objects are seemingly the worst overall, but there are thankfully not enough ranges in the entire program usually for this to cause a problem.

I'm working on a recreation of the API in a way that tries to avoid recreation as much as possible, but I think any short term improvements like making parseOptions faster are definitely a good idea.

wraithgar · 2023-04-04T16:54:15Z

Sorry that I missed this, catching up now. @H4ad has done some very helpful work over on ssri already and I'm working through the last of them now. Thanks to all of you who are helping comment on this PR and are doing other performance testing/improvement.

I would humbly make the same suggestion here that I made there, that the PRs be broken up logically by optimization type or code area for easier isolation of reviews. By all means continue the discussion here if you want to solve problems or come up w/ other solutions before then.

H4ad · 2023-04-06T03:29:51Z

@wraithgar PRs created, I think we had a really good conversation on this PR about the performance improvements, I'm going to close this PR now in favor of the other two, and I'm going to ask you to put all the new suggestions in the other PR, I've tried to link almost all of the solutions and describe all the benefits and drawbacks of the selected solution in the new PRs, then I think we can start a new conversation there about any other optimization or design decision.

H4ad · 2023-04-06T15:19:10Z

In case of someone wants to push more perf optimizations in this release, I found two interesting things.

Those objects are created every time this method is called but the return is a boolean flag, so we could simply cache those values.

node-semver/ranges/subset.js

Lines 76 to 92 in f4fa069

    
           if (sub.length === 1 && sub[0].semver === ANY) { 
        
             if (dom.length === 1 && dom[0].semver === ANY) { 
        
               return true 
        
             } else if (options.includePrerelease) { 
        
               sub = [new Comparator('>=0.0.0-0')] 
        
             } else { 
        
               sub = [new Comparator('>=0.0.0')] 
        
             } 
        
           } 
        
           if (dom.length === 1 && dom[0].semver === ANY) { 
        
             if (options.includePrerelease) { 
        
               return true 
        
             } else { 
        
               dom = [new Comparator('>=0.0.0')] 
        
             } 
        
           }

Above, a lot of comparisons could be early returned when true, this will save some time and we could avoid calling cmp.

node-semver/classes/comparator.js

Lines 100 to 125 in f4fa069

    
           const sameDirectionIncreasing = 
        
             (this.operator === '>=' || this.operator === '>') && 
        
             (comp.operator === '>=' || comp.operator === '>') 
        
           const sameDirectionDecreasing = 
        
             (this.operator === '<=' || this.operator === '<') && 
        
             (comp.operator === '<=' || comp.operator === '<') 
        
           const sameSemVer = this.semver.version === comp.semver.version 
        
           const differentDirectionsInclusive = 
        
             (this.operator === '>=' || this.operator === '<=') && 
        
             (comp.operator === '>=' || comp.operator === '<=') 
        
           const oppositeDirectionsLessThan = 
        
             cmp(this.semver, '<', comp.semver, options) && 
        
             (this.operator === '>=' || this.operator === '>') && 
        
               (comp.operator === '<=' || comp.operator === '<') 
        
           const oppositeDirectionsGreaterThan = 
        
             cmp(this.semver, '>', comp.semver, options) && 
        
             (this.operator === '<=' || this.operator === '<') && 
        
               (comp.operator === '>=' || comp.operator === '>') 
        
           return ( 
        
             sameDirectionIncreasing || 
        
             sameDirectionDecreasing || 
        
             (sameSemVer && differentDirectionsInclusive) || 
        
             oppositeDirectionsLessThan || 
        
             oppositeDirectionsGreaterThan 
        
           )

I will probably investigate those changes as soon I finish the two PRs opened now, but in case someone wants to contribute, I'm happy to help with a coding review.

wraithgar · 2023-04-06T15:23:21Z

If someone does end up looking at the classes/comparator.js they should also be aware of #521, for which the code snippet above is responsible.

H4ad added 2 commits March 30, 2023 21:33

perf: improving perf of parsing options

ca185a0

perf: faster memoOpts for range

8a20c28

H4ad force-pushed the perf/node-semver branch from 2cea1a4 to 8a20c28 Compare March 31, 2023 01:43

jakebailey reviewed Mar 31, 2023

View reviewed changes

H4ad force-pushed the perf/node-semver branch 2 times, most recently from 684d866 to 8a20c28 Compare March 31, 2023 03:05

H4ad changed the title ~~perf: improving perf of parsing options~~ perf: improving performance Mar 31, 2023

perf: freeze parse options to avoid object allocation

665eee4

H4ad force-pushed the perf/node-semver branch from 425b35e to 665eee4 Compare March 31, 2023 17:15

ljharb reviewed Mar 31, 2023

View reviewed changes

H4ad marked this pull request as ready for review March 31, 2023 18:20

H4ad requested a review from a team as a code owner March 31, 2023 18:20

H4ad requested review from nlf and removed request for a team March 31, 2023 18:20

jakebailey mentioned this pull request Mar 31, 2023

perf(npm-resolver): improve performance of semver.satisfies pnpm/pnpm#6336

Merged

fengmk2 mentioned this pull request Apr 1, 2023

给 semver 加一个缓存 cnpm/npminstall#453

Closed

wraithgar mentioned this pull request Apr 4, 2023

refactor: optimize code #504

Closed

This was referenced Apr 6, 2023

fix: faster parse options #535

Merged

fix: faster cache key factory for range #536

Merged

H4ad closed this Apr 6, 2023

This was referenced Apr 6, 2023

fix: reuse comparators on subset #537

Merged

fix: intersects with v0.0.0 and v0.0.0-0 #538

Merged

perf: improving performance #528

perf: improving performance #528

Conversation

H4ad commented Mar 31, 2023 • edited

parsing-options.js

range.js

Choose a reason for hiding this comment

H4ad Mar 31, 2023 • edited

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

jakebailey Mar 31, 2023 • edited

Choose a reason for hiding this comment

H4ad Mar 31, 2023 • edited

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

kurtextrem Apr 3, 2023 • edited

Choose a reason for hiding this comment

kurtextrem Apr 3, 2023 • edited

Choose a reason for hiding this comment

H4ad Apr 3, 2023 • edited

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

H4ad Apr 1, 2023 • edited

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

kurtextrem commented Mar 31, 2023

jakebailey commented Mar 31, 2023 • edited

H4ad commented Mar 31, 2023

jakebailey commented Mar 31, 2023

jakebailey commented Mar 31, 2023 • edited

jakebailey commented Mar 31, 2023 • edited

jakebailey commented Mar 31, 2023

isaacs commented Apr 2, 2023 • edited

jakebailey commented Apr 2, 2023

wraithgar commented Apr 4, 2023

H4ad commented Apr 6, 2023

H4ad commented Apr 6, 2023 • edited

wraithgar commented Apr 6, 2023

H4ad commented Mar 31, 2023 •

edited

H4ad Mar 31, 2023 •

edited

jakebailey Mar 31, 2023 •

edited

H4ad Mar 31, 2023 •

edited

kurtextrem Apr 3, 2023 •

edited

kurtextrem Apr 3, 2023 •

edited

H4ad Apr 3, 2023 •

edited

H4ad Apr 1, 2023 •

edited

jakebailey commented Mar 31, 2023 •

edited

jakebailey commented Mar 31, 2023 •

edited

jakebailey commented Mar 31, 2023 •

edited

isaacs commented Apr 2, 2023 •

edited

H4ad commented Apr 6, 2023 •

edited