
cluster: use linkedlist for round_robin_handle #40615

Closed

wants to merge 1 commit into from

Conversation

twchn
Contributor

@twchn twchn commented Oct 26, 2021

Currently, an array is used as a queue to manage handles. When there are many handles, ArrayPrototypeShift may become a bottleneck, so this change uses the built-in linked list to reduce the time complexity of the handoff method to constant time, which may help with #37343.
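
For illustration only (a minimal sketch, not the actual lib/internal/cluster code): a FIFO backed by a linked list enqueues and dequeues in constant time, whereas dequeuing from an array with shift has to move every remaining element.

// Illustrative sketch only -- not the actual lib/internal/cluster code.
// A FIFO of pending connection handles: with a plain array, each dequeue is
// ArrayPrototypeShift (O(n)); with a linked list, both enqueue and dequeue
// are O(1) regardless of how many handles are waiting.
class HandleQueue {
  constructor() {
    this.head = null;
    this.tail = null;
  }

  // O(1): append a pending handle at the tail.
  append(handle) {
    const node = { handle, next: null };
    if (this.tail === null) {
      this.head = node;
    } else {
      this.tail.next = node;
    }
    this.tail = node;
  }

  // O(1): remove and return the oldest pending handle from the head.
  shift() {
    if (this.head === null) return undefined;
    const node = this.head;
    this.head = node.next;
    if (this.head === null) this.tail = null;
    return node.handle;
  }

  isEmpty() {
    return this.head === null;
  }
}

// Array-based equivalent that this change replaces in spirit:
//   const handle = this.handles.shift(); // O(n) per dequeue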

@nodejs-github-bot nodejs-github-bot added cluster Issues and PRs related to the cluster subsystem. needs-ci PRs that need a full CI run. timers Issues and PRs related to the timers subsystem / setImmediate, setInterval, setTimeout. labels Oct 26, 2021
@Trott
Member

Trott commented Oct 26, 2021

I thought we had a benchmark for round robin somewhere but I'm not seeing it. @nodejs/benchmarking

@yunnysunny
Contributor

yunnysunny commented Nov 19, 2021

I thought we had a benchmark for round robin somewhere but I'm not seeing it. @nodejs/benchmarking

You can just use the official cluster example code:

const cluster = require('cluster');
const http = require('http');
const numCPUs = 2; // require('os').cpus().length;
const process = require('process');

if (cluster.isMaster) {
  console.log(`Primary ${process.pid} is running`);

  // Fork workers.
  for (let i = 0; i < numCPUs; i++) {
    cluster.fork();
  }

  cluster.on('exit', (worker, code, signal) => {
    console.log(`worker ${worker.process.pid} died`);
  });
} else {
  // Workers can share any TCP connection
  // In this case it is an HTTP server
  http.createServer((req, res) => {
    res.writeHead(200);
    res.end('hello world\n');
  }).listen(8000);

  console.log(`Worker ${process.pid} started`);
}

After it runs, the log output is as follows:

Primary 5308 is running
Worker 5321 started
Worker 5322 started

Then you can use a benchmarking tool such as JMeter to send requests to the service you just started. Here is an example JMeter configuration:

<?xml version="1.0" encoding="UTF-8"?>
<jmeterTestPlan version="1.2" properties="5.0" jmeter="5.3">
  <hashTree>
    <TestPlan guiclass="TestPlanGui" testclass="TestPlan" testname="Test Plan" enabled="true">
      <stringProp name="TestPlan.comments"></stringProp>
      <boolProp name="TestPlan.functional_mode">false</boolProp>
      <boolProp name="TestPlan.tearDown_on_shutdown">true</boolProp>
      <boolProp name="TestPlan.serialize_threadgroups">false</boolProp>
      <elementProp name="TestPlan.user_defined_variables" elementType="Arguments" guiclass="ArgumentsPanel" testclass="Arguments" testname="User Defined Variables" enabled="true">
        <collectionProp name="Arguments.arguments"/>
      </elementProp>
      <stringProp name="TestPlan.user_define_classpath"></stringProp>
    </TestPlan>
    <hashTree>
      <ThreadGroup guiclass="ThreadGroupGui" testclass="ThreadGroup" testname="Thread Group" enabled="true">
        <stringProp name="ThreadGroup.on_sample_error">continue</stringProp>
        <elementProp name="ThreadGroup.main_controller" elementType="LoopController" guiclass="LoopControlPanel" testclass="LoopController" testname="Loop Controller" enabled="true">
          <boolProp name="LoopController.continue_forever">false</boolProp>
          <intProp name="LoopController.loops">-1</intProp>
        </elementProp>
        <stringProp name="ThreadGroup.num_threads">8</stringProp>
        <stringProp name="ThreadGroup.ramp_time">1</stringProp>
        <boolProp name="ThreadGroup.scheduler">false</boolProp>
        <stringProp name="ThreadGroup.duration"></stringProp>
        <stringProp name="ThreadGroup.delay"></stringProp>
        <boolProp name="ThreadGroup.same_user_on_next_iteration">true</boolProp>
      </ThreadGroup>
      <hashTree>
        <HTTPSamplerProxy guiclass="HttpTestSampleGui" testclass="HTTPSamplerProxy" testname="HTTP Request" enabled="true">
          <elementProp name="HTTPsampler.Arguments" elementType="Arguments" guiclass="HTTPArgumentsPanel" testclass="Arguments" testname="User Defined Variables" enabled="true">
            <collectionProp name="Arguments.arguments"/>
          </elementProp>
          <stringProp name="HTTPSampler.domain">please_replace_this_to_the_ip_of_your_service</stringProp>
          <stringProp name="HTTPSampler.port">8000</stringProp>
          <stringProp name="HTTPSampler.protocol"></stringProp>
          <stringProp name="HTTPSampler.contentEncoding"></stringProp>
          <stringProp name="HTTPSampler.path">/</stringProp>
          <stringProp name="HTTPSampler.method">GET</stringProp>
          <boolProp name="HTTPSampler.follow_redirects">true</boolProp>
          <boolProp name="HTTPSampler.auto_redirects">false</boolProp>
          <boolProp name="HTTPSampler.use_keepalive">false</boolProp><!-- disable keepalive to create connection as fast as it can-->
          <boolProp name="HTTPSampler.DO_MULTIPART_POST">false</boolProp>
          <stringProp name="HTTPSampler.embedded_url_re"></stringProp>
          <stringProp name="HTTPSampler.connect_timeout"></stringProp>
          <stringProp name="HTTPSampler.response_timeout"></stringProp>
        </HTTPSamplerProxy>
        <hashTree>
          <ResultCollector guiclass="ViewResultsFullVisualizer" testclass="ResultCollector" testname="View Results Tree" enabled="false">
            <boolProp name="ResultCollector.error_logging">false</boolProp>
            <objProp>
              <name>saveConfig</name>
              <value class="SampleSaveConfiguration">
                <time>true</time>
                <latency>true</latency>
                <timestamp>true</timestamp>
                <success>true</success>
                <label>true</label>
                <code>true</code>
                <message>true</message>
                <threadName>true</threadName>
                <dataType>true</dataType>
                <encoding>false</encoding>
                <assertions>true</assertions>
                <subresults>true</subresults>
                <responseData>false</responseData>
                <samplerData>false</samplerData>
                <xml>false</xml>
                <fieldNames>true</fieldNames>
                <responseHeaders>false</responseHeaders>
                <requestHeaders>false</requestHeaders>
                <responseDataOnError>false</responseDataOnError>
                <saveAssertionResultsFailureMessage>true</saveAssertionResultsFailureMessage>
                <assertionsResultsToSave>0</assertionsResultsToSave>
                <bytes>true</bytes>
                <sentBytes>true</sentBytes>
                <url>true</url>
                <threadCounts>true</threadCounts>
                <idleTime>true</idleTime>
                <connectTime>true</connectTime>
              </value>
            </objProp>
            <stringProp name="filename"></stringProp>
          </ResultCollector>
          <hashTree/>
          <ResultCollector guiclass="SummaryReport" testclass="ResultCollector" testname="Summary Report" enabled="true">
            <boolProp name="ResultCollector.error_logging">false</boolProp>
            <objProp>
              <name>saveConfig</name>
              <value class="SampleSaveConfiguration">
                <time>true</time>
                <latency>true</latency>
                <timestamp>true</timestamp>
                <success>true</success>
                <label>true</label>
                <code>true</code>
                <message>true</message>
                <threadName>true</threadName>
                <dataType>true</dataType>
                <encoding>false</encoding>
                <assertions>true</assertions>
                <subresults>true</subresults>
                <responseData>false</responseData>
                <samplerData>false</samplerData>
                <xml>false</xml>
                <fieldNames>true</fieldNames>
                <responseHeaders>false</responseHeaders>
                <requestHeaders>false</requestHeaders>
                <responseDataOnError>false</responseDataOnError>
                <saveAssertionResultsFailureMessage>true</saveAssertionResultsFailureMessage>
                <assertionsResultsToSave>0</assertionsResultsToSave>
                <bytes>true</bytes>
                <sentBytes>true</sentBytes>
                <url>true</url>
                <threadCounts>true</threadCounts>
                <idleTime>true</idleTime>
                <connectTime>true</connectTime>
              </value>
            </objProp>
            <stringProp name="filename"></stringProp>
          </ResultCollector>
          <hashTree/>
        </hashTree>
      </hashTree>
    </hashTree>
  </hashTree>
</jmeterTestPlan>

We name this configuration file cluster.jmx.
Notice that we have disabled keep-alive so that JMeter creates new connections to the server as quickly as possible.

Finally, run JMeter:

 bin/jmeter.sh -n -t /dir_of_jmx/cluster.jmx -l /tmp/cluster.jtl -e -o /tmp/cluster.out

You can then check CPU usage with the top command:

[Screenshot 2021-11-19 180325: top output]

The primary process (pid 5308) also uses a lot of CPU time.
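
As an aside, a rough Node-only equivalent of this JMeter plan (an illustrative sketch with a placeholder address, not the setup actually used above) is a small script that keeps issuing GET requests with keep-alive disabled, so every request opens a fresh connection:

// Sketch of a simple load generator; TARGET is a placeholder you would
// replace with the address of the service started above.
const http = require('http');

const TARGET = 'http://127.0.0.1:8000/';
const CONCURRENCY = 8; // mirrors the 8 JMeter threads

function loop() {
  // agent: false disables connection reuse, like the JMeter plan above.
  const req = http.get(TARGET, { agent: false }, (res) => {
    res.resume();        // drain and discard the body
    res.on('end', loop); // immediately fire the next request
  });
  req.on('error', () => setTimeout(loop, 100)); // back off briefly on errors
}

for (let i = 0; i < CONCURRENCY; i++) loop();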

@Trott
Member

Trott commented Nov 19, 2021

@nodejs/cluster

@mcollina
Member

@mcollina mcollina left a comment

lgtm with a positive or neutral benchmark CI

@mcollina
Member

Benchmark CI is neutral.

@mcollina
Member

cc @Trott could you take a look at how to land this?

@Trott
Member

Trott commented Nov 28, 2021

cc @Trott could you take a look at how to land this?

@mcollina I pulled the changes down locally, rebased against master (to pick up the fix on the master branch for the test that was failing on debug builds in Jenkins), squashed the two commits into one (to fix the GitHub Action commit linter complaint), force-pushed to twchn's branch for this PR, and did a rebuild on Jenkins. Hopefully everything is green and then the commit-queue label can be added.

@Trott Trott added the commit-queue Add this label to land a pull request using GitHub Actions. label Nov 29, 2021
@nodejs-github-bot nodejs-github-bot added commit-queue-failed An error occurred while landing this pull request using GitHub Actions. and removed commit-queue Add this label to land a pull request using GitHub Actions. labels Nov 29, 2021
@nodejs-github-bot
Collaborator

Commit Queue failed
- Loading data for nodejs/node/pull/40615
✔  Done loading data for nodejs/node/pull/40615
----------------------------------- PR info ------------------------------------
Title      cluster: use linkedlist for round_robin_handle (#40615)
   ⚠  Could not retrieve the email or name of the PR author's from user's GitHub profile!
Branch     twchn:feat_use_linkedlist_for_round_robin -> nodejs:master
Labels     cluster, timers, needs-ci
Commits    1
 - cluster: use linkedlist for round_robin_handle
Committers 1
 - Rich Trott 
PR-URL: https://github.com/nodejs/node/pull/40615
Reviewed-By: Matteo Collina 
Reviewed-By: James M Snell 
------------------------------ Generated metadata ------------------------------
PR-URL: https://github.com/nodejs/node/pull/40615
Reviewed-By: Matteo Collina 
Reviewed-By: James M Snell 
--------------------------------------------------------------------------------
   ⚠  Commits were pushed since the last review:
   ⚠  - cluster: use linkedlist for round_robin_handle
   ℹ  This PR was created on Tue, 26 Oct 2021 16:12:34 GMT
   ✔  Approvals: 2
   ✔  - Matteo Collina (@mcollina) (TSC): https://github.com/nodejs/node/pull/40615#pullrequestreview-811397947
   ✔  - James M Snell (@jasnell) (TSC): https://github.com/nodejs/node/pull/40615#pullrequestreview-811856870
   ✔  Last GitHub Actions successful
   ℹ  Last Benchmark CI on 2021-11-19T16:05:03Z: https://ci.nodejs.org/view/Node.js%20benchmark/job/benchmark-node-micro-benchmarks/1058/
   ℹ  Last Full PR CI on 2021-11-29T02:31:19Z: https://ci.nodejs.org/job/node-test-pull-request/41180/
- Querying data for job/node-test-pull-request/41180/
   ✔  Last Jenkins CI successful
--------------------------------------------------------------------------------
   ✔  Aborted `git node land` session in /home/runner/work/node/node/.ncu
https://github.com/nodejs/node/actions/runs/1514597734

PR-URL: #40615
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
@Trott
Member

Trott commented Nov 29, 2021

Landed in 4b65dec

@Trott
Member

Trott commented Nov 29, 2021

Thanks for the contribution! 🎉

@Trott Trott closed this Nov 29, 2021
@yunnysunny
Contributor

yunnysunny commented Nov 29, 2021

I have written a Kubernetes YAML file to test the performance of cluster. The Node version I used is 14.18.1.

apiVersion: apps/v1 
kind: Deployment
metadata:
  name: hello-deployment
  labels:
    app: hello
spec:
  replicas: 1
  selector:
    matchLabels:
      app: hello
  template:
    metadata:
      name: hello-app
      labels:
        app: hello
    spec:
      initContainers:
      - image: busybox
        command:
        - sh
        - -c
        - |
          sysctl -w net.core.somaxconn=10240
          sysctl -w net.ipv4.ip_local_port_range="1024 65535"
          sysctl -w net.ipv4.tcp_tw_reuse=1
          sysctl -w fs.file-max=6048576
        name: setsysctl
        securityContext:
          privileged: true
      containers:
        - name: hello-app
          image: registry.cn-hangzhou.aliyuncs.com/whyun/base:hello-latest
          imagePullPolicy: Always
          resources:
            requests:
              cpu: 2000m
              memory: 2Gi
            limits:
              cpu: 4000m
              memory: 4Gi
          env:
            - name: APP_ID
              value: "17959"
            - name: APP_SECRET
              value: "6994ea9b6a8d1e673d9cc53aab8e45dd8eaec8d2"

---
kind: Service
apiVersion: v1
metadata:
  name: hello-service
spec:
  selector:
    app: hello
  ports:
    - port: 8000 # Default port for image


---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bench-deployment
  labels:
    app: bench
spec: 
  replicas: 16
  selector:
    matchLabels:
      app: bench
  template:
    metadata:
      name: bench-node
      labels:
        app: bench
    spec:
      initContainers:
      - image: busybox
        command:
        - sh
        - -c
        - |
          sysctl -w net.core.somaxconn=10240
          sysctl -w net.ipv4.ip_local_port_range="1024 65535"
          sysctl -w net.ipv4.tcp_tw_reuse=1
          sysctl -w fs.file-max=1048576
        name: setsysctl
        securityContext:
          privileged: true
      containers:
        - name: bench-node
          image: registry.cn-hangzhou.aliyuncs.com/whyun/base:node-bench-0.2.0
          env:
            - name: APP_ID
              value: "the app id from alinode"
            - name: APP_SECRET
              value: "the app secret from alinode"
            - name: REQ_URL
              value: http://hello-service:8000/
            - name: REQ_INTERVAL_MS
              value: "5"
            - name: REQ_TIMEOUT_MS
              value: "20"

I use alinode to generate the CPU flame graphs; it is integrated into the Docker image. You have to change the APP_ID and APP_SECRET environment variables to the values you obtained from alinode.

We can change the REQ_INTERVAL_MS environment variable to a specific value and then restart the Kubernetes deployment via the command kubectl apply -f the_path_of_the_yaml_file_above.
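
The bench image itself is not shown in this thread; judging only from its environment variables (REQ_URL, REQ_INTERVAL_MS, REQ_TIMEOUT_MS), it presumably does something along these lines (a speculative reconstruction, not the actual image contents):

// Speculative sketch based on the REQ_URL, REQ_INTERVAL_MS and REQ_TIMEOUT_MS
// environment variables; the real bench image may differ.
const http = require('http');

const url = process.env.REQ_URL || 'http://hello-service:8000/';
const intervalMs = Number(process.env.REQ_INTERVAL_MS || 5);
const timeoutMs = Number(process.env.REQ_TIMEOUT_MS || 20);

setInterval(() => {
  const req = http.get(url, { agent: false, timeout: timeoutMs }, (res) => {
    res.resume(); // discard the response body
  });
  req.on('timeout', () => req.destroy());
  req.on('error', () => {}); // ignore failed requests, keep the load going
}, intervalMs);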

I first set REQ_INTERVAL_MS to 5, recorded the CPU usage from the top command, and generated the flame graph from alinode.

[Screenshot: CPU usage at REQ_INTERVAL_MS = 5]

[Flame graph: CPU profile at REQ_INTERVAL_MS = 5]

Then set it to 4:

[Screenshot: CPU usage at REQ_INTERVAL_MS = 4]

[Flame graph: CPU profile at REQ_INTERVAL_MS = 4]

It changed little compared with 5 ms.

Then set it to 3:

[Screenshot: CPU usage at REQ_INTERVAL_MS = 3]

[Flame graph: CPU profile at REQ_INTERVAL_MS = 3]

We can see that it uses more CPU time than at 4 ms.

Then set it to 2:

[Screenshot: CPU usage at REQ_INTERVAL_MS = 2, before generating the flame graph]

First I took a screenshot of the CPU usage. It was almost the same as the usage at 3 ms. Then, when I generated the CPU flame graph, the CPU usage of the parent process rose to 100%.

[Screenshot: CPU usage at REQ_INTERVAL_MS = 2, while generating the flame graph]

I also found that memory usage grew quickly.

[Flame graph: CPU profile at REQ_INTERVAL_MS = 2]

Combining this with the CPU flame graph, we can deduce the reason: when 100% of the CPU is used by the V8 engine, the task of sending socket handles to the child processes slows down, so socket handles pile up in the queue (which is an array in JS). As the queue gets longer, each shift operation on the array takes more time, CPU usage stays high, and the queue grows even longer the next time. When the queue becomes too large, memory exceeds the old-generation limit and the process goes OOM.
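
A standalone way to probe the array-shift cost at different backlog sizes (a micro-benchmark sketch, not cluster code; absolute numbers depend heavily on the V8 version and its internal shift optimizations):

// Times how long it takes to drain a queue of n items using
// Array.prototype.shift; compare runs with different n to see how the
// per-dequeue cost behaves as the backlog grows.
function drainWithShift(n) {
  const queue = Array.from({ length: n }, (_, i) => ({ id: i }));
  const start = process.hrtime.bigint();
  while (queue.length > 0) queue.shift();
  return Number(process.hrtime.bigint() - start) / 1e6; // milliseconds
}

for (const n of [1e3, 1e4, 1e5]) {
  console.log(`drain of ${n} items via shift: ${drainWithShift(n).toFixed(2)} ms`);
}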

This pull request can reduce the probability of OOM, but we can also see that the communication between the parent and child processes is not cheap. So I would be glad to see SO_REUSEPORT support brought into Node; there is already a pull request (#3198) in libuv for it.
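
As a side note (an addition for context, not from this thread): until something like SO_REUSEPORT is available, the existing way to take the primary out of the hot path entirely is the 'none' scheduling policy, where workers accept connections directly on a shared listening socket and the operating system picks the worker. Distribution tends to be less even, but the primary no longer hands off every connection.

// Must be set before the first worker is forked; can also be selected with
// the NODE_CLUSTER_SCHED_POLICY=none environment variable.
const cluster = require('cluster');
cluster.schedulingPolicy = cluster.SCHED_NONE;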

@twchn twchn deleted the feat_use_linkedlist_for_round_robin branch December 1, 2021 08:27
danielleadams pushed a commit that referenced this pull request Dec 13, 2021
PR-URL: #40615
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
danielleadams pushed a commit that referenced this pull request Dec 14, 2021
PR-URL: #40615
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
danielleadams pushed a commit that referenced this pull request Jan 30, 2022
PR-URL: #40615
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
danielleadams pushed a commit that referenced this pull request Feb 1, 2022
PR-URL: #40615
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
@danielleadams danielleadams mentioned this pull request Feb 1, 2022