diff --git a/Makefile b/Makefile index b1e4785..41763ab 100644 --- a/Makefile +++ b/Makefile @@ -56,8 +56,9 @@ gen-golden: ensure-prometheus docker-compose-down ping-postgres ## Update golden @$(COMPOSE_CMD) $(compose_args) down .PHONY: fmt -fmt: ## Run 'go fmt' against code +fmt: ## Run 'go fmt' and `jsonnetfmt` against code go fmt ./... + find . \( -name '*.jsonnet' -o -name '*.libsonnet' \) -exec jsonnetfmt -i -- {} \; .PHONY: vet vet: ## Run 'go vet' against code diff --git a/pkg/db/seeds/appuio_cloud_memory.promql b/pkg/db/seeds/appuio_cloud_memory.promql index dc0787b..a3ee705 100644 --- a/pkg/db/seeds/appuio_cloud_memory.promql +++ b/pkg/db/seeds/appuio_cloud_memory.promql @@ -17,12 +17,12 @@ sum_over_time( # Select used memory if higher. ( sum by(cluster_id, namespace, label_appuio_io_node_class) (container_memory_working_set_bytes{image!=""} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))) # IMPORTANT: one clause must use equal. If used grater and lesser than, equal values will be dropped. >= sum by(cluster_id, namespace, label_appuio_io_node_class) (kube_pod_container_resource_requests{resource="memory"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))) ) or # Select reserved memory if higher. @@ -30,10 +30,10 @@ sum_over_time( # IMPORTANT: The desired time series must always be first. sum by(cluster_id, namespace, label_appuio_io_node_class) (kube_pod_container_resource_requests{resource="memory"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))) > sum by(cluster_id, namespace, label_appuio_io_node_class) (container_memory_working_set_bytes{image!=""} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))) ) ) # Add CPU requests in violation to the ratio provided by the platform. @@ -41,7 +41,7 @@ sum_over_time( # Convert CPU request to their memory equivalent. sum by(cluster_id, namespace, label_appuio_io_node_class) ( kube_pod_container_resource_requests{resource="cpu"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""}) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""})) # Build that ratio from static values * on(cluster_id) group_left()( # Build a time series of ratio for Cloudscale LPG 2 (4096 MiB/core) @@ -52,7 +52,7 @@ sum_over_time( ) # Subtract memory request - sum by(cluster_id, namespace, label_appuio_io_node_class) (kube_pod_container_resource_requests{resource="memory"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""}) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""})) # Only values above zero are in violation. ), 0) ) diff --git a/pkg/db/seeds/appuio_cloud_memory_sub_cpu.promql b/pkg/db/seeds/appuio_cloud_memory_sub_cpu.promql index 44f0a15..aaa5f21 100644 --- a/pkg/db/seeds/appuio_cloud_memory_sub_cpu.promql +++ b/pkg/db/seeds/appuio_cloud_memory_sub_cpu.promql @@ -1,3 +1,5 @@ +# Calculates CPU requests higher than memory requests respecting the fair-use ratio + # Sum values over one hour. sum_over_time( # Average over a one-minute time frame. @@ -14,7 +16,7 @@ sum_over_time( sum by(cluster_id, namespace, label_appuio_io_node_class) ( # Get the CPU requests kube_pod_container_resource_requests{resource="cpu"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""}) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""})) # Convert them to their memory equivalent by multiplying them by the memory to CPU ratio # Build that ratio from static values * on(cluster_id) group_left()( @@ -25,7 +27,7 @@ sum_over_time( ) ) - sum by(cluster_id, namespace, label_appuio_io_node_class) (kube_pod_container_resource_requests{resource="memory"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))) ) * # Join namespace label `label_appuio_io_organization` as `tenant_id`. diff --git a/pkg/db/seeds/appuio_cloud_memory_sub_memory.promql b/pkg/db/seeds/appuio_cloud_memory_sub_memory.promql index 679ba90..85f80bb 100644 --- a/pkg/db/seeds/appuio_cloud_memory_sub_memory.promql +++ b/pkg/db/seeds/appuio_cloud_memory_sub_memory.promql @@ -1,3 +1,5 @@ +# Calculates memory requests higher than the real memory usage + # Sum values over one hour. sum_over_time( # Average over a one-minute time frame. @@ -14,11 +16,11 @@ sum_over_time( clamp_min( sum by(cluster_id, namespace, label_appuio_io_node_class) (kube_pod_container_resource_requests{resource="memory"} * on(uid, cluster_id, pod, namespace) group_left kube_pod_status_phase{phase="Running"} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})), + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))), 128 * 1024 * 1024 ) - sum by(cluster_id, namespace, label_appuio_io_node_class) (container_memory_working_set_bytes{image!=""} - * on(node) group_left(label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(node) kube_node_labels{label_appuio_io_node_class=""})) + * on(cluster_id, node) group_left(label_appuio_io_node_class) (min by(cluster_id, node, label_appuio_io_node_class) (kube_node_labels{label_appuio_io_node_class!=""} or on(cluster_id, node) kube_node_labels{label_appuio_io_node_class=""}))) ), 0 ) diff --git a/pkg/db/seeds/promtest/appuio_cloud_memory.jsonnet b/pkg/db/seeds/promtest/appuio_cloud_memory.jsonnet new file mode 100644 index 0000000..8003590 --- /dev/null +++ b/pkg/db/seeds/promtest/appuio_cloud_memory.jsonnet @@ -0,0 +1,211 @@ +local c = import 'common.libsonnet'; + +local query = importstr '../appuio_cloud_memory.promql'; +local subCPUQuery = importstr '../appuio_cloud_memory_sub_cpu.promql'; +local subMemoryQuery = importstr '../appuio_cloud_memory_sub_memory.promql'; + +local commonLabels = { + cluster_id: 'c-appuio-cloudscale-lpg-2', + tenant_id: 'c-appuio-cloudscale-lpg-2', +}; + +// One running pod, minimal (=1 byte) memory request and usage, no CPU request +// 10 samples +local baseSeries = { + flexNodeLabel: c.series('kube_node_labels', commonLabels { + label_appuio_io_node_class: 'flex', + label_kubernetes_io_hostname: 'flex-x666', + node: 'flex-x666', + }, '1x10'), + testprojectNamespaceOrgLabel: c.series('kube_namespace_labels', commonLabels { + namespace: 'testproject', + label_appuio_io_organization: 'cherry-pickers-inc', + }, '1x10'), + + local podLbls = commonLabels { + namespace: 'testproject', + pod: 'running-pod', + uid: '35e3a8b1-b46d-496c-b2b7-1b52953bf904', + }, + // Phases + runningPodPhase: c.series('kube_pod_status_phase', podLbls { + phase: 'Running', + }, '1x10'), + // Requests + runningPodMemoryRequests: c.series('kube_pod_container_resource_requests', podLbls { + resource: 'memory', + node: 'flex-x666', + }, '1x10'), + runningPodCPURequests: c.series('kube_pod_container_resource_requests', podLbls { + resource: 'cpu', + node: 'flex-x666', + }, '0x10'), + // Real usage + runningPodMemoryUsage: c.series('container_memory_working_set_bytes', podLbls { + image: 'busybox', + node: 'flex-x666', + }, '1x10'), +}; + +local baseCalculatedLabels = { + category: 'c-appuio-cloudscale-lpg-2:testproject', + cluster_id: 'c-appuio-cloudscale-lpg-2', + label_appuio_io_node_class: 'flex', + namespace: 'testproject', + product: 'appuio_cloud_memory:c-appuio-cloudscale-lpg-2:cherry-pickers-inc:testproject:flex', + tenant_id: 'cherry-pickers-inc', +}; + +// Constants from the query +local minMemoryRequestMib = 128; +local cloudscaleFairUseRatio = 4294967296; + +{ + tests: [ + c.test('minimal pod', + baseSeries, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + value: minMemoryRequestMib * 10, + }), + c.test('pod with higher memory usage', + baseSeries { + runningPodMemoryUsage+: { + values: '%sx10' % (500 * 1024 * 1024), + }, + }, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + value: 500 * 10, + }), + c.test('pod with higher memory requests', + baseSeries { + runningPodMemoryRequests+: { + values: '%sx10' % (500 * 1024 * 1024), + }, + }, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + value: 500 * 10, + }), + c.test('pod with CPU requests violating fair use', + baseSeries { + runningPodCPURequests+: { + values: '1x10', + }, + }, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + // See per cluster fair use ratio in query + // value: 2.048E+04, + value: (cloudscaleFairUseRatio / 1024 / 1024) * 10, + }), + c.test('non-running pods are not counted', + baseSeries { + local lbls = commonLabels { + namespace: 'testproject', + pod: 'succeeded-pod', + uid: '2a7a6e32-0840-4ac3-bab4-52d7e16f4a0a', + }, + succeededPodPhase: c.series('kube_pod_status_phase', lbls { + phase: 'Succeeded', + }, '1x10'), + succeededPodMemoryRequests: c.series('kube_pod_container_resource_requests', lbls { + resource: 'memory', + node: 'flex-x666', + }, '1x10'), + succeededPodCPURequests: c.series('kube_pod_container_resource_requests', lbls { + node: 'flex-x666', + resource: 'cpu', + }, '1x10'), + }, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + value: minMemoryRequestMib * 10, + }), + c.test('unrelated kube node label changes do not throw errors - there is an overlap since series go stale only after a few missed scrapes', + baseSeries { + flexNodeLabel+: { + _labels+:: { + label_csi_driver_id: 'A09B8DDE-5435-4D74-923C-4866513E8F02', + }, + values: '1x10 _x10 stale', + }, + flexNodeLabelUpdated: self.flexNodeLabel { + _labels+:: { + label_csi_driver_id: '18539CC3-0B6C-4E72-82BD-90A9BEF7D807', + }, + values: '_x5 1x15', + }, + }, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + value: minMemoryRequestMib * 10, + }), + c.test('unrelated kube node label adds do not throw errors - there is an overlap since series go stale only after a few missed scrapes', + baseSeries { + flexNodeLabel+: { + values: '1x10 _x10 stale', + }, + flexNodeLabelUpdated: self.flexNodeLabel { + _labels+:: { + label_csi_driver_id: '18539CC3-0B6C-4E72-82BD-90A9BEF7D807', + }, + values: '_x5 1x15', + }, + }, + query, + { + labels: c.formatLabels(baseCalculatedLabels), + value: minMemoryRequestMib * 10, + }), + c.test('node class adds do not throw errors - there is an overlap since series go stale only after a few missed scrapes', + baseSeries { + flexNodeLabel+: { + _labels+:: { + label_appuio_io_node_class:: null, + }, + values: '1x10 _x10 stale', + }, + flexNodeLabelUpdated: super.flexNodeLabel { + values: '_x5 1x15', + }, + }, + query, + [ + // I'm not sure why this is 11 * minMemoryRequestMib, might have something to do with the intervals or intra minute switching + { + labels: c.formatLabels(baseCalculatedLabels), + value: minMemoryRequestMib * 8, + }, + { + labels: c.formatLabels(baseCalculatedLabels { + label_appuio_io_node_class:: null, + product: 'appuio_cloud_memory:c-appuio-cloudscale-lpg-2:cherry-pickers-inc:testproject:', + }), + value: minMemoryRequestMib * 3, + }, + ]), + + c.test('sub CPU requests query sanity check', + baseSeries, + subCPUQuery, + { + labels: c.formatLabels(baseCalculatedLabels), + value: 0, + }), + c.test('sub memory requests query sanity check', + baseSeries, + subMemoryQuery, + { + labels: c.formatLabels(baseCalculatedLabels), + value: (minMemoryRequestMib - (1 / 1024 / 1024)) * 10, + }), + ], +} diff --git a/pkg/db/seeds/promtest/common.libsonnet b/pkg/db/seeds/promtest/common.libsonnet index 0544b44..b3dd5e7 100644 --- a/pkg/db/seeds/promtest/common.libsonnet +++ b/pkg/db/seeds/promtest/common.libsonnet @@ -1,13 +1,33 @@ local formatLabels = function(labels) - local lf = std.join(', ', std.map(function(l) '%s="%s"' % [ l, labels[l] ], std.objectFields(labels))); - "{%s}" % [ lf ]; + local lf = std.join(', ', std.map(function(l) '%s="%s"' % [l, labels[l]], std.objectFields(labels))); + '{%s}' % [lf]; +// returns a series object with correctly formatted labels. +// labels can be modified post creation using `_labels`. local series = function(name, labels, values) { - series: name+formatLabels(labels), + _name:: name, + _labels:: labels, + series: self._name + formatLabels(self._labels), values: values, }; +// returns a test object with the given series and samples. Sample interval is 30s +// the evaluation time is set one hour in the future since all our queries operate on a 1h window +local test = function(name, series, query, samples) { + name: name, + interval: '30s', + input_series: if std.isArray(series) then series else std.objectValues(series), + promql_expr_test: [ + { + expr: query, + eval_time: '1h', + exp_samples: if std.isArray(samples) then samples else [samples], + }, + ], +}; + { series: series, formatLabels: formatLabels, + test: test, } diff --git a/pkg/db/seeds/promtest/query.jsonnet b/pkg/db/seeds/promtest/query.jsonnet deleted file mode 100644 index 94665ac..0000000 --- a/pkg/db/seeds/promtest/query.jsonnet +++ /dev/null @@ -1,105 +0,0 @@ -local c = import 'common.libsonnet'; - -local query = importstr '../appuio_cloud_memory.promql'; - -local commonLabels = { - cluster_id: 'c-appuio-cloudscale-lpg-2', - tenant_id: 'c-appuio-cloudscale-lpg-2', -}; - -{ - tests: [ - { - interval: '30s', - local runningUID = '35e3a8b1-b46d-496c-b2b7-1b52953bf904', - local succeededUID = '2a7a6e32-0840-4ac3-bab4-52d7e16f4a0a', - input_series: [ - c.series('kube_node_labels', commonLabels { - label_appuio_io_node_class: 'flex', - label_kubernetes_io_hostname: 'flex-x666', - node: 'flex-x666', - }, '1+0x10'), - c.series('kube_namespace_labels', commonLabels { - namespace: 'testproject', - label_appuio_io_organization: 'cherry-pickers-inc', - }, '1+0x10'), - // Phases - c.series('kube_pod_status_phase', commonLabels { - namespace: 'testproject', - phase: 'Succeeded', - pod: 'succeeded-pod', - uid: succeededUID, - }, '1+0x10'), - c.series('kube_pod_status_phase', commonLabels { - namespace: 'testproject', - phase: 'Running', - pod: 'running-pod', - uid: runningUID, - }, '1+0x10'), - // Requests - c.series('kube_pod_container_resource_requests', commonLabels { - namespace: 'testproject', - pod: 'succeeded-pod', - resource: 'memory', - node: 'flex-x666', - uid: succeededUID, - }, '1+0x10'), - c.series('kube_pod_container_resource_requests', commonLabels { - namespace: 'testproject', - pod: 'running-pod', - resource: 'memory', - node: 'flex-x666', - uid: runningUID, - }, '1+0x10'), - c.series('kube_pod_container_resource_requests', commonLabels { - namespace: 'testproject', - pod: 'succeeded-pod', - node: 'flex-x666', - resource: 'cpu', - uid: succeededUID, - }, '0+0x10'), - c.series('kube_pod_container_resource_requests', commonLabels { - namespace: 'testproject', - pod: 'running-pod', - node: 'flex-x666', - resource: 'cpu', - uid: runningUID, - }, '0+0x10'), - // Real usage - c.series('container_memory_working_set_bytes', commonLabels { - image: 'busybox', - namespace: 'testproject', - pod: 'succeeded-pod', - node: 'flex-x666', - uid: succeededUID, - }, '1+0x10'), - c.series('container_memory_working_set_bytes', commonLabels { - image: 'busybox', - namespace: 'testproject', - pod: 'running-pod', - node: 'flex-x666', - uid: runningUID, - }, '1+0x10'), - ], - promql_expr_test: [ - { - expr: query, - eval_time: '1h', - exp_samples: [ - { - labels: c.formatLabels({ - category: 'c-appuio-cloudscale-lpg-2:testproject', - cluster_id: 'c-appuio-cloudscale-lpg-2', - label_appuio_io_node_class: 'flex', - namespace: 'testproject', - product: 'appuio_cloud_memory:c-appuio-cloudscale-lpg-2:cherry-pickers-inc:testproject:flex', - tenant_id: 'cherry-pickers-inc', - }), - value: 128 * 10, - }, - ], - }, - ], - }, - ], -} diff --git a/pkg/db/seeds/queries_test.go b/pkg/db/seeds/queries_test.go index 0042eb9..d884d47 100644 --- a/pkg/db/seeds/queries_test.go +++ b/pkg/db/seeds/queries_test.go @@ -38,10 +38,10 @@ func runPromtool(t *testing.T, tmp string) { cmd.Stdout = &stdout assert.NoError(t, cmd.Run()) // Not using t.Log to keep formatting sane - fmt.Println("STDERR") - fmt.Println(stderr.String()) fmt.Println("STDOUT") fmt.Println(stdout.String()) + fmt.Println("STDERR") + fmt.Println(stderr.String()) } func renderJsonnet(t *testing.T, tFile string) string {