From 88b6bce567993423a06d13bfe4c11aba0fad300a Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Wed, 22 Sep 2021 19:58:35 +0530 Subject: [PATCH 1/2] Adding 'namespace' to the 'ceph_node_down' query Signed-off-by: Arun Kumar Mohan --- extras/manifests/prometheus-ceph-rules.yaml | 2 +- rules/rules.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index e590291..ae18032 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -11,7 +11,7 @@ spec: - name: ceph.rules rules: - expr: | - kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node) + kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace) record: cluster:ceph_node_down:join_kube - expr: | avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m]))))) diff --git a/rules/rules.libsonnet b/rules/rules.libsonnet index af5d821..7cb199f 100644 --- a/rules/rules.libsonnet +++ b/rules/rules.libsonnet @@ -7,7 +7,7 @@ { record: 'cluster:ceph_node_down:join_kube', expr: ||| - kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() 
max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node) + kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node, namespace) ||| % $._config, }, { From 922f224a26217292504b2688848d0ed943f723e4 Mon Sep 17 00:00:00 2001 From: aruniiird Date: Thu, 30 Sep 2021 01:24:57 +0530 Subject: [PATCH 2/2] Change CephMgrIsAbsent to use 'up' query Instead of using the 'absent' query, use the 'up' query, which should provide the needed 'namespace' label in the resulting metrics Signed-off-by: aruniiird --- alerts/absent_alerts.libsonnet | 2 +- extras/manifests/prometheus-ceph-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet index e8630e7..b032117 100644 --- a/alerts/absent_alerts.libsonnet +++ b/alerts/absent_alerts.libsonnet @@ -7,7 +7,7 @@ { alert: 'CephMgrIsAbsent', expr: ||| - absent(up{%(cephExporterSelector)s} == 1) + up{%(cephExporterSelector)s} == 0 ||| % $._config, 'for': $._config.mgrIsAbsentAlertTime, labels: { diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index ae18032..cc46355 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -42,7 +42,7 @@ spec: severity_level: critical storage_type: ceph expr: | - absent(up{job="rook-ceph-mgr"} == 1) + up{job="rook-ceph-mgr"} == 0 for: 5m labels: severity: critical