From 88b6bce567993423a06d13bfe4c11aba0fad300a Mon Sep 17 00:00:00 2001
From: Arun Kumar Mohan
Date: Wed, 22 Sep 2021 19:58:35 +0530
Subject: [PATCH 1/5] Adding 'namespace' to the 'ceph_node_down' query

Signed-off-by: Arun Kumar Mohan
---
 extras/manifests/prometheus-ceph-rules.yaml | 2 +-
 rules/rules.libsonnet                       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml
index e590291..ae18032 100644
--- a/extras/manifests/prometheus-ceph-rules.yaml
+++ b/extras/manifests/prometheus-ceph-rules.yaml
@@ -11,7 +11,7 @@ spec:
   - name: ceph.rules
     rules:
     - expr: |
-        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
+        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace)
       record: cluster:ceph_node_down:join_kube
     - expr: |
        avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
diff --git a/rules/rules.libsonnet b/rules/rules.libsonnet
index af5d821..7cb199f 100644
--- a/rules/rules.libsonnet
+++ b/rules/rules.libsonnet
@@ -7,7 +7,7 @@
     {
       record: 'cluster:ceph_node_down:join_kube',
      expr: |||
-        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node)
+        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node, namespace)
      ||| % $._config,
     },
     {
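
Why 'namespace' belongs in the outer 'by' clause: 'max(...) by (node)' drops
every other label, including 'namespace', so the joined
cluster:ceph_node_down:join_kube record could not tell two Rook clusters on
the same node apart. A sketch of the aggregation half of the rule (the series
and label values below are hypothetical):

    # Hypothetical input series, two Rook clusters scraping the same node:
    #   ceph_disk_occupation{exported_instance="node-1", namespace="rook-a", ...}
    #   ceph_disk_occupation{exported_instance="node-1", namespace="rook-b", ...}
    #
    # Aggregating by (node) alone would collapse both clusters into one series;
    # by (node, namespace) keeps one result per cluster, and since group_right()
    # takes its output labels from this (many) side, the namespace label
    # survives into the recorded metric.
    max(
      label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},
        "node", "$1", "exported_instance", "(.*)")
    ) by (node, namespace)
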
From 922f224a26217292504b2688848d0ed943f723e4 Mon Sep 17 00:00:00 2001
From: aruniiird
Date: Thu, 30 Sep 2021 01:24:57 +0530
Subject: [PATCH 2/5] Change CephAbsentMgr to use 'up' query

Instead of the 'absent()' query, use 'up == 0', which carries the needed
'namespace' label in the resulting metrics.

Signed-off-by: aruniiird
---
 alerts/absent_alerts.libsonnet              | 2 +-
 extras/manifests/prometheus-ceph-rules.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet
index e8630e7..b032117 100644
--- a/alerts/absent_alerts.libsonnet
+++ b/alerts/absent_alerts.libsonnet
@@ -7,7 +7,7 @@
       {
         alert: 'CephMgrIsAbsent',
         expr: |||
-          absent(up{%(cephExporterSelector)s} == 1)
+          up{%(cephExporterSelector)s} == 0
         ||| % $._config,
         'for': $._config.mgrIsAbsentAlertTime,
         labels: {
diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml
index ae18032..cc46355 100644
--- a/extras/manifests/prometheus-ceph-rules.yaml
+++ b/extras/manifests/prometheus-ceph-rules.yaml
@@ -42,7 +42,7 @@ spec:
           severity_level: critical
           storage_type: ceph
         expr: |
-          absent(up{job="rook-ceph-mgr"} == 1)
+          up{job="rook-ceph-mgr"} == 0
         for: 5m
         labels:
           severity: critical

From d99d6d55c6cf6d88952c3e017a9557fde1a5a29a Mon Sep 17 00:00:00 2001
From: Arun Kumar Mohan
Date: Fri, 1 Oct 2021 15:50:44 +0530
Subject: [PATCH 3/5] Adding 'namespace' field to other alert queries

Signed-off-by: Arun Kumar Mohan
---
 alerts/absent_alerts.libsonnet              | 4 ++--
 alerts/monquorum.libsonnet                  | 2 +-
 alerts/state.libsonnet                      | 2 +-
 extras/manifests/prometheus-ceph-rules.yaml | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet
index b032117..b2da4ab 100644
--- a/alerts/absent_alerts.libsonnet
+++ b/alerts/absent_alerts.libsonnet
@@ -23,7 +23,7 @@
       {
         alert: 'CephMgrIsMissingReplicas',
         expr: |||
-          sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d
+          sum(up{%(cephExporterSelector)s}) by (namespace) < %(cephMgrCount)d
         ||| % $._config,
         'for': $._config.mgrMissingReplicasAlertTime,
         labels: {
@@ -44,7 +44,7 @@
       {
         alert: 'CephMdsMissingReplicas',
         expr: |||
-          sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) < %(cephMdsCount)d
+          sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) by (namespace) < %(cephMdsCount)d
         ||| % $._config,
         'for': $._config.mdsMissingReplicasAlertTime,
         labels: {
diff --git a/alerts/monquorum.libsonnet b/alerts/monquorum.libsonnet
index b86a8e9..33a6da3 100644
--- a/alerts/monquorum.libsonnet
+++ b/alerts/monquorum.libsonnet
@@ -7,7 +7,7 @@
       {
         alert: 'CephMonQuorumAtRisk',
         expr: |||
-          count(ceph_mon_quorum_status{%s} == 1) <= (floor(count(ceph_mon_metadata{%s}) / 2) + 1)
+          count(ceph_mon_quorum_status{%s} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{%s}) by (namespace) / 2) + 1)
         ||| % [$._config.cephExporterSelector, $._config.cephExporterSelector],
         'for': $._config.monQuorumAlertTime,
         labels: {
diff --git a/alerts/state.libsonnet b/alerts/state.libsonnet
index 1468da7..5300187 100644
--- a/alerts/state.libsonnet
+++ b/alerts/state.libsonnet
@@ -39,7 +39,7 @@
       {
         alert: 'CephOSDVersionMismatch',
         expr: |||
-          count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version)) > 1
+          count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version, namespace)) by (namespace) > 1
         ||| % $._config,
         'for': $._config.clusterVersionAlertTime,
         labels: {
diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml
index cc46355..cc4b370 100644
--- a/extras/manifests/prometheus-ceph-rules.yaml
+++ b/extras/manifests/prometheus-ceph-rules.yaml
@@ -53,7 +53,7 @@ spec:
           severity_level: warning
           storage_type: ceph
         expr: |
-          sum(up{job="rook-ceph-mgr"}) < 1
+          sum(up{job="rook-ceph-mgr"}) by (namespace) < 1
         for: 5m
         labels:
           severity: warning
@@ -66,7 +66,7 @@ spec:
           severity_level: warning
           storage_type: ceph
         expr: |
-          sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
+          sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
         for: 5m
         labels:
           severity: warning
@@ -79,7 +79,7 @@ spec:
           severity_level: error
           storage_type: ceph
         expr: |
-          count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) / 2) + 1)
+          count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
         for: 15m
         labels:
           severity: critical
@@ -252,7 +252,7 @@ spec:
           severity_level: warning
           storage_type: ceph
         expr: |
-          count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
+          count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (namespace) > 1
         for: 10m
         labels:
           severity: warning
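
Note the 'CephOSDVersionMismatch' outer aggregation above is written as
'by (namespace)': counting the per-(version, namespace) groups again by the
same (ceph_version, namespace) pair would always yield 1 and the '> 1'
comparison could never fire. The 'absent()'-to-'up' swap in patch 2 has a
trade-off worth spelling out as well (label values below are hypothetical):

    # absent() yields a single synthetic, labelless series, so the alert can
    # never carry a namespace to route on:
    absent(up{job="rook-ceph-mgr"} == 1)
    #   => {}  1

    # up == 0 keeps all of the target's labels, at the cost of staying silent
    # if the scrape target disappears from service discovery altogether:
    up{job="rook-ceph-mgr"} == 0
    #   => up{job="rook-ceph-mgr", namespace="rook-a", instance="..."}  0
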
From 5054c43987cb585fb7bd7235b2d2cb193aba0b13 Mon Sep 17 00:00:00 2001
From: aruniiird
Date: Fri, 1 Oct 2021 01:23:52 +0530
Subject: [PATCH 4/5] Increasing the auto-resolvable alerts' delay to 15m

The following alerts,

  CephMonHighNumberOfLeaderChanges
  CephOSDDiskNotResponding
  CephClusterWarningState

which in most cases resolve automatically, are causing unnecessary admin
events. So we are increasing the alert delay time to '15m'.

Signed-off-by: aruniiird
---
 alerts/osd.libsonnet                        | 4 ++--
 alerts/state.libsonnet                      | 2 +-
 config.libsonnet                            | 5 ++++-
 extras/manifests/prometheus-ceph-rules.yaml | 6 +++---
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/alerts/osd.libsonnet b/alerts/osd.libsonnet
index 57a11f5..e202c1b 100644
--- a/alerts/osd.libsonnet
+++ b/alerts/osd.libsonnet
@@ -57,7 +57,7 @@
         expr: |||
           label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
         ||| % $._config,
-        'for': $._config.osdDiskAlertTime,
+        'for': $._config.osdDiskNotRespondingTime,
         labels: {
           severity: 'critical',
         },
@@ -73,7 +73,7 @@
         expr: |||
           label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
         ||| % $._config,
-        'for': $._config.osdDiskAlertTime,
+        'for': $._config.osdDiskUnavailableTime,
         labels: {
           severity: 'critical',
         },
diff --git a/alerts/state.libsonnet b/alerts/state.libsonnet
index 5300187..acd0cf8 100644
--- a/alerts/state.libsonnet
+++ b/alerts/state.libsonnet
@@ -25,7 +25,7 @@
         expr: |||
           ceph_health_status{%(cephExporterSelector)s} == 1
         ||| % $._config,
-        'for': $._config.clusterStateAlertTime,
+        'for': $._config.clusterWarningStateAlertTime,
         labels: {
           severity: 'warning',
         },
diff --git a/config.libsonnet b/config.libsonnet
index 248732f..a13b5b6 100644
--- a/config.libsonnet
+++ b/config.libsonnet
@@ -11,15 +11,18 @@
     // Duration to raise various Alerts
     cephNodeDownAlertTime: '30s',
     clusterStateAlertTime: '10m',
+    clusterWarningStateAlertTime: '15m',
     clusterVersionAlertTime: '10m',
     clusterUtilizationAlertTime: '5s',
     clusterReadOnlyAlertTime: '0s',
     poolQuotaUtilizationAlertTime: '1m',
     monQuorumAlertTime: '15m',
-    monQuorumLeaderChangesAlertTime: '5m',
+    monQuorumLeaderChangesAlertTime: '15m',
     osdDataRebalanceAlertTime: '15s',
     osdDataRecoveryAlertTime: '2h',
     osdDataRecoveryInProgressAlertTime: '30s',
+    osdDiskNotRespondingTime: '15m',
+    osdDiskUnavailableTime: '1m',
     osdDiskAlertTime: '1m',
     osdDownAlertTime: '5m',
     osdFlapAlertTime: '0s',
diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml
index cc4b370..a5518c7 100644
--- a/extras/manifests/prometheus-ceph-rules.yaml
+++ b/extras/manifests/prometheus-ceph-rules.yaml
@@ -91,7 +91,7 @@ spec:
           storage_type: ceph
         expr: |
           (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
-        for: 5m
+        for: 15m
         labels:
           severity: warning
   - name: ceph-node-alert.rules
@@ -150,7 +150,7 @@ spec:
           storage_type: ceph
         expr: |
           label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
-        for: 1m
+        for: 15m
         labels:
           severity: critical
     - alert: CephOSDDiskUnavailable
@@ -242,7 +242,7 @@ spec:
           storage_type: ceph
         expr: |
           ceph_health_status{job="rook-ceph-mgr"} == 1
-        for: 10m
+        for: 15m
         labels:
           severity: warning
     - alert: CephOSDVersionMismatch
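
The new duration keys follow the mixin's usual '_config' pattern, so
downstream users can tune them without editing the alert definitions. A
minimal jsonnet sketch, assuming the mixin is vendored under 'ceph-mixin'
(the import path and override values are illustrative, not part of this
patch):

    // example.libsonnet (hypothetical consumer file)
    local ceph = import 'ceph-mixin/mixin.libsonnet';

    ceph {
      _config+:: {
        // Override the auto-resolvable alert delays introduced above.
        osdDiskNotRespondingTime: '30m',
        clusterWarningStateAlertTime: '20m',
      },
    }
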
From 922df79ac14668a144ff4e9bbb4c4426c7621309 Mon Sep 17 00:00:00 2001
From: Arun Kumar Mohan
Date: Mon, 4 Oct 2021 09:18:08 +0530
Subject: [PATCH 5/5] Reverting the time delay of
 'CephMonHighNumberOfLeaderChanges'

Reverting the time delay of 'CephMonHighNumberOfLeaderChanges' back to 5m.

Signed-off-by: Arun Kumar Mohan
---
 config.libsonnet                            | 2 +-
 extras/manifests/prometheus-ceph-rules.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/config.libsonnet b/config.libsonnet
index a13b5b6..e7fb537 100644
--- a/config.libsonnet
+++ b/config.libsonnet
@@ -17,7 +17,7 @@
     clusterReadOnlyAlertTime: '0s',
     poolQuotaUtilizationAlertTime: '1m',
     monQuorumAlertTime: '15m',
-    monQuorumLeaderChangesAlertTime: '15m',
+    monQuorumLeaderChangesAlertTime: '5m',
     osdDataRebalanceAlertTime: '15s',
     osdDataRecoveryAlertTime: '2h',
     osdDataRecoveryInProgressAlertTime: '30s',
diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml
index a5518c7..336129c 100644
--- a/extras/manifests/prometheus-ceph-rules.yaml
+++ b/extras/manifests/prometheus-ceph-rules.yaml
@@ -91,7 +91,7 @@ spec:
           storage_type: ceph
         expr: |
           (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
-        for: 15m
+        for: 5m
         labels:
           severity: warning
   - name: ceph-node-alert.rules
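
One way to check that the 'namespace' label now propagates end to end is a
promtool unit test. The sketch below assumes the rules have been extracted
from the PrometheusRule object into a plain rule file named ceph-rules.yaml;
the test file name and input series values are hypothetical:

    # ceph_mgr_absent_test.yaml
    # Run with: promtool test rules ceph_mgr_absent_test.yaml
    rule_files:
      - ceph-rules.yaml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # Mgr target present but down in namespace "rook-a" for 10 minutes.
          - series: 'up{job="rook-ceph-mgr", namespace="rook-a", instance="mgr:9283"}'
            values: '0x10'
        alert_rule_test:
          - eval_time: 10m
            alertname: CephMgrIsAbsent
            exp_alerts:
              # The alert keeps the series labels, including namespace.
              - exp_labels:
                  job: rook-ceph-mgr
                  namespace: rook-a
                  instance: mgr:9283
                  severity: critical
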