From d99d6d55c6cf6d88952c3e017a9557fde1a5a29a Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Fri, 1 Oct 2021 15:50:44 +0530 Subject: [PATCH 1/3] Adding namespace field into other alert queries Signed-off-by: Arun Kumar Mohan --- alerts/absent_alerts.libsonnet | 4 ++-- alerts/monquorum.libsonnet | 2 +- alerts/state.libsonnet | 2 +- extras/manifests/prometheus-ceph-rules.yaml | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet index b032117..b2da4ab 100644 --- a/alerts/absent_alerts.libsonnet +++ b/alerts/absent_alerts.libsonnet @@ -23,7 +23,7 @@ { alert: 'CephMgrIsMissingReplicas', expr: ||| - sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d + sum(up{%(cephExporterSelector)s}) by (namespace) < %(cephMgrCount)d ||| % $._config, 'for': $._config.mgrMissingReplicasAlertTime, labels: { @@ -44,7 +44,7 @@ { alert: 'CephMdsMissingReplicas', expr: ||| - sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) < %(cephMdsCount)d + sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) by (namespace) < %(cephMdsCount)d ||| % $._config, 'for': $._config.mdsMissingReplicasAlertTime, labels: { diff --git a/alerts/monquorum.libsonnet b/alerts/monquorum.libsonnet index b86a8e9..33a6da3 100644 --- a/alerts/monquorum.libsonnet +++ b/alerts/monquorum.libsonnet @@ -7,7 +7,7 @@ { alert: 'CephMonQuorumAtRisk', expr: ||| - count(ceph_mon_quorum_status{%s} == 1) <= (floor(count(ceph_mon_metadata{%s}) / 2) + 1) + count(ceph_mon_quorum_status{%s} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{%s}) by (namespace) / 2) + 1) ||| % [$._config.cephExporterSelector, $._config.cephExporterSelector], 'for': $._config.monQuorumAlertTime, labels: { diff --git a/alerts/state.libsonnet b/alerts/state.libsonnet index 1468da7..5300187 100644 --- a/alerts/state.libsonnet +++ b/alerts/state.libsonnet @@ -39,7 +39,7 @@ { alert: 'CephOSDVersionMismatch', expr: ||| - 
count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version)) > 1 + count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version, namespace)) by (namespace) > 1 ||| % $._config, 'for': $._config.clusterVersionAlertTime, labels: { diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index cc46355..cc4b370 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -53,7 +53,7 @@ spec: severity_level: warning storage_type: ceph expr: | - sum(up{job="rook-ceph-mgr"}) < 1 + sum(up{job="rook-ceph-mgr"}) by (namespace) < 1 for: 5m labels: severity: warning @@ -66,7 +66,7 @@ spec: severity_level: warning storage_type: ceph expr: | - sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2 + sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2 for: 5m labels: severity: warning @@ -79,7 +79,7 @@ spec: severity_level: error storage_type: ceph expr: | - count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) / 2) + 1) + count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1) for: 15m labels: severity: critical @@ -252,7 +252,7 @@ spec: severity_level: warning storage_type: ceph expr: | - count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1 + count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (namespace) > 1 for: 10m labels: severity: warning From 5054c43987cb585fb7bd7235b2d2cb193aba0b13 Mon Sep 17 00:00:00 2001 From: aruniiird Date: Fri, 1 Oct 2021 01:23:52 +0530 Subject: [PATCH 2/3] Increasing the auto-resolvable alerts' delay to 15m The following alerts, CephMonHighNumberOfLeaderChanges CephOSDDiskNotResponding CephClusterWarningState , which are resolved automatically, in most cases, are causing 
unnecessary admin events. So we are increasing the alert delay time to '15m'. Signed-off-by: aruniiird --- alerts/osd.libsonnet | 4 ++-- alerts/state.libsonnet | 2 +- config.libsonnet | 5 ++++- extras/manifests/prometheus-ceph-rules.yaml | 6 +++--- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/alerts/osd.libsonnet b/alerts/osd.libsonnet index 57a11f5..e202c1b 100644 --- a/alerts/osd.libsonnet +++ b/alerts/osd.libsonnet @@ -57,7 +57,7 @@ expr: ||| label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") ||| % $._config, - 'for': $._config.osdDiskAlertTime, + 'for': $._config.osdDiskNotRespondingTime, labels: { severity: 'critical', }, @@ -73,7 +73,7 @@ expr: ||| label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") ||| % $._config, - 'for': $._config.osdDiskAlertTime, + 'for': $._config.osdDiskUnavailableTime, labels: { severity: 'critical', }, diff --git a/alerts/state.libsonnet b/alerts/state.libsonnet index 5300187..acd0cf8 100644 --- a/alerts/state.libsonnet +++ b/alerts/state.libsonnet @@ -25,7 +25,7 @@ expr: ||| ceph_health_status{%(cephExporterSelector)s} == 1 ||| % $._config, - 'for': $._config.clusterStateAlertTime, + 'for': $._config.clusterWarningStateAlertTime, labels: { severity: 'warning', }, diff --git a/config.libsonnet b/config.libsonnet index 248732f..a13b5b6 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -11,15 +11,18 @@ // Duration to raise various Alerts cephNodeDownAlertTime: '30s', clusterStateAlertTime: '10m', + clusterWarningStateAlertTime: '15m', clusterVersionAlertTime: '10m', clusterUtilizationAlertTime: '5s', clusterReadOnlyAlertTime: '0s', poolQuotaUtilizationAlertTime: '1m', monQuorumAlertTime: '15m', - 
monQuorumLeaderChangesAlertTime: '5m', + monQuorumLeaderChangesAlertTime: '15m', osdDataRebalanceAlertTime: '15s', osdDataRecoveryAlertTime: '2h', osdDataRecoveryInProgressAlertTime: '30s', + osdDiskNotRespondingTime: '15m', + osdDiskUnavailableTime: '1m', osdDiskAlertTime: '1m', osdDownAlertTime: '5m', osdFlapAlertTime: '0s', diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index cc4b370..a5518c7 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -91,7 +91,7 @@ spec: storage_type: ceph expr: | (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95 - for: 5m + for: 15m labels: severity: warning - name: ceph-node-alert.rules @@ -150,7 +150,7 @@ spec: storage_type: ceph expr: | label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") - for: 1m + for: 15m labels: severity: critical - alert: CephOSDDiskUnavailable @@ -242,7 +242,7 @@ spec: storage_type: ceph expr: | ceph_health_status{job="rook-ceph-mgr"} == 1 - for: 10m + for: 15m labels: severity: warning - alert: CephOSDVersionMismatch From 922df79ac14668a144ff4e9bbb4c4426c7621309 Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Mon, 4 Oct 2021 09:18:08 +0530 Subject: [PATCH 3/3] Reverting the time delay of 'CephMonHighNumberOfLeaderChanges' Reverting the time delay of 'CephMonHighNumberOfLeaderChanges' back to 5m Signed-off-by: Arun Kumar Mohan --- config.libsonnet | 2 +- extras/manifests/prometheus-ceph-rules.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config.libsonnet b/config.libsonnet index a13b5b6..e7fb537 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -17,7 +17,7 @@ clusterReadOnlyAlertTime: '0s', 
poolQuotaUtilizationAlertTime: '1m', monQuorumAlertTime: '15m', - monQuorumLeaderChangesAlertTime: '15m', + monQuorumLeaderChangesAlertTime: '5m', osdDataRebalanceAlertTime: '15s', osdDataRecoveryAlertTime: '2h', osdDataRecoveryInProgressAlertTime: '30s', diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index a5518c7..336129c 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -91,7 +91,7 @@ spec: storage_type: ceph expr: | (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95 - for: 15m + for: 5m labels: severity: warning - name: ceph-node-alert.rules