diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet
index e8630e7..b2da4ab 100644
--- a/alerts/absent_alerts.libsonnet
+++ b/alerts/absent_alerts.libsonnet
@@ -7,7 +7,7 @@
           {
             alert: 'CephMgrIsAbsent',
             expr: |||
-              absent(up{%(cephExporterSelector)s} == 1)
+              up{%(cephExporterSelector)s} == 0
             ||| % $._config,
             'for': $._config.mgrIsAbsentAlertTime,
             labels: {
@@ -23,7 +23,7 @@
           {
             alert: 'CephMgrIsMissingReplicas',
             expr: |||
-              sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d
+              sum(up{%(cephExporterSelector)s}) by (namespace) < %(cephMgrCount)d
             ||| % $._config,
             'for': $._config.mgrMissingReplicasAlertTime,
             labels: {
@@ -44,7 +44,7 @@
           {
             alert: 'CephMdsMissingReplicas',
             expr: |||
-              sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) < %(cephMdsCount)d
+              sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) by (namespace) < %(cephMdsCount)d
             ||| % $._config,
             'for': $._config.mdsMissingReplicasAlertTime,
             labels: {
diff --git a/alerts/monquorum.libsonnet b/alerts/monquorum.libsonnet
index b86a8e9..33a6da3 100644
--- a/alerts/monquorum.libsonnet
+++ b/alerts/monquorum.libsonnet
@@ -7,7 +7,7 @@
           {
             alert: 'CephMonQuorumAtRisk',
             expr: |||
-              count(ceph_mon_quorum_status{%s} == 1) <= (floor(count(ceph_mon_metadata{%s}) / 2) + 1)
+              count(ceph_mon_quorum_status{%s} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{%s}) by (namespace) / 2) + 1)
             ||| % [$._config.cephExporterSelector, $._config.cephExporterSelector],
             'for': $._config.monQuorumAlertTime,
             labels: {
diff --git a/alerts/osd.libsonnet b/alerts/osd.libsonnet
index 57a11f5..e202c1b 100644
--- a/alerts/osd.libsonnet
+++ b/alerts/osd.libsonnet
@@ -57,7 +57,7 @@
             expr: |||
               label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
             ||| % $._config,
-            'for': $._config.osdDiskAlertTime,
+            'for': $._config.osdDiskNotRespondingTime,
             labels: {
               severity: 'critical',
             },
@@ -73,7 +73,7 @@
             expr: |||
               label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
             ||| % $._config,
-            'for': $._config.osdDiskAlertTime,
+            'for': $._config.osdDiskUnavailableTime,
             labels: {
               severity: 'critical',
             },
diff --git a/alerts/state.libsonnet b/alerts/state.libsonnet
index 1468da7..acd0cf8 100644
--- a/alerts/state.libsonnet
+++ b/alerts/state.libsonnet
@@ -25,7 +25,7 @@
             expr: |||
               ceph_health_status{%(cephExporterSelector)s} == 1
             ||| % $._config,
-            'for': $._config.clusterStateAlertTime,
+            'for': $._config.clusterWarningStateAlertTime,
             labels: {
               severity: 'warning',
             },
@@ -39,7 +39,7 @@
           {
             alert: 'CephOSDVersionMismatch',
             expr: |||
-              count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version)) > 1
+              count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version, namespace)) by (namespace) > 1
             ||| % $._config,
             'for': $._config.clusterVersionAlertTime,
             labels: {
diff --git a/config.libsonnet b/config.libsonnet
index 248732f..e7fb537 100644
--- a/config.libsonnet
+++ b/config.libsonnet
@@ -11,6 +11,7 @@
     // Duration to raise various Alerts
     cephNodeDownAlertTime: '30s',
     clusterStateAlertTime: '10m',
+    clusterWarningStateAlertTime: '15m',
     clusterVersionAlertTime: '10m',
     clusterUtilizationAlertTime: '5s',
     clusterReadOnlyAlertTime: '0s',
@@ -20,6 +21,8 @@
     osdDataRebalanceAlertTime: '15s',
     osdDataRecoveryAlertTime: '2h',
     osdDataRecoveryInProgressAlertTime: '30s',
+    osdDiskNotRespondingTime: '15m',
+    osdDiskUnavailableTime: '1m',
     osdDiskAlertTime: '1m',
     osdDownAlertTime: '5m',
     osdFlapAlertTime: '0s',
diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml
index e590291..336129c 100644
--- a/extras/manifests/prometheus-ceph-rules.yaml
+++ b/extras/manifests/prometheus-ceph-rules.yaml
@@ -11,7 +11,7 @@ spec:
   - name: ceph.rules
     rules:
     - expr: |
-        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
+        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace)
       record: cluster:ceph_node_down:join_kube
     - expr: |
         avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
@@ -42,7 +42,7 @@
         severity_level: critical
         storage_type: ceph
       expr: |
-        absent(up{job="rook-ceph-mgr"} == 1)
+        up{job="rook-ceph-mgr"} == 0
       for: 5m
       labels:
         severity: critical
@@ -53,7 +53,7 @@
         severity_level: warning
         storage_type: ceph
       expr: |
-        sum(up{job="rook-ceph-mgr"}) < 1
+        sum(up{job="rook-ceph-mgr"}) by (namespace) < 1
       for: 5m
       labels:
         severity: warning
@@ -66,7 +66,7 @@
         severity_level: warning
         storage_type: ceph
       expr: |
-        sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
+        sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
       for: 5m
       labels:
         severity: warning
@@ -79,7 +79,7 @@
         severity_level: error
         storage_type: ceph
       expr: |
-        count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) / 2) + 1)
+        count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
       for: 15m
       labels:
         severity: critical
@@ -150,7 +150,7 @@
         storage_type: ceph
       expr: |
        label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
-      for: 1m
+      for: 15m
       labels:
         severity: critical
     - alert: CephOSDDiskUnavailable
@@ -242,7 +242,7 @@
         storage_type: ceph
       expr: |
        ceph_health_status{job="rook-ceph-mgr"} == 1
-      for: 10m
+      for: 15m
       labels:
         severity: warning
     - alert: CephOSDVersionMismatch
@@ -252,7 +252,7 @@
         severity_level: warning
         storage_type: ceph
       expr: |
-        count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
+        count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (namespace) > 1
       for: 10m
       labels:
         severity: warning
diff --git a/rules/rules.libsonnet b/rules/rules.libsonnet
index af5d821..7cb199f 100644
--- a/rules/rules.libsonnet
+++ b/rules/rules.libsonnet
@@ -7,7 +7,7 @@
           {
             record: 'cluster:ceph_node_down:join_kube',
             expr: |||
-              kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node)
+              kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node, namespace)
             ||| % $._config,
           },
           {
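
For reference, the %(...)s placeholders in the libsonnet expressions are filled from $._config when the mixin is rendered into the manifest under extras/manifests/. The snippet below is a minimal sketch (not part of the diff) of how one namespace-aware expression expands; the field name is illustrative, and the selector and count values are assumptions chosen to match the generated prometheus-ceph-rules.yaml above.

// Minimal sketch, runnable with the jsonnet CLI.
// The config values are assumptions, not taken from the diff itself.
local config = {
  cephExporterSelector: 'job="rook-ceph-mgr"',
  cephMgrCount: 1,
};

{
  // Renders to: sum(up{job="rook-ceph-mgr"}) by (namespace) < 1
  mgrMissingReplicasExpr: |||
    sum(up{%(cephExporterSelector)s}) by (namespace) < %(cephMgrCount)d
  ||| % config,
}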