diff --git a/alerts/osd.libsonnet b/alerts/osd.libsonnet index 57a11f5..e202c1b 100644 --- a/alerts/osd.libsonnet +++ b/alerts/osd.libsonnet @@ -57,7 +57,7 @@ expr: ||| label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") ||| % $._config, - 'for': $._config.osdDiskAlertTime, + 'for': $._config.osdDiskNotRespondingTime, labels: { severity: 'critical', }, @@ -73,7 +73,7 @@ expr: ||| label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") ||| % $._config, - 'for': $._config.osdDiskAlertTime, + 'for': $._config.osdDiskUnavailableTime, labels: { severity: 'critical', }, diff --git a/alerts/state.libsonnet b/alerts/state.libsonnet index 5300187..acd0cf8 100644 --- a/alerts/state.libsonnet +++ b/alerts/state.libsonnet @@ -25,7 +25,7 @@ expr: ||| ceph_health_status{%(cephExporterSelector)s} == 1 ||| % $._config, - 'for': $._config.clusterStateAlertTime, + 'for': $._config.clusterWarningStateAlertTime, labels: { severity: 'warning', }, diff --git a/config.libsonnet b/config.libsonnet index 248732f..e7fb537 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -11,6 +11,7 @@ // Duration to raise various Alerts cephNodeDownAlertTime: '30s', clusterStateAlertTime: '10m', + clusterWarningStateAlertTime: '15m', clusterVersionAlertTime: '10m', clusterUtilizationAlertTime: '5s', clusterReadOnlyAlertTime: '0s', @@ -20,6 +21,8 @@ osdDataRebalanceAlertTime: '15s', osdDataRecoveryAlertTime: '2h', osdDataRecoveryInProgressAlertTime: '30s', + osdDiskNotRespondingTime: '15m', + osdDiskUnavailableTime: '1m', osdDiskAlertTime: '1m', osdDownAlertTime: '5m', osdFlapAlertTime: '0s', diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index cc4b370..336129c 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -150,7 +150,7 @@ spec: storage_type: ceph expr: | label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") - for: 1m + for: 15m labels: severity: critical - alert: CephOSDDiskUnavailable @@ -242,7 +242,7 @@ spec: storage_type: ceph expr: | ceph_health_status{job="rook-ceph-mgr"} == 1 - for: 10m + for: 15m labels: severity: warning - alert: CephOSDVersionMismatch