Skip to content

Commit

Permalink
Creating a new 'CephMonQuorumLost' alert
Browse files Browse the repository at this point in the history
CephMonQuorumLost is a critical alert that fires when the Ceph monitor
quorum is completely lost and in an unretrievable state.

Signed-off-by: Arun Kumar Mohan <amohan@redhat.com>
  • Loading branch information
aruniiird committed Oct 14, 2021
1 parent a5fa42d commit a90f521
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 0 deletions.
16 changes: 16 additions & 0 deletions alerts/monquorum.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@
severity_level: 'error',
},
},
// CephMonQuorumLost: critical alert raised when, within a namespace, the
// number of kube_pod_info series whose created_by_name matches
// "rook-ceph-mon-.*" is below 2 for the whole of
// $._config.monQuorumLostTime.
// NOTE(review): count() over an empty selection returns no series, so this
// rule presumably stays silent when no matching pods are reported at all --
// confirm whether that case needs to be covered.
// NOTE(review): the threshold of 2 is hard-coded; presumably it assumes a
// three-monitor deployment -- confirm for other monitor counts.
{
alert: 'CephMonQuorumLost',
expr: |||
count(kube_pod_info{created_by_name=~"rook-ceph-mon-.*"}) by (namespace) < 2
|||,
// Time the expression must stay true before the alert fires.
'for': $._config.monQuorumLostTime,
labels: {
severity: 'critical',
},
annotations: {
message: 'Storage quorum is lost',
description: 'Storage cluster quorum is lost. Contact Support.',
storage_type: $._config.storageType,
severity_level: 'critical',
},
},
{
alert: 'CephMonHighNumberOfLeaderChanges',
expr: |||
Expand Down
1 change: 1 addition & 0 deletions config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
clusterReadOnlyAlertTime: '0s',
poolQuotaUtilizationAlertTime: '1m',
monQuorumAlertTime: '15m',
monQuorumLostTime: '5m',
monQuorumLeaderChangesAlertTime: '5m',
osdDataRebalanceAlertTime: '15s',
osdDataRecoveryAlertTime: '2h',
Expand Down
11 changes: 11 additions & 0 deletions extras/manifests/prometheus-ceph-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,17 @@ spec:
for: 15m
labels:
severity: critical
# CephMonQuorumLost: critical rule that fires when, within a namespace, the
# number of kube_pod_info series whose created_by_name matches
# "rook-ceph-mon-.*" has been below 2 for 5m.
# NOTE(review): this entry mirrors the CephMonQuorumLost definition in
# alerts/monquorum.libsonnet; presumably this manifest is generated from it,
# so change the source rather than this file -- confirm.
# NOTE(review): count() over an empty selection returns no series, so the
# rule presumably does not fire when no matching pods are reported -- confirm.
- alert: CephMonQuorumLost
annotations:
description: Storage cluster quorum is lost. Contact Support.
message: Storage quorum is lost
severity_level: critical
storage_type: ceph
expr: |
count(kube_pod_info{created_by_name=~"rook-ceph-mon-.*"}) by (namespace) < 2
for: 5m
labels:
severity: critical
- alert: CephMonHighNumberOfLeaderChanges
annotations:
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
Expand Down

0 comments on commit a90f521

Please sign in to comment.