Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing the queries for alerts 'CephMgrIsAbsent' and 'CephMgrIsMissingReplicas' #96

Merged: 7 commits were merged on Feb 2, 2022.
6 changes: 3 additions & 3 deletions alerts/absent_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
alert: 'CephMgrIsAbsent',
expr: |||
absent(up{%(cephExporterSelector)s} == 1)
label_replace((up{%(cephExporterSelector)s} == 0 or absent(up{%(cephExporterSelector)s})), "namespace", "openshift-storage", "", "")
||| % $._config,
'for': $._config.mgrIsAbsentAlertTime,
labels: {
Expand All @@ -23,7 +23,7 @@
{
alert: 'CephMgrIsMissingReplicas',
expr: |||
sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d
sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < %(cephMgrCount)d
||| % $._config,
'for': $._config.mgrMissingReplicasAlertTime,
labels: {
Expand All @@ -44,7 +44,7 @@
{
alert: 'CephMdsMissingReplicas',
expr: |||
sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) < %(cephMdsCount)d
sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) by (namespace) < %(cephMdsCount)d
||| % $._config,
'for': $._config.mdsMissingReplicasAlertTime,
labels: {
Expand Down
2 changes: 1 addition & 1 deletion alerts/monquorum.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
alert: 'CephMonQuorumAtRisk',
expr: |||
count(ceph_mon_quorum_status{%s} == 1) <= (floor(count(ceph_mon_metadata{%s}) / 2) + 1)
count(ceph_mon_quorum_status{%s} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{%s}) by (namespace) / 2) + 1)
||| % [$._config.cephExporterSelector, $._config.cephExporterSelector],
'for': $._config.monQuorumAlertTime,
labels: {
Expand Down
4 changes: 2 additions & 2 deletions alerts/osd.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
expr: |||
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
||| % $._config,
'for': $._config.osdDiskAlertTime,
'for': $._config.osdDiskNotRespondingTime,
labels: {
severity: 'critical',
},
Expand All @@ -73,7 +73,7 @@
expr: |||
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
||| % $._config,
'for': $._config.osdDiskAlertTime,
'for': $._config.osdDiskUnavailableTime,
labels: {
severity: 'critical',
},
Expand Down
6 changes: 3 additions & 3 deletions alerts/state.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
expr: |||
ceph_health_status{%(cephExporterSelector)s} == 1
||| % $._config,
'for': $._config.clusterStateAlertTime,
'for': $._config.clusterWarningStateAlertTime,
labels: {
severity: 'warning',
},
Expand All @@ -39,7 +39,7 @@
{
alert: 'CephOSDVersionMismatch',
expr: |||
count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version)) > 1
count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1
||| % $._config,
'for': $._config.clusterVersionAlertTime,
labels: {
Expand All @@ -55,7 +55,7 @@
{
alert: 'CephMonVersionMismatch',
expr: |||
count(count(ceph_mon_metadata{%(cephExporterSelector)s}) by (ceph_version)) > 1
count(count(ceph_mon_metadata{%(cephExporterSelector)s, ceph_version != ""}) by (ceph_version)) > 1
||| % $._config,
'for': $._config.clusterVersionAlertTime,
labels: {
Expand Down
3 changes: 3 additions & 0 deletions config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
// Duration to raise various Alerts
cephNodeDownAlertTime: '30s',
clusterStateAlertTime: '10m',
clusterWarningStateAlertTime: '15m',
clusterVersionAlertTime: '10m',
clusterUtilizationAlertTime: '5s',
clusterReadOnlyAlertTime: '0s',
Expand All @@ -20,6 +21,8 @@
osdDataRebalanceAlertTime: '15s',
osdDataRecoveryAlertTime: '2h',
osdDataRecoveryInProgressAlertTime: '30s',
osdDiskNotRespondingTime: '15m',
osdDiskUnavailableTime: '1m',
osdDiskAlertTime: '1m',
osdDownAlertTime: '5m',
osdFlapAlertTime: '0s',
Expand Down
18 changes: 9 additions & 9 deletions extras/manifests/prometheus-ceph-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
- name: ceph.rules
rules:
- expr: |
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace)
record: cluster:ceph_node_down:join_kube
- expr: |
avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
Expand Down Expand Up @@ -42,7 +42,7 @@ spec:
severity_level: critical
storage_type: ceph
expr: |
absent(up{job="rook-ceph-mgr"} == 1)
label_replace((up{job="rook-ceph-mgr"} == 0 or absent(up{job="rook-ceph-mgr"})), "namespace", "openshift-storage", "", "")
for: 5m
labels:
severity: critical
Expand All @@ -53,7 +53,7 @@ spec:
severity_level: warning
storage_type: ceph
expr: |
sum(up{job="rook-ceph-mgr"}) < 1
sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < 1
for: 5m
labels:
severity: warning
Expand All @@ -66,7 +66,7 @@ spec:
severity_level: warning
storage_type: ceph
expr: |
sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
for: 5m
labels:
severity: warning
Expand All @@ -79,7 +79,7 @@ spec:
severity_level: error
storage_type: ceph
expr: |
count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) / 2) + 1)
count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
for: 15m
labels:
severity: critical
Expand Down Expand Up @@ -150,7 +150,7 @@ spec:
storage_type: ceph
expr: |
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
for: 1m
for: 15m
labels:
severity: critical
- alert: CephOSDDiskUnavailable
Expand Down Expand Up @@ -242,7 +242,7 @@ spec:
storage_type: ceph
expr: |
ceph_health_status{job="rook-ceph-mgr"} == 1
for: 10m
for: 15m
labels:
severity: warning
- alert: CephOSDVersionMismatch
Expand All @@ -252,7 +252,7 @@ spec:
severity_level: warning
storage_type: ceph
expr: |
count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1
for: 10m
labels:
severity: warning
Expand All @@ -263,7 +263,7 @@ spec:
severity_level: warning
storage_type: ceph
expr: |
count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
count(count(ceph_mon_metadata{job="rook-ceph-mgr", ceph_version != ""}) by (ceph_version)) > 1
for: 10m
labels:
severity: warning
Expand Down
2 changes: 1 addition & 1 deletion rules/rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
record: 'cluster:ceph_node_down:join_kube',
expr: |||
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node)
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node, namespace)
||| % $._config,
},
{
Expand Down