All alert interval changes #94

Merged: 5 commits, Feb 2, 2022
alerts/absent_alerts.libsonnet (3 additions, 3 deletions)
@@ -7,7 +7,7 @@
{
alert: 'CephMgrIsAbsent',
expr: |||
- absent(up{%(cephExporterSelector)s} == 1)
+ up{%(cephExporterSelector)s} == 0
||| % $._config,
'for': $._config.mgrIsAbsentAlertTime,
labels: {
@@ -23,7 +23,7 @@
{
alert: 'CephMgrIsMissingReplicas',
expr: |||
- sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d
+ sum(up{%(cephExporterSelector)s}) by (namespace) < %(cephMgrCount)d
||| % $._config,
'for': $._config.mgrMissingReplicasAlertTime,
labels: {
@@ -44,7 +44,7 @@
{
alert: 'CephMdsMissingReplicas',
expr: |||
- sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) < %(cephMdsCount)d
+ sum(ceph_mds_metadata{%(cephExporterSelector)s} == 1) by (namespace) < %(cephMdsCount)d
||| % $._config,
'for': $._config.mdsMissingReplicasAlertTime,
labels: {
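A note on the CephMgrIsAbsent expression change: the old form, absent(up{...} == 1), fires both when the mgr target is scraped as down and when the up series is missing entirely, but absent() only carries labels taken from plain equality matchers, so it cannot expose a namespace label. The new form, up{...} == 0, fires only while the target still exists in the scrape configuration and is reported as down, and it keeps the labels of the up series (including namespace) on the resulting alert, matching the per-namespace grouping added to the other alerts in this PR.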
alerts/monquorum.libsonnet (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
{
alert: 'CephMonQuorumAtRisk',
expr: |||
- count(ceph_mon_quorum_status{%s} == 1) <= (floor(count(ceph_mon_metadata{%s}) / 2) + 1)
+ count(ceph_mon_quorum_status{%s} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{%s}) by (namespace) / 2) + 1)
||| % [$._config.cephExporterSelector, $._config.cephExporterSelector],
'for': $._config.monQuorumAlertTime,
labels: {
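As a worked example of the quorum threshold: for a namespace whose cluster runs 3 monitors, floor(3 / 2) + 1 = 2, so CephMonQuorumAtRisk fires once 2 or fewer monitors in that namespace report being in quorum; with 5 monitors the threshold is floor(5 / 2) + 1 = 3. The added by (namespace) grouping means each cluster is measured against its own monitor count rather than a total across all namespaces.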
alerts/osd.libsonnet (2 additions, 2 deletions)
@@ -57,7 +57,7 @@
expr: |||
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
||| % $._config,
- 'for': $._config.osdDiskAlertTime,
+ 'for': $._config.osdDiskNotRespondingTime,
labels: {
severity: 'critical',
},
@@ -73,7 +73,7 @@
expr: |||
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
||| % $._config,
- 'for': $._config.osdDiskAlertTime,
+ 'for': $._config.osdDiskUnavailableTime,
labels: {
severity: 'critical',
},
alerts/state.libsonnet (2 additions, 2 deletions)
@@ -25,7 +25,7 @@
expr: |||
ceph_health_status{%(cephExporterSelector)s} == 1
||| % $._config,
- 'for': $._config.clusterStateAlertTime,
+ 'for': $._config.clusterWarningStateAlertTime,
labels: {
severity: 'warning',
},
@@ -39,7 +39,7 @@
{
alert: 'CephOSDVersionMismatch',
expr: |||
- count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version)) > 1
+ count(count(ceph_osd_metadata{%(cephExporterSelector)s}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1
||| % $._config,
'for': $._config.clusterVersionAlertTime,
labels: {
config.libsonnet (3 additions, 0 deletions)
@@ -11,6 +11,7 @@
// Duration to raise various Alerts
cephNodeDownAlertTime: '30s',
clusterStateAlertTime: '10m',
+ clusterWarningStateAlertTime: '15m',
clusterVersionAlertTime: '10m',
clusterUtilizationAlertTime: '5s',
clusterReadOnlyAlertTime: '0s',
@@ -20,6 +21,8 @@
osdDataRebalanceAlertTime: '15s',
osdDataRecoveryAlertTime: '2h',
osdDataRecoveryInProgressAlertTime: '30s',
+ osdDiskNotRespondingTime: '15m',
+ osdDiskUnavailableTime: '1m',
osdDiskAlertTime: '1m',
osdDownAlertTime: '5m',
osdFlapAlertTime: '0s',
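The new durations follow the existing $._config pattern, so they can be tuned downstream in the same way as the other *AlertTime values. A minimal override sketch, assuming the mixin is consumed through a top-level libsonnet entry point (the import path and file name below are hypothetical):

    local cephMixin = (import 'ceph-mixins/mixin.libsonnet') + {
      _config+:: {
        // Hypothetical overrides; the defaults added in this PR are
        // '15m', '15m' and '1m' respectively.
        clusterWarningStateAlertTime: '30m',
        osdDiskNotRespondingTime: '30m',
        osdDiskUnavailableTime: '5m',
      },
    };

    // The alert definitions in alerts/*.libsonnet read these fields via
    // $._config, so the overridden durations flow into the rendered rules.
    cephMixin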
extras/manifests/prometheus-ceph-rules.yaml (8 additions, 8 deletions)
@@ -11,7 +11,7 @@ spec:
- name: ceph.rules
rules:
- expr: |
- kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
+ kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace)
record: cluster:ceph_node_down:join_kube
- expr: |
avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
@@ -42,7 +42,7 @@
severity_level: critical
storage_type: ceph
expr: |
- absent(up{job="rook-ceph-mgr"} == 1)
+ up{job="rook-ceph-mgr"} == 0
for: 5m
labels:
severity: critical
@@ -53,7 +53,7 @@
severity_level: warning
storage_type: ceph
expr: |
- sum(up{job="rook-ceph-mgr"}) < 1
+ sum(up{job="rook-ceph-mgr"}) by (namespace) < 1
for: 5m
labels:
severity: warning
@@ -66,7 +66,7 @@
severity_level: warning
storage_type: ceph
expr: |
- sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
+ sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
for: 5m
labels:
severity: warning
@@ -79,7 +79,7 @@
severity_level: error
storage_type: ceph
expr: |
- count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) / 2) + 1)
+ count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
for: 15m
labels:
severity: critical
@@ -150,7 +150,7 @@
storage_type: ceph
expr: |
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
- for: 1m
+ for: 15m
labels:
severity: critical
- alert: CephOSDDiskUnavailable
@@ -242,7 +242,7 @@
storage_type: ceph
expr: |
ceph_health_status{job="rook-ceph-mgr"} == 1
- for: 10m
+ for: 15m
labels:
severity: warning
- alert: CephOSDVersionMismatch
@@ -252,7 +252,7 @@
severity_level: warning
storage_type: ceph
expr: |
- count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
+ count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1
for: 10m
labels:
severity: warning
rules/rules.libsonnet (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
{
record: 'cluster:ceph_node_down:join_kube',
expr: |||
- kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node)
+ kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{%(cephExporterSelector)s},"node","$1","exported_instance","(.*)")) by (node, namespace)
||| % $._config,
},
{
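The edits to extras/manifests/prometheus-ceph-rules.yaml mirror the libsonnet changes line for line, which suggests the manifest is generated from the jsonnet sources rather than edited by hand. A regeneration sketch using only the plain jsonnet CLI; the entry-point file name and the prometheusAlerts field are assumptions, not something this PR defines:

    // render-rules.jsonnet -- hypothetical helper, not part of this PR
    local mixin = import 'mixin.libsonnet';

    {
      // std.manifestYamlDoc serialises a jsonnet value as a YAML string;
      // running `jsonnet -S -m extras/manifests render-rules.jsonnet`
      // would then write the file below.
      'prometheus-ceph-rules.yaml': std.manifestYamlDoc({
        apiVersion: 'monitoring.coreos.com/v1',
        kind: 'PrometheusRule',
        metadata: { name: 'prometheus-ceph-rules' },
        // Assumed field name; the recording rules from rules/rules.libsonnet
        // may need to be merged in as well, e.g. via mixin.prometheusRules.groups.
        spec: mixin.prometheusAlerts,
      }),
    }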