Skip to content

Commit

Permalink
monitoring: customize prometheus rule alerts
Browse files Browse the repository at this point in the history
Signed-off-by: Yuval Manor <yuvalman958@gmail.com>
  • Loading branch information
yuvalman committed Dec 30, 2021
1 parent 376ca62 commit 035be1b
Show file tree
Hide file tree
Showing 22 changed files with 698 additions and 184 deletions.
7 changes: 7 additions & 0 deletions Documentation/ceph-cluster-crd.md
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 +1372,13 @@ spec:
#externalMgrEndpoints:
#- ip: 192.168.39.182
#externalMgrPrometheusPort: 9283
#prometheusRule:
# alerts:
# cephMgrIsAbsent:
# for: 1m
# namespace: custom-namespace
# severityLevel: rook-severityLevel
# severity: rook-severity
```

Choose the namespace carefully, if you have an existing cluster managed by Rook, you have likely already injected `common.yaml`.
Expand Down
12 changes: 10 additions & 2 deletions Documentation/ceph-monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ kubectl create -f deploy/examples/monitoring/rbac.yaml
```

2. Make following changes to your CephCluster object (e.g., `cluster.yaml`).

3. Optional: use the `prometheusRule` field to override values of existing alerts (only applies when `monitoring.enabled: true`)
```YAML
apiVersion: ceph.rook.io/v1
kind: CephCluster
Expand All @@ -117,12 +117,20 @@ spec:
monitoring:
enabled: true
rulesNamespace: "rook-ceph"
# prometheusRule overrides selected values of the default Prometheus alert rules
# prometheusRule:
# alerts:
# cephMgrIsAbsent:
# for: 1m
# namespace: custom-namespace
# severityLevel: rook-severityLevel
# severity: rook-severity
[...]
```

(Where `rook-ceph` is the CephCluster name / namespace)

3. Deploy or update the CephCluster object.
4. Deploy or update the CephCluster object.

```console
kubectl apply -f cluster.yaml
Expand Down
3 changes: 3 additions & 0 deletions deploy/charts/rook-ceph-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ monitoring:
# enabling will also create RBAC rules to allow Operator to create ServiceMonitors
enabled: false
rulesNamespaceOverride:
# prometheus rule values for overwriting default prometheus rules values
# prometheusRule:


# If true, create & use PSP resources. Set this to the same value as the rook-ceph chart.
pspEnable: true
Expand Down
4 changes: 4 additions & 0 deletions deploy/charts/rook-ceph/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ spec:
- name: CSI_CEPHFS_PLUGIN_RESOURCE
value: {{ .Values.csi.csiCephFSPluginResource | quote }}
{{- end }}
{{- if .Values.monitoring.prometheusRule }}
- name: ROOK_CEPH_MONITORING_PROMETHEUS_RULE
value: {{ .Values.monitoring.prometheusRule | toYaml | quote }}
{{- end }}
{{- end }}
- name: ROOK_ENABLE_DISCOVERY_DAEMON
value: "{{ .Values.enableDiscoveryDaemon }}"
Expand Down
9 changes: 9 additions & 0 deletions deploy/charts/rook-ceph/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,12 @@ monitoring:
# requires Prometheus to be pre-installed
# enabling will also create RBAC rules to allow Operator to create ServiceMonitors
enabled: false
# prometheusRule overrides selected values of the default Prometheus alert rules
# prometheusRule:
# alerts:
# cephMgrIsAbsent:
# for: 1m
# namespace: custom-namespace
# severityLevel: rook-severityLevel
# severity: rook-severity

7 changes: 7 additions & 0 deletions deploy/examples/cluster-external.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,10 @@ spec:
# externalMgrEndpoints:
#- ip: ip
# externalMgrPrometheusPort: 9283
# prometheusRule:
# alerts:
# cephMgrIsAbsent:
# for: 1m
# namespace: custom-namespace
# severityLevel: rook-severityLevel
# severity: rook-severity
8 changes: 8 additions & 0 deletions deploy/examples/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ spec:
# If you have multiple rook-ceph clusters in the same k8s cluster, choose the same namespace (ideally, namespace with prometheus
# deployed) to set rulesNamespace for all the clusters. Otherwise, you will get duplicate alerts with multiple alert definitions.
rulesNamespace: rook-ceph
# prometheus rule values for overwriting default prometheus rules values
# prometheusRule:
# alerts:
# cephMgrIsAbsent:
# for: 1m
# namespace: custom-namespace
# severityLevel: rook-severityLevel
# severity: rook-severity
network:
# enable host networking
#provider: host
Expand Down

This file was deleted.

1 change: 0 additions & 1 deletion deploy/examples/monitoring/prometheus-ceph-v15-rules.yaml

This file was deleted.

This file was deleted.

1 change: 0 additions & 1 deletion deploy/examples/monitoring/prometheus-ceph-v16-rules.yaml

This file was deleted.

118 changes: 118 additions & 0 deletions deploy/examples/monitoring/prometheusrule-default-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
alerts:
cephMgrIsAbsent:
for: 5m
namespace: rook-ceph
severityLevel: critical
severity: critical
cephMgrIsMissingReplicas:
for: 5m
severityLevel: warning
severity: warning
cephMdsMissingReplicas:
for: 5m
severityLevel: warning
severity: warning
cephMonQuorumAtRisk:
for: 15m
severityLevel: error
severity: critical
cephMonQuorumLost:
for: 5m
severityLevel: critical
severity: critical
cephMonHighNumberOfLeaderChanges:
limit: 0.95
for: 5m
severityLevel: warning
severity: warning
cephNodeDown:
for: 30s
severityLevel: error
severity: critical
cephOSDCriticallyFull:
limit: 0.80
for: 40s
severityLevel: error
severity: critical
cephOSDFlapping:
limit: 5
osdUpRate: 5m
for: 0s
severityLevel: error
severity: critical
cephOSDNearFull:
limit: 0.75
for: 40s
severityLevel: warning
severity: warning
cephOSDDiskNotResponding:
for: 15m
severityLevel: error
severity: critical
cephOSDDiskUnavailable:
for: 1m
severityLevel: error
severity: critical
cephOSDSlowOps:
for: 30s
severityLevel: warning
severity: warning
cephDataRecoveryTakingTooLong:
for: 2h
severityLevel: warning
severity: warning
cephPGRepairTakingTooLong:
for: 1h
severityLevel: warning
severity: warning
PersistentVolumeUsageNearFull:
limit: 0.75
for: 5s
severityLevel: warning
severity: warning
PersistentVolumeUsageCritical:
limit: 0.85
for: 5s
severityLevel: error
severity: critical
cephClusterErrorState:
for: 10m
severityLevel: error
severity: critical
cephClusterWarningState:
for: 15m
severityLevel: warning
severity: warning
cephOSDVersionMismatch:
for: 10m
severityLevel: warning
severity: warning
cephMonVersionMismatch:
for: 10m
severityLevel: warning
severity: warning
cephClusterNearFull:
limit: 0.75
for: 5s
severityLevel: warning
severity: warning
cephClusterCriticallyFull:
limit: 0.80
for: 5s
severityLevel: error
severity: critical
cephClusterReadOnly:
limit: 0.85
for: 0s
severityLevel: error
severity: critical
cephPoolQuotaBytesNearExhaustion:
limit: 0.70
for: 1m
severityLevel: warning
severity: warning
cephPoolQuotaBytesCriticallyExhausted:
limit: 0.90
for: 1m
severityLevel: critical
severity: critical
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ require (
github.com/hashicorp/vault-plugin-secrets-kv v0.9.0
github.com/hashicorp/vault/api v1.1.2-0.20210713235431-1fc8af4c041f
github.com/hashicorp/vault/sdk v0.2.2-0.20210825150427-9b1f4d486f5d
github.com/imdario/mergo v0.3.12
github.com/k8snetworkplumbingwg/network-attachment-definition-client v1.1.0
github.com/kube-object-storage/lib-bucket-provisioner v0.0.0-20210907154902-775800a3d0b0
github.com/libopenstorage/secrets v0.0.0-20210709082113-dde442ea20ec
Expand Down
104 changes: 85 additions & 19 deletions pkg/operator/ceph/cluster/mgr/mgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,17 @@ limitations under the License.
package mgr

import (
"bytes"
_ "embed"
"fmt"
"github.com/imdario/mergo"
"k8s.io/apimachinery/pkg/util/yaml"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"text/template"

"github.com/banzaicloud/k8s-objectmatcher/patch"
"github.com/coreos/pkg/capnslog"
Expand All @@ -41,24 +48,26 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var logger = capnslog.NewPackageLogger("github.com/rook/rook", "op-mgr")
var (
logger = capnslog.NewPackageLogger("github.com/rook/rook", "op-mgr")
prometheusRuleName = "prometheus-ceph-vVERSION-rules"

var prometheusRuleName = "prometheus-ceph-vVERSION-rules"

// PrometheusExternalRuleName is the name of the prometheus external rule
var PrometheusExternalRuleName = "prometheus-ceph-vVERSION-rules-external"
// PrometheusExternalRuleName is the name of the prometheus external rule
PrometheusExternalRuleName = "prometheus-ceph-vVERSION-rules-external"
monitoringPath = "/etc/ceph-monitoring/"
)

const (
AppName = "rook-ceph-mgr"
serviceAccountName = "rook-ceph-mgr"
maxMgrCount = 2
PrometheusModuleName = "prometheus"
crashModuleName = "crash"
PgautoscalerModuleName = "pg_autoscaler"
balancerModuleName = "balancer"
balancerModuleMode = "upmap"
monitoringPath = "/etc/ceph-monitoring/"
serviceMonitorFile = "service-monitor.yaml"
AppName = "rook-ceph-mgr"
serviceAccountName = "rook-ceph-mgr"
maxMgrCount = 2
PrometheusModuleName = "prometheus"
crashModuleName = "crash"
PgautoscalerModuleName = "pg_autoscaler"
balancerModuleName = "balancer"
balancerModuleMode = "upmap"
serviceMonitorFile = "service-monitor.yaml"
defaultPrometheusRuleFile = "prometheusrule-default-values.yaml"
// minimum amount of memory in MB to run the pod
cephMgrPodMinimumMemory uint64 = 512
// DefaultMetricsPort prometheus exporter port
Expand Down Expand Up @@ -489,11 +498,13 @@ func (c *Cluster) EnableServiceMonitor(activeDaemon string) error {
func (c *Cluster) DeployPrometheusRule(name, namespace string) error {
version := strconv.Itoa(c.clusterInfo.CephVersion.Major)
name = strings.Replace(name, "VERSION", version, 1)
prometheusRuleFile := name + ".yaml"
prometheusRuleFile = path.Join(monitoringPath, prometheusRuleFile)
prometheusRule, err := k8sutil.GetPrometheusRule(prometheusRuleFile)
path := PrometheusRuleTemplatePath
if strings.Contains(name, "external") {
path = PrometheusRuleExternalTemplatePath
}
prometheusRule, err := templateToPrometheusRule(name, path)
if err != nil {
return errors.Wrap(err, "prometheus rule could not be deployed")
return errors.Wrap(err, "failed to template prometheus rule")
}
prometheusRule.SetName(name)
prometheusRule.SetNamespace(namespace)
Expand All @@ -508,6 +519,61 @@ func (c *Cluster) DeployPrometheusRule(name, namespace string) error {
return nil
}

func templateToPrometheusRule(name, templateData string) (*monitoringv1.PrometheusRule, error) {
var rule monitoringv1.PrometheusRule
customPrometheusRule, err := getComputeCustomizedPrometheus()
if err != nil {
return nil, errors.Wrap(err, "failed to compute customized prometheus rule")
}
t, err := loadTemplate(name, templateData, customPrometheusRule)
if err != nil {
return nil, errors.Wrap(err, "failed to load prometheus rule template")
}
err = yaml.Unmarshal(t, &rule)
if err != nil {
return nil, err
}
return &rule, nil
}

func loadTemplate(name, templateData string, p *PrometheusRuleCustomized) ([]byte, error) {
var writer bytes.Buffer
t := template.New(name)
t, err := t.Parse(templateData)
if err != nil {
return nil, errors.Wrapf(err, "failed to parse template %v", name)
}
err = t.Execute(&writer, p)
return writer.Bytes(), err
}

// getComputeCustomizedPrometheus compute PrometheusRuleCustomized by merging the data that was get from env with the default data
func getComputeCustomizedPrometheus() (*PrometheusRuleCustomized, error) {
var defaultPrometheusRuleVals PrometheusRuleCustomized
fi, err := os.Open(filepath.Clean(path.Join(monitoringPath, defaultPrometheusRuleFile)))
if err != nil {
return &PrometheusRuleCustomized{}, err
}
err = yaml.NewYAMLToJSONDecoder(fi).Decode(&defaultPrometheusRuleVals)
if err != nil {
return &PrometheusRuleCustomized{}, err
}
// merge resources from env with default values (if any)
if prometheusRuleTemplate := os.Getenv(rookMonitoringPrometheus); prometheusRuleTemplate != "" {
var overwritePrometheusRuleVals PrometheusRuleCustomized
err = yaml.Unmarshal([]byte(prometheusRuleTemplate), &overwritePrometheusRuleVals)
if err != nil {
return &PrometheusRuleCustomized{}, err
}
err = mergo.Merge(&overwritePrometheusRuleVals, defaultPrometheusRuleVals)
if err != nil {
return &PrometheusRuleCustomized{}, err
}
return &overwritePrometheusRuleVals, nil
}
return &defaultPrometheusRuleVals, nil
}

// IsModuleInSpec returns whether a module is present in the CephCluster manager spec
func IsModuleInSpec(modules []cephv1.Module, moduleName string) bool {
for _, v := range modules {
Expand Down

0 comments on commit 035be1b

Please sign in to comment.