monitoring: create the prometheus rules instead of rook
Rook no longer creates the prometheus rules based on the cephcluster
monitoring.enabled setting. The rules must now be created separately
from the cluster CR, as described in rook PR rook/rook#9837.
The rules are now fully owned downstream by the ocs operator, since
upstream they are only installed by the helm chart. This also gives
downstream full flexibility to update the rules only when QE determines
we are ready to test all the new rules.

Signed-off-by: Travis Nielsen <tnielsen@redhat.com>
travisn committed Apr 4, 2022
1 parent 06870b0 commit bacb136
Showing 5 changed files with 513 additions and 5 deletions.
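
To make the change concrete, here is a minimal sketch (not code from this commit) of what the commit message describes: the CephCluster MonitoringSpec keeps only Enabled, and the alert rules ship as a separate PrometheusRule object that the ocs operator creates itself. The package name, function name, and the rook import path are illustrative assumptions based on the packages referenced in the diff below.

// Sketch only: illustrates the spec-level change, not the operator's actual code.
package sketch

import (
	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	rookCephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" // assumed import path for the rook API
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func monitoringSpecAndRule(namespace string) (rookCephv1.MonitoringSpec, *monitoringv1.PrometheusRule) {
	// Previously the spec also set RulesNamespace: "openshift-storage"; now only
	// Enabled is set and rook no longer creates any rules from it.
	spec := rookCephv1.MonitoringSpec{Enabled: true}

	// The rules become a PrometheusRule CR owned by the operator, with Spec.Groups
	// decoded from the embedded localcephrules.yaml or externalcephrules.yaml.
	rule := &monitoringv1.PrometheusRule{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "prometheus-ceph-rules",
			Namespace: namespace,
		},
	}
	return spec, rule
}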
133 changes: 128 additions & 5 deletions controllers/storagecluster/cephcluster.go
@@ -1,7 +1,9 @@
package storagecluster

import (
"bytes"
"context"
_ "embed"
"fmt"
"os"
"reflect"
@@ -11,6 +13,8 @@ import (
"github.com/go-logr/logr"
v1 "github.com/openshift/api/config/v1"
objectreferencesv1 "github.com/openshift/custom-resource-status/objectreferences/v1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
monitoringclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned"
ocsv1 "github.com/red-hat-storage/ocs-operator/api/v1"
"github.com/red-hat-storage/ocs-operator/controllers/defaults"
statusutil "github.com/red-hat-storage/ocs-operator/controllers/util"
@@ -21,7 +25,9 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
k8sYAML "k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/apimachinery/pkg/version"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/tools/reference"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -62,8 +68,18 @@ const (

const (
// PriorityClasses for cephCluster
-systemNodeCritical = "system-node-critical"
-openshiftUserCritical = "openshift-user-critical"
+systemNodeCritical = "system-node-critical"
+openshiftUserCritical = "openshift-user-critical"
+prometheusLocalRuleName = "prometheus-ceph-rules"
+prometheusExternalRuleName = "prometheus-ceph-rules-external"
)

var (
//go:embed prometheus/externalcephrules.yaml
externalPrometheusRules string
//go:embed prometheus/localcephrules.yaml
localPrometheusRules string
testSkipPrometheusRules = false
)

func arbiterEnabled(sc *ocsv1.StorageCluster) bool {
@@ -273,6 +289,12 @@ func (obj *ocsCephCluster) ensureCreated(r *StorageClusterReconciler, sc *ocsv1.
}
}

// Create the prometheus rules if required by the cephcluster CR
if err := createPrometheusRules(r, sc, cephCluster); err != nil {
r.Log.Error(err, "Unable to create or update prometheus rules.", "CephCluster", klog.KRef(found.Namespace, found.Name))
return reconcile.Result{}, err
}

return reconcile.Result{}, nil
}

@@ -351,8 +373,7 @@ func newCephCluster(sc *ocsv1.StorageCluster, cephImage string, nodeCount int, s
SSL: sc.Spec.ManagedResources.CephDashboard.SSL,
},
Monitoring: rookCephv1.MonitoringSpec{
-Enabled: true,
-RulesNamespace: "openshift-storage",
+Enabled: true,
},
Storage: rookCephv1.StorageScopeSpec{
StorageClassDeviceSets: newStorageClassDeviceSets(sc, serverVersion),
@@ -477,7 +498,6 @@ func newExternalCephCluster(sc *ocsv1.StorageCluster, cephImage, monitoringIP, m

if monitoringIP != "" {
monitoringSpec.Enabled = true
-monitoringSpec.RulesNamespace = sc.Namespace
// replace any comma with space and collect all the non-empty items
monIPArr := parseMonitoringIPs(monitoringIP)
monitoringSpec.ExternalMgrEndpoints = make([]corev1.EndpointAddress, len(monIPArr))
@@ -914,3 +934,106 @@ func addStrictFailureDomainTSC(placement *rookCephv1.Placement, topologyKey stri

placement.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{newTSC, placement.TopologySpreadConstraints[0]}
}

// createPrometheusRules creates or updates the PrometheusRule resources that
// rook previously deployed when CephCluster monitoring was enabled.
func createPrometheusRules(r *StorageClusterReconciler, sc *ocsv1.StorageCluster, cluster *rookCephv1.CephCluster) error {
if !cluster.Spec.Monitoring.Enabled {
r.Log.Info("prometheus rules skipped", "CephCluster")
return nil
}
if testSkipPrometheusRules {
r.Log.Info("skipping prometheus rules in test")
return nil
}

rules := localPrometheusRules
name := prometheusLocalRuleName
if cluster.Spec.External.Enable {
rules = externalPrometheusRules
name = prometheusExternalRuleName
}
prometheusRule, err := parsePrometheusRule(rules)
if err != nil {
r.Log.Error(err, "Unable to retrieve prometheus rules.", "CephCluster")
return err
}
prometheusRule.SetName(name)
prometheusRule.SetNamespace(sc.Namespace)
if err := controllerutil.SetControllerReference(sc, prometheusRule, r.Scheme); err != nil {
r.Log.Error(err, "Unable to set controller reference for prometheus rules.", "CephCluster")
return err
}
applyLabels(getCephClusterMonitoringLabels(*sc), &prometheusRule.ObjectMeta)

if err := createOrUpdatePrometheusRule(r, sc, prometheusRule); err != nil {
r.Log.Error(err, "Prometheus rules could not be created.", "CephCluster")
return err
}

r.Log.Info("prometheus rules deployed", "CephCluster")

return nil
}

// applyLabels adds labels to the object meta, overwriting keys that are already defined.
func applyLabels(labels map[string]string, t *metav1.ObjectMeta) {
if t.Labels == nil {
t.Labels = map[string]string{}
}
for k, v := range labels {
t.Labels[k] = v
}
}

// parsePrometheusRule decodes the given prometheus rules YAML into a PrometheusRule object, or returns an error.
func parsePrometheusRule(rules string) (*monitoringv1.PrometheusRule, error) {
var rule monitoringv1.PrometheusRule
err := k8sYAML.NewYAMLOrJSONDecoder(bytes.NewBufferString(rules), 1000).Decode(&rule)
if err != nil {
return nil, fmt.Errorf("prometheusRules could not be decoded. %v", err)
}
return &rule, nil
}

// createOrUpdatePrometheusRule creates the PrometheusRule object, or updates it if it already exists.
func createOrUpdatePrometheusRule(r *StorageClusterReconciler, sc *ocsv1.StorageCluster, prometheusRule *monitoringv1.PrometheusRule) error {
name := prometheusRule.GetName()
namespace := prometheusRule.GetNamespace()
client, err := getMonitoringClient()
if err != nil {
return fmt.Errorf("failed to get monitoring client. %v", err)
}
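// Attempt to create the PrometheusRule first; if it already exists, fetch the
// current object and update its spec and labels in place.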
_, err = client.MonitoringV1().PrometheusRules(namespace).Create(context.TODO(), prometheusRule, metav1.CreateOptions{})
if err != nil {
if !errors.IsAlreadyExists(err) {
return fmt.Errorf("failed to create prometheusRules. %v", err)
}
// Get current PrometheusRule so the ResourceVersion can be set as needed
// for the object update operation
promRule, err := client.MonitoringV1().PrometheusRules(namespace).Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get prometheusRule object. %v", err)
}
promRule.Spec = prometheusRule.Spec
promRule.ObjectMeta.Labels = prometheusRule.ObjectMeta.Labels
_, err = client.MonitoringV1().PrometheusRules(namespace).Update(context.TODO(), promRule, metav1.UpdateOptions{})
if err != nil {
r.Log.Error(err, "failed to update prometheus rules.", "CephCluster")
return err
}
}
return nil
}

func getMonitoringClient() (*monitoringclient.Clientset, error) {
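// With an empty master URL and kubeconfig path, BuildConfigFromFlags falls back
// to the in-cluster config (and, failing that, to the default client config),
// which is the expected path when the operator runs in a pod.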
cfg, err := clientcmd.BuildConfigFromFlags("", "")
if err != nil {
return nil, fmt.Errorf("failed to build config foo. %v", err)
}
client, err := monitoringclient.NewForConfig(cfg)
if err != nil {
return nil, fmt.Errorf("failed to get monitoring client bar. %v", err)
}
return client, nil
}
11 changes: 11 additions & 0 deletions controllers/storagecluster/cephcluster_test.go
@@ -25,6 +25,7 @@ import (

func TestEnsureCephCluster(t *testing.T) {
// cases for testing
testSkipPrometheusRules = true
cases := []struct {
label string
shouldCreate bool
@@ -771,6 +772,16 @@ func TestNewCephDaemonResources(t *testing.T) {
}
}

func TestParsePrometheusRules(t *testing.T) {
prometheusRules, err := parsePrometheusRule(localPrometheusRules)
assert.NilError(t, err)
assert.Equal(t, 11, len(prometheusRules.Spec.Groups))

prometheusRules, err = parsePrometheusRule(externalPrometheusRules)
assert.NilError(t, err)
assert.Equal(t, 1, len(prometheusRules.Spec.Groups))
}

func TestGetNetworkSpec(t *testing.T) {
testTable := []struct {
desc string
34 changes: 34 additions & 0 deletions controllers/storagecluster/prometheus/externalcephrules.yaml
@@ -0,0 +1,34 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: rook-prometheus
role: alert-rules
name: prometheus-ceph-rules
namespace: rook-ceph
spec:
groups:
- name: persistent-volume-alert.rules
rules:
- alert: PersistentVolumeUsageNearFull
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
severity_level: warning
storage_type: ceph
expr: |
(kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
for: 5s
labels:
severity: warning
- alert: PersistentVolumeUsageCritical
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
severity_level: error
storage_type: ceph
expr: |
(kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
for: 5s
labels:
severity: critical
