From 78feac0f4cfccb0d4da293371da3e6c872dacf38 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Tue, 28 Dec 2021 03:03:29 +0000 Subject: [PATCH] core: support priority class for crashcollector Support priorityClass for crashcollectors, as is already done for mons, mgrs, and osds. https://rook.io/docs/rook/v1.8/ceph-cluster-crd.html#priority-class-names-configuration-settings The main use case is applying a high priority to crashcollectors so that they can preempt normal pods under heavy load. Without this feature, we might lose crash information. Closes: https://github.com/rook/rook/issues/9500 Signed-off-by: Satoru Takeuchi --- Documentation/ceph-cluster-crd.md | 3 +- deploy/charts/rook-ceph-cluster/values.yaml | 1 + deploy/examples/cluster.yaml | 1 + pkg/apis/ceph.rook.io/v1/priorityclasses.go | 8 +++ .../ceph.rook.io/v1/priorityclasses_test.go | 10 +-- pkg/operator/ceph/cluster/crash/crash.go | 9 +-- pkg/operator/ceph/cluster/crash/crash_test.go | 63 +++++++++++++++++++ 7 files changed, 86 insertions(+), 9 deletions(-) diff --git a/Documentation/ceph-cluster-crd.md b/Documentation/ceph-cluster-crd.md index f72625754804b..97004cd6a932e 100755 --- a/Documentation/ceph-cluster-crd.md +++ b/Documentation/ceph-cluster-crd.md @@ -601,10 +601,11 @@ Priority class names can be specified so that the Rook components will have thos You can set priority class names for Rook components for the list of key value pairs: -* `all`: Set priority class names for MGRs, Mons, OSDs. +* `all`: Set priority class names for MGRs, Mons, OSDs, and crashcollectors. * `mgr`: Set priority class names for MGRs. * `mon`: Set priority class names for Mons. * `osd`: Set priority class names for OSDs. +* `crashcollector`: Set priority class names for crashcollectors. The specific component keys will act as overrides to `all`. 
diff --git a/deploy/charts/rook-ceph-cluster/values.yaml b/deploy/charts/rook-ceph-cluster/values.yaml index 1ca0664ecb4d1..743de21eac157 100644 --- a/deploy/charts/rook-ceph-cluster/values.yaml +++ b/deploy/charts/rook-ceph-cluster/values.yaml @@ -238,6 +238,7 @@ cephClusterSpec: # mon: rook-ceph-mon-priority-class # osd: rook-ceph-osd-priority-class # mgr: rook-ceph-mgr-priority-class + # crashcollector: rook-ceph-crashcollector-priority-class storage: # cluster level storage configuration and selection useAllNodes: true diff --git a/deploy/examples/cluster.yaml b/deploy/examples/cluster.yaml index 365908df34715..447dbdee299d5 100644 --- a/deploy/examples/cluster.yaml +++ b/deploy/examples/cluster.yaml @@ -207,6 +207,7 @@ spec: # mon: rook-ceph-mon-priority-class # osd: rook-ceph-osd-priority-class # mgr: rook-ceph-mgr-priority-class +# crashcollector: rook-ceph-crashcollector-priority-class storage: # cluster level storage configuration and selection useAllNodes: true useAllDevices: true diff --git a/pkg/apis/ceph.rook.io/v1/priorityclasses.go b/pkg/apis/ceph.rook.io/v1/priorityclasses.go index d60ed1acd8cfd..c984b852cf514 100644 --- a/pkg/apis/ceph.rook.io/v1/priorityclasses.go +++ b/pkg/apis/ceph.rook.io/v1/priorityclasses.go @@ -55,3 +55,11 @@ func GetCleanupPriorityClassName(p PriorityClassNamesSpec) string { } return p[KeyCleanup] } + +// GetCrashCollectorPriorityClassName returns the priority class name for the crashcollector +func GetCrashCollectorPriorityClassName(p PriorityClassNamesSpec) string { + if _, ok := p[KeyCrashCollector]; !ok { + return p.All() + } + return p[KeyCrashCollector] +} diff --git a/pkg/apis/ceph.rook.io/v1/priorityclasses_test.go b/pkg/apis/ceph.rook.io/v1/priorityclasses_test.go index a92e584bcf31c..cc2f018acd2b8 100644 --- a/pkg/apis/ceph.rook.io/v1/priorityclasses_test.go +++ b/pkg/apis/ceph.rook.io/v1/priorityclasses_test.go @@ -30,6 +30,7 @@ all: all-class mgr: mgr-class mon: mon-class osd: osd-class +crashcollector: 
crashcollector-class `) // convert the raw spec yaml into JSON @@ -43,10 +44,11 @@ osd: osd-class // the unmarshalled priority class names spec should equal the expected spec below expected := PriorityClassNamesSpec{ - "all": "all-class", - "mgr": "mgr-class", - "mon": "mon-class", - "osd": "osd-class", + "all": "all-class", + "mgr": "mgr-class", + "mon": "mon-class", + "osd": "osd-class", + "crashcollector": "crashcollector-class", } assert.Equal(t, expected, priorityClassNames) } diff --git a/pkg/operator/ceph/cluster/crash/crash.go b/pkg/operator/ceph/cluster/crash/crash.go index d5836a02f3697..b0db4acfc5539 100644 --- a/pkg/operator/ceph/cluster/crash/crash.go +++ b/pkg/operator/ceph/cluster/crash/crash.go @@ -113,10 +113,11 @@ func (r *ReconcileNode) createOrUpdateCephCrash(node corev1.Node, tolerations [] Containers: []corev1.Container{ getCrashDaemonContainer(cephCluster, *cephVersion), }, - Tolerations: tolerations, - RestartPolicy: corev1.RestartPolicyAlways, - HostNetwork: cephCluster.Spec.Network.IsHost(), - Volumes: volumes, + Tolerations: tolerations, + RestartPolicy: corev1.RestartPolicyAlways, + HostNetwork: cephCluster.Spec.Network.IsHost(), + Volumes: volumes, + PriorityClassName: cephv1.GetCrashCollectorPriorityClassName(cephCluster.Spec.PriorityClassNames), }, } diff --git a/pkg/operator/ceph/cluster/crash/crash_test.go b/pkg/operator/ceph/cluster/crash/crash_test.go index 3ead1f591adde..2fd74093e07d0 100644 --- a/pkg/operator/ceph/cluster/crash/crash_test.go +++ b/pkg/operator/ceph/cluster/crash/crash_test.go @@ -18,6 +18,7 @@ package crash import ( "context" + "fmt" "testing" cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" @@ -25,14 +26,18 @@ import ( "github.com/rook/rook/pkg/client/clientset/versioned/scheme" "github.com/rook/rook/pkg/clusterd" cephver "github.com/rook/rook/pkg/operator/ceph/version" + "github.com/rook/rook/pkg/operator/k8sutil" "github.com/rook/rook/pkg/operator/test" "github.com/stretchr/testify/assert" + appsv1 
"k8s.io/api/apps/v1" v1 "k8s.io/api/batch/v1" "k8s.io/api/batch/v1beta1" + corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" cntrlutil "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -105,3 +110,61 @@ func TestCreateOrUpdateCephCron(t *testing.T) { assert.Error(t, err) assert.True(t, kerrors.IsNotFound(err)) } + +func TestCreateOrUpdateCephCrash(t *testing.T) { + cephCluster := cephv1.CephCluster{ + ObjectMeta: metav1.ObjectMeta{Namespace: "rook-ceph"}, + } + cephCluster.Spec.Labels = cephv1.LabelsSpec{} + cephCluster.Spec.PriorityClassNames = cephv1.PriorityClassNamesSpec{} + cephVersion := &cephver.CephVersion{Major: 16, Minor: 2, Extra: 0} + ctx := context.TODO() + context := &clusterd.Context{ + Clientset: test.New(t, 1), + RookClientset: rookclient.NewSimpleClientset(), + } + + s := scheme.Scheme + err := appsv1.AddToScheme(s) + if err != nil { + assert.Fail(t, "failed to build scheme") + } + r := &ReconcileNode{ + scheme: s, + client: fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects().Build(), + context: context, + } + + node := corev1.Node{} + nodeSelector := map[string]string{corev1.LabelHostname: "testnode"} + node.SetLabels(nodeSelector) + tolerations := []corev1.Toleration{corev1.Toleration{}} + res, err := r.createOrUpdateCephCrash(node, tolerations, cephCluster, cephVersion) + assert.NoError(t, err) + assert.Equal(t, controllerutil.OperationResult("created"), res) + name := k8sutil.TruncateNodeName(fmt.Sprintf("%s-%%s", AppName), "testnode") + deploy := appsv1.Deployment{} + err = r.client.Get(ctx, types.NamespacedName{Namespace: "rook-ceph", Name: name}, &deploy) + assert.NoError(t, err) + podSpec := deploy.Spec.Template + assert.Equal(t, nodeSelector, podSpec.Spec.NodeSelector) + assert.Equal(t, "", 
podSpec.ObjectMeta.Labels["foo"]) + assert.Equal(t, tolerations, podSpec.Spec.Tolerations) + assert.Equal(t, false, podSpec.Spec.HostNetwork) + assert.Equal(t, "", podSpec.Spec.PriorityClassName) + + cephCluster.Spec.Labels[cephv1.KeyCrashCollector] = map[string]string{"foo": "bar"} + cephCluster.Spec.Network.HostNetwork = true + cephCluster.Spec.PriorityClassNames[cephv1.KeyCrashCollector] = "test-priority-class" + tolerations = []corev1.Toleration{corev1.Toleration{Key: "key", Operator: "Equal", Value: "value", Effect: "NoSchedule"}} + res, err = r.createOrUpdateCephCrash(node, tolerations, cephCluster, cephVersion) + assert.NoError(t, err) + assert.Equal(t, controllerutil.OperationResult("updated"), res) + err = r.client.Get(ctx, types.NamespacedName{Namespace: "rook-ceph", Name: name}, &deploy) + assert.NoError(t, err) + podSpec = deploy.Spec.Template + assert.Equal(t, "bar", podSpec.ObjectMeta.Labels["foo"]) + assert.Equal(t, tolerations, podSpec.Spec.Tolerations) + assert.Equal(t, true, podSpec.Spec.HostNetwork) + assert.Equal(t, "test-priority-class", podSpec.Spec.PriorityClassName) +}