From 0c19f62df275dd0b0668635ec4a318bb3fb1ea71 Mon Sep 17 00:00:00 2001 From: Travis Nielsen Date: Mon, 13 Dec 2021 17:51:25 -0700 Subject: [PATCH] mds: skip sanity check during upgrades to 16.2.7 The mons need to be configured to skip the mds sanity check during upgrade to 16.2.7 or the mons may crash during that upgrade. Signed-off-by: Travis Nielsen --- pkg/operator/ceph/cluster/cluster.go | 48 ++++++++++++++ pkg/operator/ceph/cluster/cluster_test.go | 80 +++++++++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/pkg/operator/ceph/cluster/cluster.go b/pkg/operator/ceph/cluster/cluster.go index 7ba602234d7b..6c6a7bdaa00d 100755 --- a/pkg/operator/ceph/cluster/cluster.go +++ b/pkg/operator/ceph/cluster/cluster.go @@ -92,6 +92,14 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep } c.ClusterInfo.SetName(c.namespacedName.Name) + // Execute actions before the monitors are up and running, if needed during upgrades. + // These actions would be skipped in a new cluster. + logger.Debug("monitors are about to reconcile, executing pre actions") + err = c.preMonStartupActions(cephVersion) + if err != nil { + return errors.Wrap(err, "failed to execute actions before reconciling the ceph monitors") + } + // Start the mon pods controller.UpdateCondition(c.ClusterInfo.Context, c.context, c.namespacedName, cephv1.ConditionProgressing, v1.ConditionTrue, cephv1.ClusterProgressingReason, "Configuring Ceph Mons") clusterInfo, err := c.mons.Start(c.ClusterInfo, rookImage, cephVersion, *c.Spec) @@ -443,6 +451,37 @@ func (c *cluster) replaceDefaultCrushMap(newRoot string) (err error) { return nil } +// preMonStartupActions is a collection of actions to run before the monitors are reconciled. +func (c *cluster) preMonStartupActions(cephVersion cephver.CephVersion) error { + // Disable the mds sanity checks for the mons due to a ceph upgrade issue + // for the mds to Pacific if 16.2.7 or greater. We keep it more general for any + // Pacific upgrade greater than 16.2.7 in case they skip updrading directly to 16.2.7. + if c.isUpgrade && cephVersion.IsPacific() && cephVersion.IsAtLeast(cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}) { + if err := c.skipMDSSanityChecks(true); err != nil { + // If there is an error, just print it and continue. Likely there is not a + // negative consequence of continuing since several complex conditions must exist to hit + // the upgrade issue where the sanity checks need to be disabled. + logger.Warningf("failed to disable the mon_mds_skip_sanity. %v", err) + } + } + return nil +} + +func (c *cluster) skipMDSSanityChecks(skip bool) error { + // In a running cluster disable the mds skip sanity setting during upgrades. + monStore := config.GetMonStore(c.context, c.ClusterInfo) + if skip { + if err := monStore.Set("mon", "mon_mds_skip_sanity", "1"); err != nil { + return err + } + } else { + if err := monStore.Delete("mon", "mon_mds_skip_sanity"); err != nil { + return err + } + } + return nil +} + // postMonStartupActions is a collection of actions to run once the monitors are up and running // It gets executed right after the main mon Start() method // Basically, it is executed between the monitors and the manager sequence @@ -464,6 +503,15 @@ func (c *cluster) postMonStartupActions() error { return errors.Wrap(err, "failed to enable Ceph messenger version 2") } + // Always ensure the skip mds sanity checks setting is cleared, for all Pacific deployments + if c.ClusterInfo.CephVersion.IsPacific() { + if err := c.skipMDSSanityChecks(false); err != nil { + // If there is an error, just print it and continue. We can just try again + // at the next reconcile. + logger.Warningf("failed to re-enable the mon_mds_skip_sanity. %v", err) + } + } + crushRoot := client.GetCrushRootFromSpec(c.Spec) if crushRoot != "default" { // Remove the root=default and replicated_rule which are created by diff --git a/pkg/operator/ceph/cluster/cluster_test.go b/pkg/operator/ceph/cluster/cluster_test.go index c60caab5f777..cdbef0528dd5 100644 --- a/pkg/operator/ceph/cluster/cluster_test.go +++ b/pkg/operator/ceph/cluster/cluster_test.go @@ -19,11 +19,17 @@ package cluster import ( "testing" + "time" + "github.com/pkg/errors" cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "github.com/rook/rook/pkg/clusterd" "github.com/rook/rook/pkg/daemon/ceph/client" + cephclient "github.com/rook/rook/pkg/daemon/ceph/client" + cephver "github.com/rook/rook/pkg/operator/ceph/version" testop "github.com/rook/rook/pkg/operator/test" + exectest "github.com/rook/rook/pkg/util/exec/test" + "github.com/stretchr/testify/assert" ) func TestPreClusterStartValidation(t *testing.T) { @@ -69,3 +75,77 @@ func TestPreClusterStartValidation(t *testing.T) { }) } } + +func TestPreMonChecks(t *testing.T) { + executor := &exectest.MockExecutor{} + context := &clusterd.Context{Executor: executor} + setSkipSanity := false + unsetSkipSanity := false + executor.MockExecuteCommandWithTimeout = func(timeout time.Duration, command string, args ...string) (string, error) { + logger.Infof("Command: %s %v", command, args) + if args[0] == "config" { + if args[1] == "set" { + setSkipSanity = true + assert.Equal(t, "mon", args[2]) + assert.Equal(t, "mon_mds_skip_sanity", args[3]) + assert.Equal(t, "1", args[4]) + return "", nil + } + if args[1] == "rm" { + unsetSkipSanity = true + assert.Equal(t, "mon", args[2]) + assert.Equal(t, "mon_mds_skip_sanity", args[3]) + return "", nil + } + } + return "", errors.Errorf("unexpected ceph command %q", args) + } + c := cluster{context: context, ClusterInfo: cephclient.AdminTestClusterInfo("cluster")} + + t.Run("no upgrade", func(t *testing.T) { + v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 7} + c.isUpgrade = false + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.False(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + }) + + t.Run("upgrade below version", func(t *testing.T) { + setSkipSanity = false + unsetSkipSanity = false + v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 6} + c.isUpgrade = true + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.False(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + }) + + t.Run("upgrade to applicable version", func(t *testing.T) { + setSkipSanity = false + unsetSkipSanity = false + v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 7} + c.isUpgrade = true + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.True(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + + // This will be called during the post mon checks + err = c.skipMDSSanityChecks(false) + assert.NoError(t, err) + assert.True(t, unsetSkipSanity) + }) + + t.Run("upgrade to quincy", func(t *testing.T) { + setSkipSanity = false + unsetSkipSanity = false + v := cephver.CephVersion{Major: 17, Minor: 2, Extra: 0} + c.isUpgrade = true + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.False(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + }) +}