diff --git a/pkg/operator/ceph/cluster/cluster.go b/pkg/operator/ceph/cluster/cluster.go index 7ba602234d7b3..ebc9b1c8295b6 100755 --- a/pkg/operator/ceph/cluster/cluster.go +++ b/pkg/operator/ceph/cluster/cluster.go @@ -92,6 +92,13 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep } c.ClusterInfo.SetName(c.namespacedName.Name) + // Execute actions before the monitors are up and running + logger.Debug("monitors are about to reconcile, executing pre actions") + err = c.preMonStartupActions(cephVersion) + if err != nil { + return errors.Wrap(err, "failed to execute actions before reconciling the ceph monitors") + } + // Start the mon pods controller.UpdateCondition(c.ClusterInfo.Context, c.context, c.namespacedName, cephv1.ConditionProgressing, v1.ConditionTrue, cephv1.ClusterProgressingReason, "Configuring Ceph Mons") clusterInfo, err := c.mons.Start(c.ClusterInfo, rookImage, cephVersion, *c.Spec) @@ -443,6 +450,36 @@ func (c *cluster) replaceDefaultCrushMap(newRoot string) (err error) { return nil } +// preMonStartupActions is a collection of actions to run before the monitors are reconciled. +func (c *cluster) preMonStartupActions(cephVersion cephver.CephVersion) error { + // Disable the mds sanity checks for the mons due to a ceph upgrade issue + // for the mds to Pacific if 16.2.7 or greater. + if c.isUpgrade && cephVersion.IsPacific() && cephVersion.IsAtLeast(cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}) { + if err := c.skipMDSSanityChecks(true); err != nil { + // If there is an error, just print it and continue. Likely there is not a + // negative consequence of continuing since several complex conditions must exist to hit + // the upgrade issue where the sanity checks need to be disabled. + logger.Warningf("failed to disable the mon_mds_skip_sanity. %v", err) + } + } + return nil +} + +func (c *cluster) skipMDSSanityChecks(skip bool) error { + // In a running cluster disable the mds skip sanity setting during upgrades. + monStore := config.GetMonStore(c.context, c.ClusterInfo) + if skip { + if err := monStore.Set("mon", "mon_mds_skip_sanity", "1"); err != nil { + return err + } + } else { + if err := monStore.Delete("mon", "mon_mds_skip_sanity"); err != nil { + return err + } + } + return nil +} + // postMonStartupActions is a collection of actions to run once the monitors are up and running // It gets executed right after the main mon Start() method // Basically, it is executed between the monitors and the manager sequence @@ -464,6 +501,15 @@ func (c *cluster) postMonStartupActions() error { return errors.Wrap(err, "failed to enable Ceph messenger version 2") } + // Always ensure the skip mds sanity checks setting is cleared, for all Pacific deployments + if c.ClusterInfo.CephVersion.IsPacific() { + if err := c.skipMDSSanityChecks(false); err != nil { + // If there is an error, just print it and continue. We can just try again + // at the next reconcile. + logger.Warningf("failed to re-enable the mon_mds_skip_sanity. %v", err) + } + } + crushRoot := client.GetCrushRootFromSpec(c.Spec) if crushRoot != "default" { // Remove the root=default and replicated_rule which are created by diff --git a/pkg/operator/ceph/cluster/cluster_test.go b/pkg/operator/ceph/cluster/cluster_test.go index c60caab5f7776..cdbef0528dd52 100644 --- a/pkg/operator/ceph/cluster/cluster_test.go +++ b/pkg/operator/ceph/cluster/cluster_test.go @@ -19,11 +19,17 @@ package cluster import ( "testing" + "time" + "github.com/pkg/errors" cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "github.com/rook/rook/pkg/clusterd" "github.com/rook/rook/pkg/daemon/ceph/client" + cephclient "github.com/rook/rook/pkg/daemon/ceph/client" + cephver "github.com/rook/rook/pkg/operator/ceph/version" testop "github.com/rook/rook/pkg/operator/test" + exectest "github.com/rook/rook/pkg/util/exec/test" + "github.com/stretchr/testify/assert" ) func TestPreClusterStartValidation(t *testing.T) { @@ -69,3 +75,77 @@ func TestPreClusterStartValidation(t *testing.T) { }) } } + +func TestPreMonChecks(t *testing.T) { + executor := &exectest.MockExecutor{} + context := &clusterd.Context{Executor: executor} + setSkipSanity := false + unsetSkipSanity := false + executor.MockExecuteCommandWithTimeout = func(timeout time.Duration, command string, args ...string) (string, error) { + logger.Infof("Command: %s %v", command, args) + if args[0] == "config" { + if args[1] == "set" { + setSkipSanity = true + assert.Equal(t, "mon", args[2]) + assert.Equal(t, "mon_mds_skip_sanity", args[3]) + assert.Equal(t, "1", args[4]) + return "", nil + } + if args[1] == "rm" { + unsetSkipSanity = true + assert.Equal(t, "mon", args[2]) + assert.Equal(t, "mon_mds_skip_sanity", args[3]) + return "", nil + } + } + return "", errors.Errorf("unexpected ceph command %q", args) + } + c := cluster{context: context, ClusterInfo: cephclient.AdminTestClusterInfo("cluster")} + + t.Run("no upgrade", func(t *testing.T) { + v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 7} + c.isUpgrade = false + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.False(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + }) + + t.Run("upgrade below version", func(t *testing.T) { + setSkipSanity = false + unsetSkipSanity = false + v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 6} + c.isUpgrade = true + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.False(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + }) + + t.Run("upgrade to applicable version", func(t *testing.T) { + setSkipSanity = false + unsetSkipSanity = false + v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 7} + c.isUpgrade = true + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.True(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + + // This will be called during the post mon checks + err = c.skipMDSSanityChecks(false) + assert.NoError(t, err) + assert.True(t, unsetSkipSanity) + }) + + t.Run("upgrade to quincy", func(t *testing.T) { + setSkipSanity = false + unsetSkipSanity = false + v := cephver.CephVersion{Major: 17, Minor: 2, Extra: 0} + c.isUpgrade = true + err := c.preMonStartupActions(v) + assert.NoError(t, err) + assert.False(t, setSkipSanity) + assert.False(t, unsetSkipSanity) + }) +}