Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mds: Skip sanity check during upgrades to 16.2.7 (backport #9418) #9448

Merged
merged 1 commit into from Dec 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
48 changes: 48 additions & 0 deletions pkg/operator/ceph/cluster/cluster.go
Expand Up @@ -97,6 +97,14 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep
}
c.ClusterInfo.SetName(c.namespacedName.Name)

// Execute actions before the monitors are up and running, if needed during upgrades.
// These actions would be skipped in a new cluster.
logger.Debug("monitors are about to reconcile, executing pre actions")
err = c.preMonStartupActions(cephVersion)
if err != nil {
return errors.Wrap(err, "failed to execute actions before reconciling the ceph monitors")
}

// Start the mon pods
controller.UpdateCondition(c.context, c.namespacedName, cephv1.ConditionProgressing, v1.ConditionTrue, cephv1.ClusterProgressingReason, "Configuring Ceph Mons")
clusterInfo, err := c.mons.Start(c.ClusterInfo, rookImage, cephVersion, *c.Spec)
Expand Down Expand Up @@ -545,6 +553,37 @@ func (c *cluster) replaceDefaultCrushMap(newRoot string) (err error) {
return nil
}

// preMonStartupActions is a collection of actions to run before the monitors are reconciled.
func (c *cluster) preMonStartupActions(cephVersion cephver.CephVersion) error {
// Disable the mds sanity checks for the mons due to a ceph upgrade issue
// for the mds to Pacific if 16.2.7 or greater. We keep it more general for any
// Pacific upgrade greater than 16.2.7 in case they skip updrading directly to 16.2.7.
if c.isUpgrade && cephVersion.IsPacific() && cephVersion.IsAtLeast(cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}) {
if err := c.skipMDSSanityChecks(true); err != nil {
// If there is an error, just print it and continue. Likely there is not a
// negative consequence of continuing since several complex conditions must exist to hit
// the upgrade issue where the sanity checks need to be disabled.
logger.Warningf("failed to disable the mon_mds_skip_sanity. %v", err)
}
}
return nil
}

func (c *cluster) skipMDSSanityChecks(skip bool) error {
// In a running cluster disable the mds skip sanity setting during upgrades.
monStore := config.GetMonStore(c.context, c.ClusterInfo)
if skip {
if err := monStore.Set("mon", "mon_mds_skip_sanity", "1"); err != nil {
return err
}
} else {
if err := monStore.Delete("mon", "mon_mds_skip_sanity"); err != nil {
return err
}
}
return nil
}

// postMonStartupActions is a collection of actions to run once the monitors are up and running
// It gets executed right after the main mon Start() method
// Basically, it is executed between the monitors and the manager sequence
Expand All @@ -566,6 +605,15 @@ func (c *cluster) postMonStartupActions() error {
return errors.Wrap(err, "failed to enable Ceph messenger version 2")
}

// Always ensure the skip mds sanity checks setting is cleared, for all Pacific deployments
if c.ClusterInfo.CephVersion.IsPacific() {
if err := c.skipMDSSanityChecks(false); err != nil {
// If there is an error, just print it and continue. We can just try again
// at the next reconcile.
logger.Warningf("failed to re-enable the mon_mds_skip_sanity. %v", err)
}
}

crushRoot := client.GetCrushRootFromSpec(c.Spec)
if crushRoot != "default" {
// Remove the root=default and replicated_rule which are created by
Expand Down
80 changes: 80 additions & 0 deletions pkg/operator/ceph/cluster/cluster_test.go
Expand Up @@ -19,10 +19,16 @@ package cluster

import (
"testing"
"time"

"github.com/pkg/errors"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
"github.com/rook/rook/pkg/clusterd"
cephclient "github.com/rook/rook/pkg/daemon/ceph/client"
cephver "github.com/rook/rook/pkg/operator/ceph/version"
testop "github.com/rook/rook/pkg/operator/test"
exectest "github.com/rook/rook/pkg/util/exec/test"
"github.com/stretchr/testify/assert"
)

func TestPreClusterStartValidation(t *testing.T) {
Expand Down Expand Up @@ -68,3 +74,77 @@ func TestPreClusterStartValidation(t *testing.T) {
})
}
}

func TestPreMonChecks(t *testing.T) {
executor := &exectest.MockExecutor{}
context := &clusterd.Context{Executor: executor}
setSkipSanity := false
unsetSkipSanity := false
executor.MockExecuteCommandWithTimeout = func(timeout time.Duration, command string, args ...string) (string, error) {
logger.Infof("Command: %s %v", command, args)
if args[0] == "config" {
if args[1] == "set" {
setSkipSanity = true
assert.Equal(t, "mon", args[2])
assert.Equal(t, "mon_mds_skip_sanity", args[3])
assert.Equal(t, "1", args[4])
return "", nil
}
if args[1] == "rm" {
unsetSkipSanity = true
assert.Equal(t, "mon", args[2])
assert.Equal(t, "mon_mds_skip_sanity", args[3])
return "", nil
}
}
return "", errors.Errorf("unexpected ceph command %q", args)
}
c := cluster{context: context, ClusterInfo: cephclient.AdminClusterInfo("cluster")}

t.Run("no upgrade", func(t *testing.T) {
v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
c.isUpgrade = false
err := c.preMonStartupActions(v)
assert.NoError(t, err)
assert.False(t, setSkipSanity)
assert.False(t, unsetSkipSanity)
})

t.Run("upgrade below version", func(t *testing.T) {
setSkipSanity = false
unsetSkipSanity = false
v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 6}
c.isUpgrade = true
err := c.preMonStartupActions(v)
assert.NoError(t, err)
assert.False(t, setSkipSanity)
assert.False(t, unsetSkipSanity)
})

t.Run("upgrade to applicable version", func(t *testing.T) {
setSkipSanity = false
unsetSkipSanity = false
v := cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
c.isUpgrade = true
err := c.preMonStartupActions(v)
assert.NoError(t, err)
assert.True(t, setSkipSanity)
assert.False(t, unsetSkipSanity)

// This will be called during the post mon checks
err = c.skipMDSSanityChecks(false)
assert.NoError(t, err)
assert.True(t, unsetSkipSanity)
})

t.Run("upgrade to quincy", func(t *testing.T) {
setSkipSanity = false
unsetSkipSanity = false
v := cephver.CephVersion{Major: 17, Minor: 2, Extra: 0}
c.isUpgrade = true
err := c.preMonStartupActions(v)
assert.NoError(t, err)
assert.False(t, setSkipSanity)
assert.False(t, unsetSkipSanity)
})
}