mds: change init sequence
The MDS core team suggested deploying the MDS daemon first and then doing
the filesystem creation and configuration. Reversing the sequence lets
us avoid spurious FS_DOWN warnings when creating the filesystem.

Closes: #8745
Signed-off-by: Sébastien Han <seb@redhat.com>
leseb committed Sep 20, 2021
1 parent acb64e9 commit 2bbb420
Showing 4 changed files with 22 additions and 34 deletions.
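The gist of the reorder, as a minimal standalone Go sketch. The functions below are stubs standing in for the operator steps this commit moves around in pkg/operator/ceph/file; they are placeholders, not the real rook helpers.

```go
package main

import "log"

// Stubs standing in for the operator steps this commit reorders.
func startMDSDaemons() error      { log.Println("MDS deployments started"); return nil }
func createCephFilesystem() error { log.Println("filesystem created"); return nil }
func configureFilesystem() error  { log.Println("standby-replay and active count set"); return nil }

func main() {
	// New order: bring the MDS daemons up first, then create the filesystem,
	// then apply filesystem-level settings. Because the daemons already exist
	// when the filesystem is created, Ceph has MDS ranks available immediately
	// and does not raise a spurious "filesystem is down" warning.
	if err := startMDSDaemons(); err != nil {
		log.Fatal(err)
	}
	if err := createCephFilesystem(); err != nil {
		log.Fatal(err)
	}
	if err := configureFilesystem(); err != nil {
		log.Fatal(err)
	}
}
```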
36 changes: 10 additions & 26 deletions pkg/operator/ceph/file/filesystem.go
@@ -18,10 +18,8 @@ package file

import (
"fmt"
"syscall"

"github.com/rook/rook/pkg/operator/k8sutil"
"github.com/rook/rook/pkg/util/exec"

"github.com/pkg/errors"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
@@ -51,37 +49,31 @@ func createFilesystem(
ownerInfo *k8sutil.OwnerInfo,
dataDirHostPath string,
) error {
logger.Infof("start running mdses for filesystem %q", fs.Name)
c := mds.NewCluster(clusterInfo, context, clusterSpec, fs, ownerInfo, dataDirHostPath)
if err := c.Start(); err != nil {
return err
}

if len(fs.Spec.DataPools) != 0 {
f := newFS(fs.Name, fs.Namespace)
if err := f.doFilesystemCreate(context, clusterInfo, clusterSpec, fs.Spec); err != nil {
return errors.Wrapf(err, "failed to create filesystem %q", fs.Name)
}
}

filesystem, err := cephclient.GetFilesystem(context, clusterInfo, fs.Name)
if err != nil {
return errors.Wrapf(err, "failed to get filesystem %q", fs.Name)
}

if fs.Spec.MetadataServer.ActiveStandby {
if err = cephclient.AllowStandbyReplay(context, clusterInfo, fs.Name, fs.Spec.MetadataServer.ActiveStandby); err != nil {
if err := cephclient.AllowStandbyReplay(context, clusterInfo, fs.Name, fs.Spec.MetadataServer.ActiveStandby); err != nil {
return errors.Wrapf(err, "failed to set allow_standby_replay to filesystem %q", fs.Name)
}
}

// set the number of active mds instances
if fs.Spec.MetadataServer.ActiveCount > 1 {
if err = cephclient.SetNumMDSRanks(context, clusterInfo, fs.Name, fs.Spec.MetadataServer.ActiveCount); err != nil {
if err := cephclient.SetNumMDSRanks(context, clusterInfo, fs.Name, fs.Spec.MetadataServer.ActiveCount); err != nil {
logger.Warningf("failed setting active mds count to %d. %v", fs.Spec.MetadataServer.ActiveCount, err)
}
}

logger.Infof("start running mdses for filesystem %q", fs.Name)
c := mds.NewCluster(clusterInfo, context, clusterSpec, fs, filesystem, ownerInfo, dataDirHostPath)
if err := c.Start(); err != nil {
return err
}

return nil
}

@@ -94,23 +86,15 @@ func deleteFilesystem(
ownerInfo *k8sutil.OwnerInfo,
dataDirHostPath string,
) error {
filesystem, err := cephclient.GetFilesystem(context, clusterInfo, fs.Name)
if err != nil {
if code, ok := exec.ExitStatus(err); ok && code == int(syscall.ENOENT) {
// If we're deleting the filesystem anyway, ignore the error that the filesystem doesn't exist
return nil
}
return errors.Wrapf(err, "failed to get filesystem %q", fs.Name)
}
c := mds.NewCluster(clusterInfo, context, clusterSpec, fs, filesystem, ownerInfo, dataDirHostPath)
c := mds.NewCluster(clusterInfo, context, clusterSpec, fs, ownerInfo, dataDirHostPath)

// Delete mds CephX keys and configuration in centralized mon database
replicas := fs.Spec.MetadataServer.ActiveCount * 2
for i := 0; i < int(replicas); i++ {
daemonLetterID := k8sutil.IndexToName(i)
daemonName := fmt.Sprintf("%s-%s", fs.Name, daemonLetterID)

err = c.DeleteMdsCephObjects(daemonName)
err := c.DeleteMdsCephObjects(daemonName)
if err != nil {
return errors.Wrapf(err, "failed to delete mds ceph objects for filesystem %q", fs.Name)
}
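For reference, the daemon names removed in the deletion loop above are built from the filesystem name plus a letter index. A simplified, self-contained illustration follows; indexToName here is a stand-in re-implementation, not the actual k8sutil helper.

```go
package main

import "fmt"

// indexToName is a simplified stand-in for k8sutil.IndexToName used in the
// deletion loop above: it turns a zero-based index into the letter suffix of
// an MDS daemon name ("a", "b", "c", ...). Illustration only, not the rook helper.
func indexToName(i int) string {
	return string(rune('a' + i)) // enough for the small replica counts used here
}

func main() {
	fsName := "myfs"
	activeCount := 2
	// One active daemon plus one standby per active rank, hence
	// ActiveCount * 2 daemon names to clean up when the filesystem is deleted.
	replicas := activeCount * 2
	for i := 0; i < replicas; i++ {
		fmt.Printf("%s-%s\n", fsName, indexToName(i)) // myfs-a, myfs-b, myfs-c, myfs-d
	}
}
```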
10 changes: 4 additions & 6 deletions pkg/operator/ceph/file/mds/mds.go
@@ -20,7 +20,6 @@ package mds
import (
"context"
"fmt"
"strconv"
"strings"
"syscall"
"time"
@@ -58,7 +57,6 @@ type Cluster struct {
context *clusterd.Context
clusterSpec *cephv1.ClusterSpec
fs cephv1.CephFilesystem
fsID string
ownerInfo *k8sutil.OwnerInfo
dataDirHostPath string
}
@@ -75,7 +73,6 @@ func NewCluster(
context *clusterd.Context,
clusterSpec *cephv1.ClusterSpec,
fs cephv1.CephFilesystem,
fsdetails *cephclient.CephFilesystemDetails,
ownerInfo *k8sutil.OwnerInfo,
dataDirHostPath string,
) *Cluster {
@@ -84,7 +81,6 @@
context: context,
clusterSpec: clusterSpec,
fs: fs,
fsID: strconv.Itoa(fsdetails.ID),
ownerInfo: ownerInfo,
dataDirHostPath: dataDirHostPath,
}
@@ -232,7 +228,7 @@ func (c *Cluster) isCephUpgrade() (bool, error) {
return false, err
}
if cephver.IsSuperior(c.clusterInfo.CephVersion, *currentVersion) {
logger.Debugf("ceph version for MDS %q is %q and target version is %q", key, currentVersion, c.clusterInfo.CephVersion)
logger.Debugf("ceph version for MDS %q is %q and target version is %q", key, currentVersion.String(), c.clusterInfo.CephVersion.String())
return true, err
}
}
@@ -249,7 +245,9 @@ func (c *Cluster) upgradeMDS() error {
return errors.Wrap(err, "failed to setting allow_standby_replay to false")
}

// In Pacific, standby-replay daemons are stopped automatically. Older versions of Ceph require us to stop these daemons manually.
// In Pacific, standby-replay daemons are stopped automatically. Older versions of Ceph require
// us to stop these daemons manually.
// TODO: so why don't we have a version check?
if err := cephclient.FailAllStandbyReplayMDS(c.context, c.clusterInfo, c.fs.Name); err != nil {
return errors.Wrap(err, "failed to fail mds agent in up:standby-replay state")
}
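The TODO above asks why the manual standby-replay shutdown is not gated on the Ceph version. A hedged sketch of what such a gate could look like; the type and helper below are stand-ins for illustration, not the actual cephver API.

```go
package main

import "fmt"

// cephVersion is a minimal stand-in for the cephver.CephVersion type used in
// the upgrade code above, just to illustrate the version gate the TODO asks for.
type cephVersion struct{ Major, Minor int }

// isAtLeastPacific reports whether the version is Pacific (v16) or newer.
// Hypothetical helper; the real cephver package has its own comparison functions.
func (v cephVersion) isAtLeastPacific() bool { return v.Major >= 16 }

func main() {
	target := cephVersion{Major: 16, Minor: 2}
	if target.isAtLeastPacific() {
		fmt.Println("Pacific or newer: standby-replay daemons stop automatically, skip the manual fail")
	} else {
		fmt.Println("older than Pacific: fail standby-replay daemons manually before the upgrade")
	}
}
```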
1 change: 0 additions & 1 deletion pkg/operator/ceph/file/mds/spec_test.go
@@ -72,7 +72,6 @@ func testDeploymentObject(t *testing.T, network cephv1.NetworkSpec) (*apps.Deplo
Network: network,
},
fs,
&cephclient.CephFilesystemDetails{ID: 15},
&k8sutil.OwnerInfo{},
"/var/lib/rook/",
)
9 changes: 8 additions & 1 deletion tests/integration/ceph_base_file_test.go
@@ -387,7 +387,14 @@ func createFilesystem(helper *clients.TestClient, k8sh *utils.K8sHelper, s suite
require.Nil(s.T(), fscErr)
logger.Infof("File system %s created", filesystemName)

filesystemList, _ := helper.FSClient.List(settings.Namespace)
var filesystemList []cephclient.CephFilesystem
for range []int{0, 10} {
filesystemList, _ = helper.FSClient.List(settings.Namespace)
if len(filesystemList) == 1 {
break
}
time.Sleep(time.Second * 5)
}
require.Equal(s.T(), 1, len(filesystemList), "There should be one shared file system present")
}

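The retry added to the test polls until the new filesystem shows up in the list. A self-contained sketch of the same pattern with an explicit attempt counter; listFilesystems is a hypothetical stand-in for helper.FSClient.List.

```go
package main

import (
	"fmt"
	"time"
)

// listFilesystems is a hypothetical stand-in for helper.FSClient.List in the
// test above; in the real test it queries the Rook cluster under test.
func listFilesystems() []string { return []string{"myfs"} }

func main() {
	const retries = 10
	var filesystems []string
	// Poll until exactly one filesystem is reported, sleeping 5s between
	// attempts, which mirrors the intent of the retry added to the test.
	for i := 0; i < retries; i++ {
		filesystems = listFilesystems()
		if len(filesystems) == 1 {
			break
		}
		time.Sleep(5 * time.Second)
	}
	fmt.Printf("found %d filesystem(s)\n", len(filesystems))
}
```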
