mon: Set stretch tiebreaker reliably during failover #9282

Merged · 1 commit · Dec 3, 2021
3 changes: 2 additions & 1 deletion pkg/daemon/ceph/client/mon.go
@@ -64,6 +64,7 @@ type MonDump struct {
FSID string `json:"fsid"`
Mons []MonDumpEntry `json:"mons"`
Quorum []int `json:"quorum"`
+ TiebreakerMon string `json:"tiebreaker_mon"`
}

type MonDumpEntry struct {
@@ -141,7 +142,7 @@ func SetMonStretchTiebreaker(context *clusterd.Context, clusterInfo *ClusterInfo
if code, ok := exec.ExitStatus(err); ok && code == int(syscall.EINVAL) {
// TODO: Get a more distinctive error from ceph so we don't have to compare the error message
if strings.Contains(string(buf), "stretch mode is already engaged") {
logger.Infof("stretch mode is already enabled")
logger.Info("stretch mode is already enabled")
return nil
}
return errors.Wrapf(err, "stretch mode failed to be enabled. %s", string(buf))
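For context on how the new `TiebreakerMon` field is used: `GetMonDump` unmarshals the JSON produced by `ceph mon dump` into `MonDump`, which makes the cluster's current tiebreaker visible to the operator. The snippet below is a minimal, self-contained sketch (not part of this PR); the abbreviated sample JSON and the trimmed field set are illustrative assumptions only.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed local copy of the MonDump struct from pkg/daemon/ceph/client/mon.go,
// keeping only the fields relevant to the tiebreaker change.
type monDump struct {
	StretchMode   bool   `json:"stretch_mode"`
	FSID          string `json:"fsid"`
	TiebreakerMon string `json:"tiebreaker_mon"`
}

func main() {
	// Abbreviated, illustrative shape of `ceph mon dump -f json` output on a
	// stretch cluster; a real dump contains many more fields.
	raw := `{"fsid":"11111111-2222-3333-4444-555555555555","stretch_mode":true,"tiebreaker_mon":"c"}`

	var dump monDump
	if err := json.Unmarshal([]byte(raw), &dump); err != nil {
		panic(err)
	}
	fmt.Printf("stretch mode enabled: %v, current tiebreaker: %q\n", dump.StretchMode, dump.TiebreakerMon)
}
```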
3 changes: 1 addition & 2 deletions pkg/operator/ceph/cluster/cluster.go
@@ -138,8 +138,7 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep

// If a stretch cluster, enable the arbiter after the OSDs are created with the CRUSH map
if c.Spec.IsStretchCluster() {
- failingOver := false
- if err := c.mons.ConfigureArbiter(failingOver); err != nil {
+ if err := c.mons.ConfigureArbiter(); err != nil {
return errors.Wrap(err, "failed to configure stretch arbiter")
}
}
9 changes: 3 additions & 6 deletions pkg/operator/ceph/cluster/mon/health.go
@@ -453,9 +453,7 @@ func (c *Cluster) failoverMon(name string) error {

// remove the failed mon from a local list of the existing mons for finding a stretch zone
existingMons := c.clusterInfoToMonConfig(name)
- // Cache the name of the current arbiter in case it is updated during the failover
- // This allows a simple check for updating the arbiter later in this method
- currentArbiter := c.arbiterMon

zone, err := c.findAvailableZoneIfStretched(existingMons)
if err != nil {
return errors.Wrap(err, "failed to find available stretch zone")
@@ -494,10 +492,9 @@ func (c *Cluster) failoverMon(name string) error {
}

// Assign to a zone if a stretch cluster
- if c.spec.IsStretchCluster() && name == currentArbiter {
+ if c.spec.IsStretchCluster() {
// Update the arbiter mon for the stretch cluster if it changed
- failingOver := true
- if err := c.ConfigureArbiter(failingOver); err != nil {
+ if err := c.ConfigureArbiter(); err != nil {
return errors.Wrap(err, "failed to configure stretch arbiter")
}
}
28 changes: 17 additions & 11 deletions pkg/operator/ceph/cluster/mon/mon.go
@@ -339,31 +339,37 @@ func (c *Cluster) isArbiterZone(zone string) bool {
return c.getArbiterZone() == zone
}

- func (c *Cluster) ConfigureArbiter(failingOver bool) error {
+ func (c *Cluster) ConfigureArbiter() error {
if c.arbiterMon == "" {
return errors.New("arbiter not specified for the stretch cluster")
}

- failureDomain := c.stretchFailureDomainName()
- if failingOver {
- // Set the new mon tiebreaker
- if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
- return errors.Wrap(err, "failed to set new mon tiebreaker")
- }
- return nil
- }

monDump, err := cephclient.GetMonDump(c.context, c.ClusterInfo)
if err != nil {
logger.Warningf("attempting to enable arbiter after failed to detect if already enabled. %v", err)
} else if monDump.StretchMode {
logger.Infof("stretch mode is already enabled")
// only support arbiter failover if at least v16.2.7
if !c.ClusterInfo.CephVersion.IsAtLeast(arbiterFailoverSupportedCephVersion) {
logger.Info("stretch mode is already enabled")
return nil
}

if monDump.TiebreakerMon == c.arbiterMon {
logger.Infof("stretch mode is already enabled with tiebreaker %q", c.arbiterMon)
return nil
}
// Set the new mon tiebreaker
logger.Infof("updating tiebreaker mon from %q to %q", monDump.TiebreakerMon, c.arbiterMon)
if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
return errors.Wrap(err, "failed to set new mon tiebreaker")
}
return nil
}

// Wait for the CRUSH map to have at least two zones
// The timeout is relatively short since the operator will requeue the reconcile
// and try again at a higher level if not yet found
failureDomain := c.stretchFailureDomainName()
logger.Infof("enabling stretch mode... waiting for two failure domains of type %q to be found in the CRUSH map after OSD initialization", failureDomain)
pollInterval := 5 * time.Second
totalWaitTime := 2 * time.Minute
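To summarize the new control flow: if stretch mode is not yet engaged, ConfigureArbiter still enables it from scratch (after waiting for the CRUSH failure domains as shown above); if stretch mode is already engaged, the tiebreaker is changed only when `ceph mon dump` reports a different tiebreaker than the desired arbiter, and only on Ceph v16.2.7 or newer, the version gated by `arbiterFailoverSupportedCephVersion` and exercised in the new tests. The following self-contained sketch (hypothetical helper, not code from this PR) models that decision:

```go
package main

import "fmt"

// Minimal stand-ins for the real Rook types; only the fields needed here.
type monDump struct {
	StretchMode   bool
	TiebreakerMon string
}

type cephVersion struct{ Major, Minor, Extra int }

// isAtLeast mirrors the semantics assumed for CephVersion.IsAtLeast above.
func (v cephVersion) isAtLeast(o cephVersion) bool {
	if v.Major != o.Major {
		return v.Major > o.Major
	}
	if v.Minor != o.Minor {
		return v.Minor > o.Minor
	}
	return v.Extra >= o.Extra
}

// v16.2.7 is the threshold used by the version gate and tests in this PR.
var arbiterFailoverSupported = cephVersion{16, 2, 7}

// planArbiterAction reports what ConfigureArbiter would do for a given mon
// dump, running Ceph version, and desired arbiter mon.
func planArbiterAction(dump monDump, running cephVersion, desiredArbiter string) string {
	if !dump.StretchMode {
		return "enable stretch mode (wait for CRUSH zones, then set the tiebreaker)"
	}
	if !running.isAtLeast(arbiterFailoverSupported) {
		return "stretch mode already enabled; tiebreaker failover not supported on this Ceph version"
	}
	if dump.TiebreakerMon == desiredArbiter {
		return "stretch mode already enabled with the desired tiebreaker"
	}
	return "update tiebreaker via `ceph mon set_new_tiebreaker " + desiredArbiter + "`"
}

func main() {
	dump := monDump{StretchMode: true, TiebreakerMon: "a"}
	fmt.Println(planArbiterAction(dump, cephVersion{16, 2, 7}, "d"))
	// Output: update tiebreaker via `ceph mon set_new_tiebreaker d`
}
```

The `ceph mon set_new_tiebreaker <mon>` invocation in the last branch matches the command asserted by the mock executor in the new unit test below.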
57 changes: 57 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon_test.go
@@ -462,6 +462,63 @@ func TestMonFoundInQuorum(t *testing.T) {
assert.False(t, monFoundInQuorum("d", response))
}

func TestConfigureArbiter(t *testing.T) {
c := &Cluster{spec: cephv1.ClusterSpec{
Mon: cephv1.MonSpec{
StretchCluster: &cephv1.StretchClusterSpec{
Zones: []cephv1.StretchClusterZoneSpec{
{Name: "a", Arbiter: true},
{Name: "b"},
{Name: "c"},
},
},
},
}}
c.arbiterMon = "arb"
currentArbiter := c.arbiterMon
setNewTiebreaker := false
executor := &exectest.MockExecutor{
MockExecuteCommandWithOutput: func(command string, args ...string) (string, error) {
logger.Infof("%s %v", command, args)
if args[0] == "mon" {
if args[1] == "dump" {
return fmt.Sprintf(`{"tiebreaker_mon": "%s", "stretch_mode": true}`, currentArbiter), nil
}
if args[1] == "set_new_tiebreaker" {
assert.Equal(t, c.arbiterMon, args[2])
setNewTiebreaker = true
return "", nil
}
}
return "", fmt.Errorf("unrecognized output file command: %s %v", command, args)
},
}
c.context = &clusterd.Context{Clientset: test.New(t, 5), Executor: executor}
c.ClusterInfo = clienttest.CreateTestClusterInfo(5)

t.Run("no arbiter failover for old ceph version", func(t *testing.T) {
c.arbiterMon = "changed"
c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 6}
err := c.ConfigureArbiter()
assert.NoError(t, err)
assert.False(t, setNewTiebreaker)
})
t.Run("stretch mode already configured - new", func(t *testing.T) {
c.arbiterMon = currentArbiter
c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
err := c.ConfigureArbiter()
assert.NoError(t, err)
assert.False(t, setNewTiebreaker)
})
t.Run("tiebreaker changed", func(t *testing.T) {
c.arbiterMon = "changed"
c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
err := c.ConfigureArbiter()
assert.NoError(t, err)
assert.True(t, setNewTiebreaker)
})
}

func TestFindAvailableZoneForStretchedMon(t *testing.T) {
c := &Cluster{spec: cephv1.ClusterSpec{
Mon: cephv1.MonSpec{