Skip to content

Commit

Permalink
Merge pull request #9308 from rook/mergify/bp/release-1.7/pr-9282
Browse files Browse the repository at this point in the history
mon: Set stretch tiebreaker reliably during failover (backport #9282)
  • Loading branch information
mergify[bot] committed Dec 3, 2021
2 parents b828f71 + 6a30b55 commit c994c8c
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 20 deletions.
3 changes: 2 additions & 1 deletion pkg/daemon/ceph/client/mon.go
Expand Up @@ -64,6 +64,7 @@ type MonDump struct {
FSID string `json:"fsid"`
Mons []MonDumpEntry `json:"mons"`
Quorum []int `json:"quorum"`
TiebreakerMon string `json:"tiebreaker_mon"`
}

type MonDumpEntry struct {
Expand Down Expand Up @@ -141,7 +142,7 @@ func SetMonStretchTiebreaker(context *clusterd.Context, clusterInfo *ClusterInfo
if code, ok := exec.ExitStatus(err); ok && code == int(syscall.EINVAL) {
// TODO: Get a more distinctive error from ceph so we don't have to compare the error message
if strings.Contains(string(buf), "stretch mode is already engaged") {
logger.Infof("stretch mode is already enabled")
logger.Info("stretch mode is already enabled")
return nil
}
return errors.Wrapf(err, "stretch mode failed to be enabled. %s", string(buf))
Expand Down
3 changes: 1 addition & 2 deletions pkg/operator/ceph/cluster/cluster.go
Expand Up @@ -152,8 +152,7 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep

// If a stretch cluster, enable the arbiter after the OSDs are created with the CRUSH map
if c.Spec.IsStretchCluster() {
failingOver := false
if err := c.mons.ConfigureArbiter(failingOver); err != nil {
if err := c.mons.ConfigureArbiter(); err != nil {
return errors.Wrap(err, "failed to configure stretch arbiter")
}
}
Expand Down
9 changes: 3 additions & 6 deletions pkg/operator/ceph/cluster/mon/health.go
Expand Up @@ -455,9 +455,7 @@ func (c *Cluster) failoverMon(name string) error {

// remove the failed mon from a local list of the existing mons for finding a stretch zone
existingMons := c.clusterInfoToMonConfig(name)
// Cache the name of the current arbiter in case it is updated during the failover
// This allows a simple check for updating the arbiter later in this method
currentArbiter := c.arbiterMon

zone, err := c.findAvailableZoneIfStretched(existingMons)
if err != nil {
return errors.Wrap(err, "failed to find available stretch zone")
Expand Down Expand Up @@ -496,10 +494,9 @@ func (c *Cluster) failoverMon(name string) error {
}

// Assign to a zone if a stretch cluster
if c.spec.IsStretchCluster() && name == currentArbiter {
if c.spec.IsStretchCluster() {
// Update the arbiter mon for the stretch cluster if it changed
failingOver := true
if err := c.ConfigureArbiter(failingOver); err != nil {
if err := c.ConfigureArbiter(); err != nil {
return errors.Wrap(err, "failed to configure stretch arbiter")
}
}
Expand Down
28 changes: 17 additions & 11 deletions pkg/operator/ceph/cluster/mon/mon.go
Expand Up @@ -338,31 +338,37 @@ func (c *Cluster) isArbiterZone(zone string) bool {
return c.getArbiterZone() == zone
}

func (c *Cluster) ConfigureArbiter(failingOver bool) error {
func (c *Cluster) ConfigureArbiter() error {
if c.arbiterMon == "" {
return errors.New("arbiter not specified for the stretch cluster")
}

failureDomain := c.stretchFailureDomainName()
if failingOver {
// Set the new mon tiebreaker
if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
return errors.Wrap(err, "failed to set new mon tiebreaker")
}
return nil
}

monDump, err := cephclient.GetMonDump(c.context, c.ClusterInfo)
if err != nil {
logger.Warningf("attempting to enable arbiter after failed to detect if already enabled. %v", err)
} else if monDump.StretchMode {
logger.Infof("stretch mode is already enabled")
// only support arbiter failover if at least v16.2.7
if !c.ClusterInfo.CephVersion.IsAtLeast(arbiterFailoverSupportedCephVersion) {
logger.Info("stretch mode is already enabled")
return nil
}

if monDump.TiebreakerMon == c.arbiterMon {
logger.Infof("stretch mode is already enabled with tiebreaker %q", c.arbiterMon)
return nil
}
// Set the new mon tiebreaker
logger.Infof("updating tiebreaker mon from %q to %q", monDump.TiebreakerMon, c.arbiterMon)
if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
return errors.Wrap(err, "failed to set new mon tiebreaker")
}
return nil
}

// Wait for the CRUSH map to have at least two zones
// The timeout is relatively short since the operator will requeue the reconcile
// and try again at a higher level if not yet found
failureDomain := c.stretchFailureDomainName()
logger.Infof("enabling stretch mode... waiting for two failure domains of type %q to be found in the CRUSH map after OSD initialization", failureDomain)
pollInterval := 5 * time.Second
totalWaitTime := 2 * time.Minute
Expand Down
57 changes: 57 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon_test.go
Expand Up @@ -464,6 +464,63 @@ func TestMonFoundInQuorum(t *testing.T) {
assert.False(t, monFoundInQuorum("d", response))
}

func TestConfigureArbiter(t *testing.T) {
c := &Cluster{spec: cephv1.ClusterSpec{
Mon: cephv1.MonSpec{
StretchCluster: &cephv1.StretchClusterSpec{
Zones: []cephv1.StretchClusterZoneSpec{
{Name: "a", Arbiter: true},
{Name: "b"},
{Name: "c"},
},
},
},
}}
c.arbiterMon = "arb"
currentArbiter := c.arbiterMon
setNewTiebreaker := false
executor := &exectest.MockExecutor{
MockExecuteCommandWithOutput: func(command string, args ...string) (string, error) {
logger.Infof("%s %v", command, args)
if args[0] == "mon" {
if args[1] == "dump" {
return fmt.Sprintf(`{"tiebreaker_mon": "%s", "stretch_mode": true}`, currentArbiter), nil
}
if args[1] == "set_new_tiebreaker" {
assert.Equal(t, c.arbiterMon, args[2])
setNewTiebreaker = true
return "", nil
}
}
return "", fmt.Errorf("unrecognized output file command: %s %v", command, args)
},
}
c.context = &clusterd.Context{Clientset: test.New(t, 5), Executor: executor}
c.ClusterInfo = clienttest.CreateTestClusterInfo(5)

t.Run("no arbiter failover for old ceph version", func(t *testing.T) {
c.arbiterMon = "changed"
c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 6}
err := c.ConfigureArbiter()
assert.NoError(t, err)
assert.False(t, setNewTiebreaker)
})
t.Run("stretch mode already configured - new", func(t *testing.T) {
c.arbiterMon = currentArbiter
c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
err := c.ConfigureArbiter()
assert.NoError(t, err)
assert.False(t, setNewTiebreaker)
})
t.Run("tiebreaker changed", func(t *testing.T) {
c.arbiterMon = "changed"
c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
err := c.ConfigureArbiter()
assert.NoError(t, err)
assert.True(t, setNewTiebreaker)
})
}

func TestFindAvailableZoneForStretchedMon(t *testing.T) {
c := &Cluster{spec: cephv1.ClusterSpec{
Mon: cephv1.MonSpec{
Expand Down

0 comments on commit c994c8c

Please sign in to comment.