From 6a30b55ee14d4cf6d6c67eafe75cb3597c2e830f Mon Sep 17 00:00:00 2001 From: Travis Nielsen Date: Tue, 30 Nov 2021 17:50:47 -0700 Subject: [PATCH] mon: set stretch tiebreaker reliably during failover The failover of the arbiter mon in a stretch cluster was sometimes failing due to the new tiebreaker not being set in ceph. Rook would repeatedly try to remove the old tiebreaker mon and keep failing because the new tiebreaker had not been set. Now we make setting the tiebreaker idempotent in case the operator restarts in the middle of the operation or some other corner case causes the expected tiebreaker to be set. In that case, the next reconcile will also ensure the tiebreaker mon is set as expected. Signed-off-by: Travis Nielsen (cherry picked from commit d55669293a515f6647daa24927dcda54ead802b1) --- pkg/daemon/ceph/client/mon.go | 3 +- pkg/operator/ceph/cluster/cluster.go | 3 +- pkg/operator/ceph/cluster/mon/health.go | 9 ++-- pkg/operator/ceph/cluster/mon/mon.go | 28 ++++++----- pkg/operator/ceph/cluster/mon/mon_test.go | 57 +++++++++++++++++++++++ 5 files changed, 80 insertions(+), 20 deletions(-) diff --git a/pkg/daemon/ceph/client/mon.go b/pkg/daemon/ceph/client/mon.go index 08cfc92a8082..77f94794f5d3 100644 --- a/pkg/daemon/ceph/client/mon.go +++ b/pkg/daemon/ceph/client/mon.go @@ -64,6 +64,7 @@ type MonDump struct { FSID string `json:"fsid"` Mons []MonDumpEntry `json:"mons"` Quorum []int `json:"quorum"` + TiebreakerMon string `json:"tiebreaker_mon"` } type MonDumpEntry struct { @@ -141,7 +142,7 @@ func SetMonStretchTiebreaker(context *clusterd.Context, clusterInfo *ClusterInfo if code, ok := exec.ExitStatus(err); ok && code == int(syscall.EINVAL) { // TODO: Get a more distinctive error from ceph so we don't have to compare the error message if strings.Contains(string(buf), "stretch mode is already engaged") { - logger.Infof("stretch mode is already enabled") + logger.Info("stretch mode is already enabled") return nil } return errors.Wrapf(err, "stretch mode failed to be enabled. %s", string(buf)) diff --git a/pkg/operator/ceph/cluster/cluster.go b/pkg/operator/ceph/cluster/cluster.go index e2cc23f38adb..0c4bab321b4e 100755 --- a/pkg/operator/ceph/cluster/cluster.go +++ b/pkg/operator/ceph/cluster/cluster.go @@ -152,8 +152,7 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep // If a stretch cluster, enable the arbiter after the OSDs are created with the CRUSH map if c.Spec.IsStretchCluster() { - failingOver := false - if err := c.mons.ConfigureArbiter(failingOver); err != nil { + if err := c.mons.ConfigureArbiter(); err != nil { return errors.Wrap(err, "failed to configure stretch arbiter") } } diff --git a/pkg/operator/ceph/cluster/mon/health.go b/pkg/operator/ceph/cluster/mon/health.go index f535d1307d23..0336b3762590 100644 --- a/pkg/operator/ceph/cluster/mon/health.go +++ b/pkg/operator/ceph/cluster/mon/health.go @@ -455,9 +455,7 @@ func (c *Cluster) failoverMon(name string) error { // remove the failed mon from a local list of the existing mons for finding a stretch zone existingMons := c.clusterInfoToMonConfig(name) - // Cache the name of the current arbiter in case it is updated during the failover - // This allows a simple check for updating the arbiter later in this method - currentArbiter := c.arbiterMon + zone, err := c.findAvailableZoneIfStretched(existingMons) if err != nil { return errors.Wrap(err, "failed to find available stretch zone") @@ -496,10 +494,9 @@ func (c *Cluster) failoverMon(name string) error { } // Assign to a zone if a stretch cluster - if c.spec.IsStretchCluster() && name == currentArbiter { + if c.spec.IsStretchCluster() { // Update the arbiter mon for the stretch cluster if it changed - failingOver := true - if err := c.ConfigureArbiter(failingOver); err != nil { + if err := c.ConfigureArbiter(); err != nil { return errors.Wrap(err, "failed to configure stretch arbiter") } } diff --git a/pkg/operator/ceph/cluster/mon/mon.go b/pkg/operator/ceph/cluster/mon/mon.go index 8ffa85c62b80..d35268007c3f 100644 --- a/pkg/operator/ceph/cluster/mon/mon.go +++ b/pkg/operator/ceph/cluster/mon/mon.go @@ -338,31 +338,37 @@ func (c *Cluster) isArbiterZone(zone string) bool { return c.getArbiterZone() == zone } -func (c *Cluster) ConfigureArbiter(failingOver bool) error { +func (c *Cluster) ConfigureArbiter() error { if c.arbiterMon == "" { return errors.New("arbiter not specified for the stretch cluster") } - failureDomain := c.stretchFailureDomainName() - if failingOver { - // Set the new mon tiebreaker - if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil { - return errors.Wrap(err, "failed to set new mon tiebreaker") - } - return nil - } - monDump, err := cephclient.GetMonDump(c.context, c.ClusterInfo) if err != nil { logger.Warningf("attempting to enable arbiter after failed to detect if already enabled. %v", err) } else if monDump.StretchMode { - logger.Infof("stretch mode is already enabled") + // only support arbiter failover if at least v16.2.7 + if !c.ClusterInfo.CephVersion.IsAtLeast(arbiterFailoverSupportedCephVersion) { + logger.Info("stretch mode is already enabled") + return nil + } + + if monDump.TiebreakerMon == c.arbiterMon { + logger.Infof("stretch mode is already enabled with tiebreaker %q", c.arbiterMon) + return nil + } + // Set the new mon tiebreaker + logger.Infof("updating tiebreaker mon from %q to %q", monDump.TiebreakerMon, c.arbiterMon) + if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil { + return errors.Wrap(err, "failed to set new mon tiebreaker") + } return nil } // Wait for the CRUSH map to have at least two zones // The timeout is relatively short since the operator will requeue the reconcile // and try again at a higher level if not yet found + failureDomain := c.stretchFailureDomainName() logger.Infof("enabling stretch mode... waiting for two failure domains of type %q to be found in the CRUSH map after OSD initialization", failureDomain) pollInterval := 5 * time.Second totalWaitTime := 2 * time.Minute diff --git a/pkg/operator/ceph/cluster/mon/mon_test.go b/pkg/operator/ceph/cluster/mon/mon_test.go index 08bd62a6cd92..f53630993c72 100644 --- a/pkg/operator/ceph/cluster/mon/mon_test.go +++ b/pkg/operator/ceph/cluster/mon/mon_test.go @@ -464,6 +464,63 @@ func TestMonFoundInQuorum(t *testing.T) { assert.False(t, monFoundInQuorum("d", response)) } +func TestConfigureArbiter(t *testing.T) { + c := &Cluster{spec: cephv1.ClusterSpec{ + Mon: cephv1.MonSpec{ + StretchCluster: &cephv1.StretchClusterSpec{ + Zones: []cephv1.StretchClusterZoneSpec{ + {Name: "a", Arbiter: true}, + {Name: "b"}, + {Name: "c"}, + }, + }, + }, + }} + c.arbiterMon = "arb" + currentArbiter := c.arbiterMon + setNewTiebreaker := false + executor := &exectest.MockExecutor{ + MockExecuteCommandWithOutput: func(command string, args ...string) (string, error) { + logger.Infof("%s %v", command, args) + if args[0] == "mon" { + if args[1] == "dump" { + return fmt.Sprintf(`{"tiebreaker_mon": "%s", "stretch_mode": true}`, currentArbiter), nil + } + if args[1] == "set_new_tiebreaker" { + assert.Equal(t, c.arbiterMon, args[2]) + setNewTiebreaker = true + return "", nil + } + } + return "", fmt.Errorf("unrecognized output file command: %s %v", command, args) + }, + } + c.context = &clusterd.Context{Clientset: test.New(t, 5), Executor: executor} + c.ClusterInfo = clienttest.CreateTestClusterInfo(5) + + t.Run("no arbiter failover for old ceph version", func(t *testing.T) { + c.arbiterMon = "changed" + c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 6} + err := c.ConfigureArbiter() + assert.NoError(t, err) + assert.False(t, setNewTiebreaker) + }) + t.Run("stretch mode already configured - new", func(t *testing.T) { + c.arbiterMon = currentArbiter + c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7} + err := c.ConfigureArbiter() + assert.NoError(t, err) + assert.False(t, setNewTiebreaker) + }) + t.Run("tiebreaker changed", func(t *testing.T) { + c.arbiterMon = "changed" + c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7} + err := c.ConfigureArbiter() + assert.NoError(t, err) + assert.True(t, setNewTiebreaker) + }) +} + func TestFindAvailableZoneForStretchedMon(t *testing.T) { c := &Cluster{spec: cephv1.ClusterSpec{ Mon: cephv1.MonSpec{