diff --git a/pkg/daemon/ceph/client/mon.go b/pkg/daemon/ceph/client/mon.go
index 08cfc92a8082..77f94794f5d3 100644
--- a/pkg/daemon/ceph/client/mon.go
+++ b/pkg/daemon/ceph/client/mon.go
@@ -64,6 +64,7 @@ type MonDump struct {
 	FSID          string         `json:"fsid"`
 	Mons          []MonDumpEntry `json:"mons"`
 	Quorum        []int          `json:"quorum"`
+	TiebreakerMon string         `json:"tiebreaker_mon"`
 }
 
 type MonDumpEntry struct {
@@ -141,7 +142,7 @@ func SetMonStretchTiebreaker(context *clusterd.Context, clusterInfo *ClusterInfo
 		if code, ok := exec.ExitStatus(err); ok && code == int(syscall.EINVAL) {
 			// TODO: Get a more distinctive error from ceph so we don't have to compare the error message
 			if strings.Contains(string(buf), "stretch mode is already engaged") {
-				logger.Infof("stretch mode is already enabled")
+				logger.Info("stretch mode is already enabled")
 				return nil
 			}
 			return errors.Wrapf(err, "stretch mode failed to be enabled. %s", string(buf))
diff --git a/pkg/operator/ceph/cluster/cluster.go b/pkg/operator/ceph/cluster/cluster.go
index e2cc23f38adb..0c4bab321b4e 100755
--- a/pkg/operator/ceph/cluster/cluster.go
+++ b/pkg/operator/ceph/cluster/cluster.go
@@ -152,8 +152,7 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep
 
 	// If a stretch cluster, enable the arbiter after the OSDs are created with the CRUSH map
 	if c.Spec.IsStretchCluster() {
-		failingOver := false
-		if err := c.mons.ConfigureArbiter(failingOver); err != nil {
+		if err := c.mons.ConfigureArbiter(); err != nil {
 			return errors.Wrap(err, "failed to configure stretch arbiter")
 		}
 	}
diff --git a/pkg/operator/ceph/cluster/mon/health.go b/pkg/operator/ceph/cluster/mon/health.go
index f535d1307d23..0336b3762590 100644
--- a/pkg/operator/ceph/cluster/mon/health.go
+++ b/pkg/operator/ceph/cluster/mon/health.go
@@ -455,9 +455,7 @@ func (c *Cluster) failoverMon(name string) error {
 
 	// remove the failed mon from a local list of the existing mons for finding a stretch zone
 	existingMons := c.clusterInfoToMonConfig(name)
-	// Cache the name of the current arbiter in case it is updated during the failover
-	// This allows a simple check for updating the arbiter later in this method
-	currentArbiter := c.arbiterMon
+
 	zone, err := c.findAvailableZoneIfStretched(existingMons)
 	if err != nil {
 		return errors.Wrap(err, "failed to find available stretch zone")
@@ -496,10 +494,9 @@ func (c *Cluster) failoverMon(name string) error {
 	}
 
 	// Assign to a zone if a stretch cluster
-	if c.spec.IsStretchCluster() && name == currentArbiter {
+	if c.spec.IsStretchCluster() {
 		// Update the arbiter mon for the stretch cluster if it changed
-		failingOver := true
-		if err := c.ConfigureArbiter(failingOver); err != nil {
+		if err := c.ConfigureArbiter(); err != nil {
 			return errors.Wrap(err, "failed to configure stretch arbiter")
 		}
 	}
diff --git a/pkg/operator/ceph/cluster/mon/mon.go b/pkg/operator/ceph/cluster/mon/mon.go
index 8ffa85c62b80..d35268007c3f 100644
--- a/pkg/operator/ceph/cluster/mon/mon.go
+++ b/pkg/operator/ceph/cluster/mon/mon.go
@@ -338,31 +338,37 @@ func (c *Cluster) isArbiterZone(zone string) bool {
 	return c.getArbiterZone() == zone
 }
 
-func (c *Cluster) ConfigureArbiter(failingOver bool) error {
+func (c *Cluster) ConfigureArbiter() error {
 	if c.arbiterMon == "" {
 		return errors.New("arbiter not specified for the stretch cluster")
 	}
 
-	failureDomain := c.stretchFailureDomainName()
-	if failingOver {
-		// Set the new mon tiebreaker
-		if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
-			return errors.Wrap(err, "failed to set new mon tiebreaker")
-		}
-		return nil
-	}
-
 	monDump, err := cephclient.GetMonDump(c.context, c.ClusterInfo)
 	if err != nil {
 		logger.Warningf("attempting to enable arbiter after failed to detect if already enabled. %v", err)
 	} else if monDump.StretchMode {
-		logger.Infof("stretch mode is already enabled")
+		// only support arbiter failover if at least v16.2.7
+		if !c.ClusterInfo.CephVersion.IsAtLeast(arbiterFailoverSupportedCephVersion) {
+			logger.Info("stretch mode is already enabled")
+			return nil
+		}
+
+		if monDump.TiebreakerMon == c.arbiterMon {
+			logger.Infof("stretch mode is already enabled with tiebreaker %q", c.arbiterMon)
+			return nil
+		}
+		// Set the new mon tiebreaker
+		logger.Infof("updating tiebreaker mon from %q to %q", monDump.TiebreakerMon, c.arbiterMon)
+		if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
+			return errors.Wrap(err, "failed to set new mon tiebreaker")
+		}
 		return nil
 	}
 
 	// Wait for the CRUSH map to have at least two zones
 	// The timeout is relatively short since the operator will requeue the reconcile
 	// and try again at a higher level if not yet found
+	failureDomain := c.stretchFailureDomainName()
 	logger.Infof("enabling stretch mode... waiting for two failure domains of type %q to be found in the CRUSH map after OSD initialization", failureDomain)
 	pollInterval := 5 * time.Second
 	totalWaitTime := 2 * time.Minute
diff --git a/pkg/operator/ceph/cluster/mon/mon_test.go b/pkg/operator/ceph/cluster/mon/mon_test.go
index 08bd62a6cd92..f53630993c72 100644
--- a/pkg/operator/ceph/cluster/mon/mon_test.go
+++ b/pkg/operator/ceph/cluster/mon/mon_test.go
@@ -464,6 +464,63 @@ func TestMonFoundInQuorum(t *testing.T) {
 	assert.False(t, monFoundInQuorum("d", response))
 }
 
+func TestConfigureArbiter(t *testing.T) {
+	c := &Cluster{spec: cephv1.ClusterSpec{
+		Mon: cephv1.MonSpec{
+			StretchCluster: &cephv1.StretchClusterSpec{
+				Zones: []cephv1.StretchClusterZoneSpec{
+					{Name: "a", Arbiter: true},
+					{Name: "b"},
+					{Name: "c"},
+				},
+			},
+		},
+	}}
+	c.arbiterMon = "arb"
+	currentArbiter := c.arbiterMon
+	setNewTiebreaker := false
+	executor := &exectest.MockExecutor{
+		MockExecuteCommandWithOutput: func(command string, args ...string) (string, error) {
+			logger.Infof("%s %v", command, args)
+			if args[0] == "mon" {
+				if args[1] == "dump" {
+					return fmt.Sprintf(`{"tiebreaker_mon": "%s", "stretch_mode": true}`, currentArbiter), nil
+				}
+				if args[1] == "set_new_tiebreaker" {
+					assert.Equal(t, c.arbiterMon, args[2])
+					setNewTiebreaker = true
+					return "", nil
+				}
+			}
+			return "", fmt.Errorf("unrecognized output file command: %s %v", command, args)
+		},
+	}
+	c.context = &clusterd.Context{Clientset: test.New(t, 5), Executor: executor}
+	c.ClusterInfo = clienttest.CreateTestClusterInfo(5)
+
+	t.Run("no arbiter failover for old ceph version", func(t *testing.T) {
+		c.arbiterMon = "changed"
+		c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 6}
+		err := c.ConfigureArbiter()
+		assert.NoError(t, err)
+		assert.False(t, setNewTiebreaker)
+	})
+	t.Run("stretch mode already configured - new", func(t *testing.T) {
+		c.arbiterMon = currentArbiter
+		c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
+		err := c.ConfigureArbiter()
+		assert.NoError(t, err)
+		assert.False(t, setNewTiebreaker)
+	})
+	t.Run("tiebreaker changed", func(t *testing.T) {
+		c.arbiterMon = "changed"
+		c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
+		err := c.ConfigureArbiter()
+		assert.NoError(t, err)
+		assert.True(t, setNewTiebreaker)
+	})
+}
+
 func TestFindAvailableZoneForStretchedMon(t *testing.T) {
 	c := &Cluster{spec: cephv1.ClusterSpec{
 		Mon: cephv1.MonSpec{