Skip to content

Commit

Permalink
mon: set stretch tiebreaker reliably during failover
Browse files Browse the repository at this point in the history
The failover of the arbiter mon in a stretch cluster was sometimes
failing due to the new tiebreaker not being set in ceph.
Rook would repeatedly try to remove the old tiebreaker mon
and keep failing because the new tiebreaker had not been set.
Now we make setting the tiebreaker idempotent in case the operator
restarts in the middle of the operation or some other corner
case causes the expected tiebreaker to already be set. In that case,
the next reconcile will also ensure the tiebreaker mon is
set as expected.

Signed-off-by: Travis Nielsen <tnielsen@redhat.com>
  • Loading branch information
travisn committed Dec 1, 2021
1 parent 4ec496d commit 6cd8ad1
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 19 deletions.
1 change: 1 addition & 0 deletions pkg/daemon/ceph/client/mon.go
Expand Up @@ -64,6 +64,7 @@ type MonDump struct {
FSID string `json:"fsid"`
Mons []MonDumpEntry `json:"mons"`
Quorum []int `json:"quorum"`
TiebreakerMon string `json:"tiebreaker_mon"`
}

type MonDumpEntry struct {
Expand Down
3 changes: 1 addition & 2 deletions pkg/operator/ceph/cluster/cluster.go
Expand Up @@ -138,8 +138,7 @@ func (c *cluster) reconcileCephDaemons(rookImage string, cephVersion cephver.Cep

// If a stretch cluster, enable the arbiter after the OSDs are created with the CRUSH map
if c.Spec.IsStretchCluster() {
failingOver := false
if err := c.mons.ConfigureArbiter(failingOver); err != nil {
if err := c.mons.ConfigureArbiter(); err != nil {
return errors.Wrap(err, "failed to configure stretch arbiter")
}
}
Expand Down
9 changes: 3 additions & 6 deletions pkg/operator/ceph/cluster/mon/health.go
Expand Up @@ -453,9 +453,7 @@ func (c *Cluster) failoverMon(name string) error {

// remove the failed mon from a local list of the existing mons for finding a stretch zone
existingMons := c.clusterInfoToMonConfig(name)
// Cache the name of the current arbiter in case it is updated during the failover
// This allows a simple check for updating the arbiter later in this method
currentArbiter := c.arbiterMon

zone, err := c.findAvailableZoneIfStretched(existingMons)
if err != nil {
return errors.Wrap(err, "failed to find available stretch zone")
Expand Down Expand Up @@ -494,10 +492,9 @@ func (c *Cluster) failoverMon(name string) error {
}

// Assign to a zone if a stretch cluster
if c.spec.IsStretchCluster() && name == currentArbiter {
if c.spec.IsStretchCluster() {
// Update the arbiter mon for the stretch cluster if it changed
failingOver := true
if err := c.ConfigureArbiter(failingOver); err != nil {
if err := c.ConfigureArbiter(); err != nil {
return errors.Wrap(err, "failed to configure stretch arbiter")
}
}
Expand Down
28 changes: 17 additions & 11 deletions pkg/operator/ceph/cluster/mon/mon.go
Expand Up @@ -339,31 +339,37 @@ func (c *Cluster) isArbiterZone(zone string) bool {
return c.getArbiterZone() == zone
}

func (c *Cluster) ConfigureArbiter(failingOver bool) error {
func (c *Cluster) ConfigureArbiter() error {
if c.arbiterMon == "" {
return errors.New("arbiter not specified for the stretch cluster")
}

failureDomain := c.stretchFailureDomainName()
if failingOver {
// Set the new mon tiebreaker
if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
return errors.Wrap(err, "failed to set new mon tiebreaker")
}
return nil
}

monDump, err := cephclient.GetMonDump(c.context, c.ClusterInfo)
if err != nil {
logger.Warningf("attempting to enable arbiter after failed to detect if already enabled. %v", err)
} else if monDump.StretchMode {
logger.Infof("stretch mode is already enabled")
// only support arbiter failover if at least v16.2.7
if !c.ClusterInfo.CephVersion.IsAtLeast(arbiterFailoverSupportedCephVersion) {
logger.Infof("stretch mode is already enabled")
return nil
}

if monDump.TiebreakerMon == c.arbiterMon {
logger.Infof("stretch mode is already enabled with tiebreaker %q", c.arbiterMon)
return nil
}
// Set the new mon tiebreaker
logger.Infof("updating tiebreaker mon from %q to %q", monDump.TiebreakerMon, c.arbiterMon)
if err := cephclient.SetNewTiebreaker(c.context, c.ClusterInfo, c.arbiterMon); err != nil {
return errors.Wrap(err, "failed to set new mon tiebreaker")
}
return nil
}

// Wait for the CRUSH map to have at least two zones
// The timeout is relatively short since the operator will requeue the reconcile
// and try again at a higher level if not yet found
failureDomain := c.stretchFailureDomainName()
logger.Infof("enabling stretch mode... waiting for two failure domains of type %q to be found in the CRUSH map after OSD initialization", failureDomain)
pollInterval := 5 * time.Second
totalWaitTime := 2 * time.Minute
Expand Down
55 changes: 55 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon_test.go
Expand Up @@ -462,6 +462,61 @@ func TestMonFoundInQuorum(t *testing.T) {
assert.False(t, monFoundInQuorum("d", response))
}

// TestConfigureArbiter verifies the stretch-cluster tiebreaker logic:
// it is a no-op on Ceph versions without arbiter-failover support, a no-op
// when the dump already reports the desired tiebreaker, and issues
// "mon set_new_tiebreaker" when the desired arbiter mon has changed.
func TestConfigureArbiter(t *testing.T) {
	c := &Cluster{spec: cephv1.ClusterSpec{
		Mon: cephv1.MonSpec{
			StretchCluster: &cephv1.StretchClusterSpec{
				Zones: []cephv1.StretchClusterZoneSpec{
					{Name: "a", Arbiter: true},
					{Name: "b"},
					{Name: "c"},
				},
			},
		},
	}}
	c.arbiterMon = "arb"
	currentArbiter := c.arbiterMon
	// Records whether the mock saw a "mon set_new_tiebreaker" call.
	setNewTiebreaker := false
	executor := &exectest.MockExecutor{
		MockExecuteCommandWithOutput: func(command string, args ...string) (string, error) {
			logger.Infof("%s %v", command, args)
			if args[0] == "mon" {
				if args[1] == "dump" {
					// The dump always reports the original arbiter as the
					// current tiebreaker, with stretch mode enabled.
					return fmt.Sprintf(`{"tiebreaker_mon": %q, "stretch_mode": true}`, currentArbiter), nil
				}
				if args[1] == "set_new_tiebreaker" {
					assert.Equal(t, c.arbiterMon, args[2])
					setNewTiebreaker = true
					return "", nil
				}
			}
			return "", fmt.Errorf("unrecognized output file command: %s %v", command, args)
		},
	}
	c.context = &clusterd.Context{Clientset: test.New(t, 5), Executor: executor}
	c.ClusterInfo = clienttest.CreateTestClusterInfo(5)

	t.Run("stretch mode already configured - old", func(t *testing.T) {
		// Reset the flag so this subtest only observes its own calls.
		setNewTiebreaker = false
		c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 6}
		err := c.ConfigureArbiter()
		assert.NoError(t, err)
		assert.False(t, setNewTiebreaker)
	})
	t.Run("stretch mode already configured - new", func(t *testing.T) {
		setNewTiebreaker = false
		c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
		err := c.ConfigureArbiter()
		assert.NoError(t, err)
		assert.False(t, setNewTiebreaker)
	})
	t.Run("tiebreaker changed", func(t *testing.T) {
		setNewTiebreaker = false
		c.ClusterInfo.CephVersion = cephver.CephVersion{Major: 16, Minor: 2, Extra: 7}
		// Desired arbiter differs from the dump's tiebreaker, so the
		// set_new_tiebreaker command must be issued.
		c.arbiterMon = "other"
		err := c.ConfigureArbiter()
		assert.NoError(t, err)
		assert.True(t, setNewTiebreaker)
	})
}

func TestFindAvailableZoneForStretchedMon(t *testing.T) {
c := &Cluster{spec: cephv1.ClusterSpec{
Mon: cephv1.MonSpec{
Expand Down

0 comments on commit 6cd8ad1

Please sign in to comment.