Skip to content

Commit

Permalink
core: rgw: allow specifying daemon startup probes
Browse files Browse the repository at this point in the history
Allow specifying daemon startup probes where we also allow configuring
liveness probes. Startup probes allow Rook to tolerate when Ceph daemons
occasionally take a long time to start up while not also making
Kubernetes liveness probes slower to detect runtime failures of daemons.

Startup probes are beta in Kubernetes 1.18, so we should not enable
startup probes by default on earlier Kubernetes versions.

Signed-off-by: Blaine Gardner <blaine.gardner@redhat.com>
  • Loading branch information
BlaineEXE committed Dec 21, 2021
1 parent 517aff4 commit 9c6c8a9
Show file tree
Hide file tree
Showing 14 changed files with 641 additions and 29 deletions.
197 changes: 196 additions & 1 deletion deploy/charts/rook-ceph/templates/resources.yaml

Large diffs are not rendered by default.

197 changes: 196 additions & 1 deletion deploy/examples/crds.yaml

Large diffs are not rendered by default.

Expand Up @@ -20,6 +20,10 @@ import (
corev1 "k8s.io/api/core/v1"
)

/*
* Liveness probes
*/

// GetMonLivenessProbe returns the liveness probe for the MON service
func GetMonLivenessProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
return l.LivenessProbe[ResourcesKeyMon].Probe
Expand All @@ -39,3 +43,27 @@ func GetOSDLivenessProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
// GetMdsLivenessProbe returns the liveness probe for the MDS service.
// Returns nil when no probe is configured for the MDS daemon.
func GetMdsLivenessProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
	// Guard against a nil *ProbeSpec map entry (e.g. `mds: null` in the CR),
	// which would otherwise cause a nil pointer dereference.
	if spec := l.LivenessProbe[ResourcesKeyMDS]; spec != nil {
		return spec.Probe
	}
	return nil
}

/*
* Startup probes
*/

// GetMonStartupProbe returns the startup probe for the MON service.
// Returns nil when no probe is configured for the MON daemon.
func GetMonStartupProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
	// Guard against a nil *ProbeSpec map entry (e.g. `mon: null` in the CR),
	// which would otherwise cause a nil pointer dereference.
	if spec := l.StartupProbe[ResourcesKeyMon]; spec != nil {
		return spec.Probe
	}
	return nil
}

// GetMgrStartupProbe returns the startup probe for the MGR service.
// Returns nil when no probe is configured for the MGR daemon.
func GetMgrStartupProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
	// Guard against a nil *ProbeSpec map entry (e.g. `mgr: null` in the CR),
	// which would otherwise cause a nil pointer dereference.
	if spec := l.StartupProbe[ResourcesKeyMgr]; spec != nil {
		return spec.Probe
	}
	return nil
}

// GetOSDStartupProbe returns the startup probe for the OSD service.
// Returns nil when no probe is configured for the OSD daemon.
func GetOSDStartupProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
	// Guard against a nil *ProbeSpec map entry (e.g. `osd: null` in the CR),
	// which would otherwise cause a nil pointer dereference.
	if spec := l.StartupProbe[ResourcesKeyOSD]; spec != nil {
		return spec.Probe
	}
	return nil
}

// GetMdsStartupProbe returns the startup probe for the MDS service.
// Returns nil when no probe is configured for the MDS daemon.
func GetMdsStartupProbe(l CephClusterHealthCheckSpec) *corev1.Probe {
	// Guard against a nil *ProbeSpec map entry (e.g. `mds: null` in the CR),
	// which would otherwise cause a nil pointer dereference.
	if spec := l.StartupProbe[ResourcesKeyMDS]; spec != nil {
		return spec.Probe
	}
	return nil
}
7 changes: 6 additions & 1 deletion pkg/apis/ceph.rook.io/v1/types.go
Expand Up @@ -59,9 +59,12 @@ type CephClusterHealthCheckSpec struct {
// +optional
// +nullable
DaemonHealth DaemonHealthSpec `json:"daemonHealth,omitempty"`
// LivenessProbe allows to change the livenessprobe configuration for a given daemon
// LivenessProbe allows changing the livenessProbe configuration for a given daemon
// +optional
LivenessProbe map[KeyType]*ProbeSpec `json:"livenessProbe,omitempty"`
// StartupProbe allows changing the startupProbe configuration for a given daemon
// +optional
StartupProbe map[KeyType]*ProbeSpec `json:"startupProbe,omitempty"`
}

// DaemonHealthSpec is a daemon health check
Expand Down Expand Up @@ -1308,6 +1311,8 @@ type BucketHealthCheckSpec struct {
LivenessProbe *ProbeSpec `json:"livenessProbe,omitempty"`
// +optional
ReadinessProbe *ProbeSpec `json:"readinessProbe,omitempty"`
// +optional
StartupProbe *ProbeSpec `json:"startupProbe,omitempty"`
}

// HealthCheckSpec represents the health check of an object store bucket
Expand Down
20 changes: 20 additions & 0 deletions pkg/apis/ceph.rook.io/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pkg/operator/ceph/cluster/mgr/spec.go
Expand Up @@ -184,11 +184,12 @@ func (c *Cluster) makeMgrDaemonContainer(mgrConfig *mgrConfig) v1.Container {
),
Resources: cephv1.GetMgrResources(c.spec.Resources),
SecurityContext: controller.PodSecurityContext(),
StartupProbe: controller.GenerateStartupProbeExecDaemon(config.MgrType, mgrConfig.DaemonID),
LivenessProbe: controller.GenerateLivenessProbeExecDaemon(config.MgrType, mgrConfig.DaemonID),
WorkingDir: config.VarLogCephDir,
}

// Apply any user-configured overrides to the startup and liveness probes
container = config.ConfigureStartupProbe(cephv1.KeyMgr, container, c.spec.HealthCheck)
container = config.ConfigureLivenessProbe(cephv1.KeyMgr, container, c.spec.HealthCheck)

// If host networking is enabled, we don't need a bind addr that is different from the public addr
Expand Down
3 changes: 2 additions & 1 deletion pkg/operator/ceph/cluster/mon/spec.go
Expand Up @@ -313,6 +313,7 @@ func (c *Cluster) makeMonDaemonContainer(monConfig *monConfig) corev1.Container
k8sutil.PodIPEnvVar(podIPEnvVar),
),
Resources: cephv1.GetMonResources(c.spec.Resources),
StartupProbe: controller.GenerateStartupProbeExecDaemon(config.MonType, monConfig.DaemonName),
LivenessProbe: controller.GenerateLivenessProbeExecDaemon(config.MonType, monConfig.DaemonName),
WorkingDir: config.VarLogCephDir,
}
Expand All @@ -326,7 +327,7 @@ func (c *Cluster) makeMonDaemonContainer(monConfig *monConfig) corev1.Container
}
}

// Apply any user-configured overrides to the startup and liveness probes
container = config.ConfigureStartupProbe(cephv1.KeyMon, container, c.spec.HealthCheck)
container = config.ConfigureLivenessProbe(cephv1.KeyMon, container, c.spec.HealthCheck)

// If host networking is enabled, we don't need a bind addr that is different from the public addr
Expand Down
3 changes: 2 additions & 1 deletion pkg/operator/ceph/cluster/osd/spec.go
Expand Up @@ -552,6 +552,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
Env: envVars,
Resources: osdProps.resources,
SecurityContext: securityContext,
StartupProbe: controller.GenerateStartupProbeExecDaemon(opconfig.OsdType, osdID),
LivenessProbe: controller.GenerateLivenessProbeExecDaemon(opconfig.OsdType, osdID),
WorkingDir: opconfig.VarLogCephDir,
},
Expand All @@ -571,7 +572,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
podTemplateSpec.Spec.Containers = append(podTemplateSpec.Spec.Containers, *controller.LogCollectorContainer(fmt.Sprintf("ceph-osd.%s", osdID), c.clusterInfo.Namespace, c.spec))
}

// Apply any user-configured overrides to the startup and liveness probes
podTemplateSpec.Spec.Containers[0] = opconfig.ConfigureStartupProbe(cephv1.KeyOSD, podTemplateSpec.Spec.Containers[0], c.spec.HealthCheck)
podTemplateSpec.Spec.Containers[0] = opconfig.ConfigureLivenessProbe(cephv1.KeyOSD, podTemplateSpec.Spec.Containers[0], c.spec.HealthCheck)

if c.spec.Network.IsHost() {
Expand Down
26 changes: 26 additions & 0 deletions pkg/operator/ceph/config/livenessprobe.go
Expand Up @@ -51,6 +51,32 @@ func ConfigureLivenessProbe(daemon cephv1.KeyType, container v1.Container, healt
return container
}

// ConfigureStartupProbe returns the container with its startup probe adjusted
// according to the user's healthCheck spec for the given daemon type. The
// probe may be disabled entirely, or user-specified fields may override the
// default probe already set on the container; with no user configuration the
// container is returned unchanged.
func ConfigureStartupProbe(daemon cephv1.KeyType, container v1.Container, healthCheck cephv1.CephClusterHealthCheckSpec) v1.Container {
	// Map of functions that extract the user-configured startup probe for each daemon type
	probeFnMap := map[cephv1.KeyType]fn{
		cephv1.KeyMon: cephv1.GetMonStartupProbe,
		cephv1.KeyMgr: cephv1.GetMgrStartupProbe,
		cephv1.KeyOSD: cephv1.GetOSDStartupProbe,
		cephv1.KeyMds: cephv1.GetMdsStartupProbe,
	}

	// Single map lookup instead of indexing the map twice. Also guard against
	// a nil *ProbeSpec entry (e.g. `startupProbe: {mon: null}` in the CR),
	// which would otherwise cause a nil pointer dereference.
	probeSpec, ok := healthCheck.StartupProbe[daemon]
	if !ok || probeSpec == nil {
		return container
	}

	if probeSpec.Disabled {
		// The user explicitly disabled the probe; remove Rook's default.
		container.StartupProbe = nil
		return container
	}

	// Guard the function lookup so an unexpected daemon key cannot cause a
	// nil-function call panic.
	if getProbe, known := probeFnMap[daemon]; known {
		// If the spec value is not empty, apply it along with defaults for any
		// fields the user did not specify.
		if probe := getProbe(healthCheck); probe != nil {
			// Overwrite the default probe created by Rook with the merged probe
			container.StartupProbe = GetProbeWithDefaults(probe, container.StartupProbe)
		}
	}

	return container
}

func GetProbeWithDefaults(desiredProbe, currentProbe *v1.Probe) *v1.Probe {
newProbe := *desiredProbe

Expand Down
92 changes: 90 additions & 2 deletions pkg/operator/ceph/config/livenessprobe_test.go
Expand Up @@ -62,8 +62,8 @@ func configLivenessProbeHelper(t *testing.T, keyType cephv1.KeyType) {
args args
want v1.Container
}{
{"probe-enabled", args{keyType, container, cephv1.CephClusterHealthCheckSpec{}}, container},
{"probe-disabled", args{keyType, container, cephv1.CephClusterHealthCheckSpec{LivenessProbe: l}}, v1.Container{}},
{string(keyType) + "_probe-enabled", args{keyType, container, cephv1.CephClusterHealthCheckSpec{}}, container},
{string(keyType) + "_probe-disabled", args{keyType, container, cephv1.CephClusterHealthCheckSpec{LivenessProbe: l}}, v1.Container{}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand All @@ -74,6 +74,94 @@ func configLivenessProbeHelper(t *testing.T, keyType cephv1.KeyType) {
}
}

// TestConfigureStartupProbe checks startup probe configuration for each daemon
// type and verifies that user-supplied probe settings override Rook's defaults
// while the probe handler itself remains the Rook-generated one.
func TestConfigureStartupProbe(t *testing.T) {
	for _, daemon := range []cephv1.KeyType{
		cephv1.KeyMds,
		cephv1.KeyMon,
		cephv1.KeyMgr,
		cephv1.KeyOSD,
	} {
		configStartupProbeHelper(t, daemon)
	}

	t.Run("integration check: configured probes should override values", func(t *testing.T) {
		// Probe that stands in for the default one Rook puts on the container.
		rookDefault := &v1.Probe{
			Handler: v1.Handler{
				HTTPGet: &v1.HTTPGetAction{
					Path: "/",
					Port: intstr.FromInt(8443),
				},
			},
		}
		// Probe with every tunable field set, as a user would configure it.
		override := &v1.Probe{
			Handler: v1.Handler{
				HTTPGet: &v1.HTTPGetAction{
					Path: "/custom/path",
					Port: intstr.FromInt(8080),
				},
			},
			InitialDelaySeconds: 999,
			TimeoutSeconds:      888,
			PeriodSeconds:       777,
			SuccessThreshold:    666,
			FailureThreshold:    555,
		}

		spec := cephv1.CephClusterHealthCheckSpec{
			StartupProbe: map[cephv1.KeyType]*cephv1.ProbeSpec{
				cephv1.KeyMon: {
					Disabled: false,
					Probe:    override,
				},
			},
		}

		result := ConfigureStartupProbe(cephv1.KeyMon, v1.Container{StartupProbe: rookDefault}, spec)

		// All scalar fields should come from the user's probe, but the handler
		// must always remain the Rook-given default.
		want := *override
		want.Handler = rookDefault.Handler
		assert.Equal(t, &want, result.StartupProbe)
	})
}

// configStartupProbeHelper runs the basic enabled/disabled startup probe
// cases for a single daemon type.
func configStartupProbeHelper(t *testing.T, keyType cephv1.KeyType) {
	probe := &v1.Probe{
		Handler: v1.Handler{
			HTTPGet: &v1.HTTPGetAction{
				Path: "/",
				Port: intstr.FromInt(8080),
			},
		},
	}
	withProbe := v1.Container{StartupProbe: probe}
	disabled := map[cephv1.KeyType]*cephv1.ProbeSpec{keyType: {Disabled: true}}

	cases := []struct {
		name        string
		healthCheck cephv1.CephClusterHealthCheckSpec
		want        v1.Container
	}{
		// No user configuration: the container's probe is left untouched.
		{string(keyType) + "_probe-enabled", cephv1.CephClusterHealthCheckSpec{}, withProbe},
		// Probe disabled by the user: the probe is cleared from the container.
		{string(keyType) + "_probe-disabled", cephv1.CephClusterHealthCheckSpec{StartupProbe: disabled}, v1.Container{}},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := ConfigureStartupProbe(keyType, withProbe, tc.healthCheck)
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("ConfigureStartupProbe() = %v, want %v", got, tc.want)
			}
		})
	}
}

func TestGetProbeWithDefaults(t *testing.T) {
t.Run("using default probe", func(t *testing.T) {
currentProb := &v1.Probe{
Expand Down
54 changes: 36 additions & 18 deletions pkg/operator/ceph/controller/spec.go
Expand Up @@ -43,18 +43,19 @@ import (
const (
// ConfigInitContainerName is the name which is given to the config initialization container
// in all Ceph pods.
ConfigInitContainerName = "config-init"
logVolumeName = "rook-ceph-log"
volumeMountSubPath = "data"
crashVolumeName = "rook-ceph-crash"
daemonSocketDir = "/run/ceph"
initialDelaySecondsNonOSDDaemon int32 = 10
initialDelaySecondsOSDDaemon int32 = 45
logCollector = "log-collector"
DaemonIDLabel = "ceph_daemon_id"
daemonTypeLabel = "ceph_daemon_type"
ExternalMgrAppName = "rook-ceph-mgr-external"
ServiceExternalMetricName = "http-external-metrics"
ConfigInitContainerName = "config-init"
logVolumeName = "rook-ceph-log"
volumeMountSubPath = "data"
crashVolumeName = "rook-ceph-crash"
daemonSocketDir = "/run/ceph"
livenessProbeInitialDelaySeconds int32 = 10
startupProbeFailuresDaemonDefault int32 = 6 // multiply by 10 = effective startup timeout
startupProbeFailuresDaemonOSD int32 = 9 // multiply by 10 = effective startup timeout
logCollector = "log-collector"
DaemonIDLabel = "ceph_daemon_id"
daemonTypeLabel = "ceph_daemon_type"
ExternalMgrAppName = "rook-ceph-mgr-external"
ServiceExternalMetricName = "http-external-metrics"
)

type daemonConfig struct {
Expand Down Expand Up @@ -554,13 +555,10 @@ func StoredLogAndCrashVolumeMount(varLogCephDir, varLibCephCrashDir string) []v1
}
}

// GenerateLivenessProbeExecDaemon makes sure a daemon has a socket and that it can be called and returns 0
// GenerateLivenessProbeExecDaemon generates a liveness probe that makes sure a daemon has a socket,
// that it can be called, and that it returns 0
func GenerateLivenessProbeExecDaemon(daemonType, daemonID string) *v1.Probe {
confDaemon := getDaemonConfig(daemonType, daemonID)
initialDelaySeconds := initialDelaySecondsNonOSDDaemon
if daemonType == config.OsdType {
initialDelaySeconds = initialDelaySecondsOSDDaemon
}

return &v1.Probe{
Handler: v1.Handler{
Expand All @@ -579,8 +577,28 @@ func GenerateLivenessProbeExecDaemon(daemonType, daemonID string) *v1.Probe {
},
},
},
InitialDelaySeconds: initialDelaySeconds,
InitialDelaySeconds: livenessProbeInitialDelaySeconds,
}
}

// GenerateStartupProbeExecDaemon generates a startup probe that makes sure a
// daemon has a socket, that it can be called, and that it returns 0.
func GenerateStartupProbeExecDaemon(daemonType, daemonID string) *v1.Probe {
	// The startup probe reuses the liveness probe's exec handler; only the
	// timing and failure thresholds differ.
	probe := GenerateLivenessProbeExecDaemon(daemonType, daemonID)

	// Delay and period are pinned at 10 seconds so the effective startup
	// timeout is simply FailureThreshold multiplied by 10.
	probe.InitialDelaySeconds = 10
	probe.PeriodSeconds = 10

	// OSD daemons are allowed more probe failures (i.e. more time) to start.
	failureThreshold := startupProbeFailuresDaemonDefault
	if daemonType == config.OsdType {
		failureThreshold = startupProbeFailuresDaemonOSD
	}
	probe.FailureThreshold = failureThreshold

	return probe
}

func getDaemonConfig(daemonType, daemonID string) *daemonConfig {
Expand Down
5 changes: 2 additions & 3 deletions pkg/operator/ceph/controller/spec_test.go
Expand Up @@ -158,12 +158,11 @@ func TestGenerateLivenessProbeExecDaemon(t *testing.T) {
}

assert.Equal(t, expectedCommand, probe.Handler.Exec.Command)
// it's an OSD the delay must be 45
assert.Equal(t, initialDelaySecondsOSDDaemon, probe.InitialDelaySeconds)
assert.Equal(t, livenessProbeInitialDelaySeconds, probe.InitialDelaySeconds)

// test with a mon so the delay should be 10
probe = GenerateLivenessProbeExecDaemon(config.MonType, "a")
assert.Equal(t, initialDelaySecondsNonOSDDaemon, probe.InitialDelaySeconds)
assert.Equal(t, livenessProbeInitialDelaySeconds, probe.InitialDelaySeconds)
}

func TestDaemonFlags(t *testing.T) {
Expand Down
2 changes: 2 additions & 0 deletions pkg/operator/ceph/file/mds/spec.go
Expand Up @@ -44,6 +44,7 @@ const (
func (c *Cluster) makeDeployment(mdsConfig *mdsConfig, namespace string) (*apps.Deployment, error) {

mdsContainer := c.makeMdsDaemonContainer(mdsConfig)
mdsContainer = config.ConfigureStartupProbe(cephv1.KeyMds, mdsContainer, c.clusterSpec.HealthCheck)
mdsContainer = config.ConfigureLivenessProbe(cephv1.KeyMds, mdsContainer, c.clusterSpec.HealthCheck)

podSpec := v1.PodTemplateSpec{
Expand Down Expand Up @@ -147,6 +148,7 @@ func (c *Cluster) makeMdsDaemonContainer(mdsConfig *mdsConfig) v1.Container {
Env: append(controller.DaemonEnvVars(c.clusterSpec.CephVersion.Image), k8sutil.PodIPEnvVar(podIPEnvVar)),
Resources: c.fs.Spec.MetadataServer.Resources,
SecurityContext: controller.PodSecurityContext(),
StartupProbe: controller.GenerateStartupProbeExecDaemon(config.MdsType, mdsConfig.DaemonID),
LivenessProbe: controller.GenerateLivenessProbeExecDaemon(config.MdsType, mdsConfig.DaemonID),
WorkingDir: config.VarLogCephDir,
}
Expand Down

0 comments on commit 9c6c8a9

Please sign in to comment.