Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated cherry pick of #124503: kubeadm: check for available nodes during 'CreateJob' #124570

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
50 changes: 37 additions & 13 deletions cmd/kubeadm/app/phases/upgrade/health.go
Expand Up @@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
}

// CheckClusterHealth makes sure:
// - the API /healthz endpoint is healthy
// - the cluster can accept a workload
// - all control-plane Nodes are Ready
// - (if static pod-hosted) that all required Static Pod manifests exist on disk
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
Expand Down Expand Up @@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
}

// createJob is a check that verifies that a Job can be created in the cluster
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
const (
prefix = "upgrade-health-check"
ns = metav1.NamespaceSystem
timeout = 15 * time.Second
prefix = "upgrade-health-check"
fieldSelector = "spec.unschedulable=false"
ns = metav1.NamespaceSystem
timeout = 15 * time.Second
)
var (
err, lastError error
ctx = context.Background()
nodes *v1.NodeList
listOptions = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
)

// If client.Discovery().RESTClient() is nil, the fake client is used.
Expand All @@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
return nil
}

// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
if err != nil {
klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
lastError = err
return false, nil
}
return true, nil
})
if err != nil {
return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
}

if len(nodes.Items) == 0 {
klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
return nil
}

// Prepare Job
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
},
Spec: batchv1.JobSpec{
BackoffLimit: ptr.To[int32](0),
TTLSecondsAfterFinished: ptr.To[int32](2),
TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Expand All @@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
},
}

ctx := context.Background()

// Create the Job, but retry if it fails
klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
var jobName string
err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{})
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
if err != nil {
klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
lastError = err
Expand All @@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
}

// Wait for the Job to complete
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil {
lastError = err
klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
Expand Down Expand Up @@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
selectorControlPlane := labels.SelectorFromSet(map[string]string{
constants.LabelNodeRoleControlPlane: "",
})
nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
LabelSelector: selectorControlPlane.String(),
})
if err != nil {
Expand Down