Merge pull request #124570 from neolit123/automated-cherry-pick-of-#124503-origin-release-1.30

Automated cherry pick of #124503: kubeadm: check for available nodes during 'CreateJob'
k8s-ci-robot committed May 10, 2024
2 parents 65921f4 + 1410806 commit 630266c
Showing 1 changed file with 37 additions and 13 deletions.
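The change adds a guard before the health-check Job is created: list at most one Node that is not cordoned (field selector spec.unschedulable=false) and skip the 'CreateJob' check entirely when nothing comes back. The snippet below is a minimal, standalone client-go sketch of that idea only; it is not the patched kubeadm code, and the in-cluster config, the hasSchedulableNode helper, and the standalone main are assumptions made for the example.

package main

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// hasSchedulableNode lists at most one Node that is not cordoned
// (spec.unschedulable=false) and reports whether the cluster has
// anywhere to place the health-check Job's Pod.
func hasSchedulableNode(client kubernetes.Interface, timeout time.Duration) (bool, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{
		Limit:         1,
		FieldSelector: "spec.unschedulable=false",
	})
	if err != nil {
		return false, err
	}
	return len(nodes.Items) > 0, nil
}

func main() {
	cfg, err := rest.InClusterConfig() // assumption: running inside a cluster
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	ok, err := hasSchedulableNode(client, 15*time.Second)
	if err != nil {
		panic(err)
	}
	if !ok {
		fmt.Println("no schedulable Nodes: a CreateJob-style check would be skipped")
		return
	}
	fmt.Println("at least one schedulable Node: safe to create the health-check Job")
}

In the patch itself, the same List call is wrapped in wait.PollUntilContextTimeout so transient API errors are retried for up to 15 seconds, and the check is skipped with a warning when the resulting list is empty.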
cmd/kubeadm/app/phases/upgrade/health.go: 37 additions & 13 deletions
@@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
}

// CheckClusterHealth makes sure:
-// - the API /healthz endpoint is healthy
+// - the cluster can accept a workload
// - all control-plane Nodes are Ready
// - (if static pod-hosted) that all required Static Pod manifests exist on disk
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
@@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
}

// createJob is a check that verifies that a Job can be created in the cluster
-func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
+func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
const (
-prefix = "upgrade-health-check"
-ns = metav1.NamespaceSystem
-timeout = 15 * time.Second
+prefix = "upgrade-health-check"
+fieldSelector = "spec.unschedulable=false"
+ns = metav1.NamespaceSystem
+timeout = 15 * time.Second
)
+var (
+err, lastError error
+ctx = context.Background()
+nodes *v1.NodeList
+listOptions = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
+)

// If client.Discovery().RESTClient() is nil, the fake client is used.
@@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
return nil
}

+// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
+err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
+if err != nil {
+klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
+lastError = err
+return false, nil
+}
+return true, nil
+})
+if err != nil {
+return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
+}
+
+if len(nodes.Items) == 0 {
+klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
+return nil
+}
+
// Prepare Job
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
@@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
},
Spec: batchv1.JobSpec{
BackoffLimit: ptr.To[int32](0),
-TTLSecondsAfterFinished: ptr.To[int32](2),
+TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
@@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
},
}

-ctx := context.Background()
-
// Create the Job, but retry if it fails
klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
var jobName string
-err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{})
+err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
if err != nil {
klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
lastError = err
@@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
}

// Wait for the Job to complete
-err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
+err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil {
lastError = err
klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
@@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
selectorControlPlane := labels.SelectorFromSet(map[string]string{
constants.LabelNodeRoleControlPlane: "",
})
-nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
+nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
LabelSelector: selectorControlPlane.String(),
})
if err != nil {

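Both retry loops in the change use the same wait.PollUntilContextTimeout idiom: the condition returns (false, nil) to keep polling while remembering the most recent failure in lastError, and that lastError is surfaced once the poll finally times out. The standalone snippet below illustrates only that idiom; the simulated transient failure and the attempt counter are invented for the example and are not part of the kubeadm code.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	var lastError error
	attempts := 0

	// Poll once per second for up to 15 seconds, as the health check does.
	// Returning (false, nil) keeps polling; the latest failure is kept in
	// lastError so it can be reported if the poll times out.
	err := wait.PollUntilContextTimeout(context.Background(), time.Second, 15*time.Second, true,
		func(_ context.Context) (bool, error) {
			attempts++
			if attempts < 3 { // stand-in for a transient API error
				lastError = errors.New("transient failure")
				return false, nil
			}
			return true, nil
		})
	if err != nil {
		// On timeout, surface the underlying error instead of the generic context error.
		fmt.Println("gave up:", lastError)
		return
	}
	fmt.Printf("succeeded after %d attempts\n", attempts)
}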