Merge pull request #53857 from derekwaynecarr/sync-event
Automatic merge from submit-queue (batch tested with PRs 51840, 53542, 53857, 53831, 53702). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

kubelet sync pod throws more detailed events

**What this PR does / why we need it**:
When errors occur in the kubelet's sync pod iteration, it is difficult to determine the underlying problem.

This PR dispatches more specific events for the errors that can occur in the syncPod iteration, which helps isolate the problem.

Fixes #53900
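
For illustration, here is a minimal client-go sketch of how an operator could take advantage of the more specific reasons when isolating a failing pod, by listing only the warning events with a given reason. The function and variable names, the target namespace, and the 2017-era `List(metav1.ListOptions{...})` signature (no context argument) are assumptions for the example, not part of this PR.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// listKillFailures prints warning events whose reason is FailedKillPod,
// one of the more specific reasons introduced by this PR.
func listKillFailures(clientset *kubernetes.Clientset, namespace string) error {
	events, err := clientset.CoreV1().Events(namespace).List(metav1.ListOptions{
		FieldSelector: "reason=FailedKillPod,type=Warning",
	})
	if err != nil {
		return err
	}
	for _, e := range events.Items {
		fmt.Printf("%s/%s: %s\n", e.InvolvedObject.Namespace, e.InvolvedObject.Name, e.Message)
	}
	return nil
}

func main() {
	// Assumes a kubeconfig at the default location (~/.kube/config).
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	if err := listKillFailures(clientset, "default"); err != nil {
		panic(err)
	}
}
```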

**Special notes for your reviewer**:
Dispatching more specific events is safer now that we have an event budget enforced per object via #47367.
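
The budget from #47367 is, conceptually, a token bucket keyed by the event source and the involved object, so repeated warnings for one pod are eventually dropped instead of flooding the API server. The sketch below is not the actual client-go spam filter; it is a simplified illustration of that per-object idea, and the key format, QPS, and burst parameters are assumptions.

```go
package spamfilter

import (
	"sync"

	"k8s.io/client-go/util/flowcontrol"
)

// objectBudget enforces a simple per-object event budget: each object key owns
// a token bucket, and events are dropped once that bucket is empty.
type objectBudget struct {
	mu      sync.Mutex
	qps     float32
	burst   int
	buckets map[string]flowcontrol.RateLimiter
}

func newObjectBudget(qps float32, burst int) *objectBudget {
	return &objectBudget{
		qps:     qps,
		burst:   burst,
		buckets: map[string]flowcontrol.RateLimiter{},
	}
}

// Allow reports whether an event for the given object key
// (for example "Pod/default/nginx") still fits within that object's budget.
func (b *objectBudget) Allow(objectKey string) bool {
	b.mu.Lock()
	defer b.mu.Unlock()
	limiter, ok := b.buckets[objectKey]
	if !ok {
		limiter = flowcontrol.NewTokenBucketRateLimiter(b.qps, b.burst)
		b.buckets[objectKey] = limiter
	}
	return limiter.TryAccept()
}
```

With a budget like this in front of the recorder, the more specific warnings added by this PR can only consume a bounded number of events per pod.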

**Release note**:
```release-note
kubelet provides more specific events when unable to sync pod
```
Kubernetes Submit Queue committed Oct 13, 2017
2 parents (1ee617c + 5422460), commit e6e23ae
Showing 5 changed files with 28 additions and 7 deletions.
7 changes: 7 additions & 0 deletions pkg/kubelet/events/event.go
@@ -27,6 +27,12 @@ const (
BackOffStartContainer = "BackOff"
ExceededGracePeriod = "ExceededGracePeriod"

// Pod event reason list
FailedToKillPod = "FailedKillPod"
FailedToCreatePodContainer = "FailedCreatePodContainer"
FailedToMakePodDataDirectories = "Failed"
NetworkNotReady = "NetworkNotReady"

// Image event reason list
PullingImage = "Pulling"
PulledImage = "Pulled"
@@ -65,6 +71,7 @@ const (
UnsupportedMountOption = "UnsupportedMountOption"
SandboxChanged = "SandboxChanged"
FailedCreatePodSandBox = "FailedCreatePodSandBox"
FailedStatusPodSandBox = "FailedPodSandBoxStatus"

// Image manager event reason list
InvalidDiskCapacity = "InvalidDiskCapacity"
11 changes: 11 additions & 0 deletions pkg/kubelet/kubelet.go
@@ -1474,6 +1474,10 @@ func (kl *Kubelet) GetClusterDNS(pod *v1.Pod) ([]string, []string, bool, error)
//
// If any step of this workflow errors, the error is returned, and is repeated
// on the next syncPod call.
//
// This operation writes all events that are dispatched in order to provide
// the most accurate information possible about an error situation to aid debugging.
// Callers should not throw an event if this operation returns an error.
func (kl *Kubelet) syncPod(o syncPodOptions) error {
// pull out the required options
pod := o.pod
@@ -1491,6 +1495,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
kl.statusManager.SetPodStatus(pod, apiPodStatus)
// we kill the pod with the specified grace period since this is a termination
if err := kl.killPod(pod, nil, podStatus, killPodOptions.PodTerminationGracePeriodSecondsOverride); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
// there was an error killing the pod, so we return that error directly
utilruntime.HandleError(err)
return err
@@ -1557,6 +1562,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed {
var syncErr error
if err := kl.killPod(pod, nil, podStatus, nil); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
syncErr = fmt.Errorf("error killing pod: %v", err)
utilruntime.HandleError(syncErr)
} else {
@@ -1571,6 +1577,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {

// If the network plugin is not ready, only start the pod if it uses the host network
if rs := kl.runtimeState.networkErrors(); len(rs) != 0 && !kubecontainer.IsHostNetworkPod(pod) {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "network is not ready: %v", rs)
return fmt.Errorf("network is not ready: %v", rs)
}

@@ -1613,6 +1620,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
glog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err)
}
if err := pcm.EnsureExists(pod); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err)
return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
}
}
@@ -1650,6 +1658,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {

// Make data directories for the pod
if err := kl.makePodDataDirs(pod); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err)
glog.Errorf("Unable to make pod data directories for pod %q: %v", format.Pod(pod), err)
return err
}
@@ -1671,6 +1680,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
result := kl.containerRuntime.SyncPod(pod, apiPodStatus, podStatus, pullSecrets, kl.backOff)
kl.reasonCache.Update(pod.UID, result)
if err := result.Error(); err != nil {
// Do not record an event here, as we keep all event logging for sync pod failures
// local to container runtime so we get better errors
return err
}

1 change: 1 addition & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_container.go
@@ -89,6 +89,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
// Step 1: pull the image.
imageRef, msg, err := m.imagePuller.EnsureImageExists(pod, container, pullSecrets)
if err != nil {
m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
return msg, err
}

5 changes: 5 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -642,6 +642,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStat

podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
if err != nil {
ref, err := ref.GetReference(api.Scheme, pod)
if err != nil {
glog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), err)
}
m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedStatusPodSandBox, "Unable to get pod sandbox status: %v", err)
glog.Errorf("Failed to get pod sandbox status: %v; Skipping pod %q", err, format.Pod(pod))
result.Fail(err)
return
11 changes: 4 additions & 7 deletions pkg/kubelet/pod_workers.go
@@ -162,6 +162,9 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
// the previous sync.
status, err := p.podCache.GetNewerThan(podUID, lastSyncTime)
if err != nil {
// This is the legacy event thrown by the manage pod loop;
// all other events are now dispatched from syncPodFn.
p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err)
return err
}
err = p.syncPodFn(syncPodOptions{
@@ -179,14 +182,8 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
update.OnCompleteFunc(err)
}
if err != nil {
// IMPORTANT: we do not log errors here, the syncPodFn is responsible for logging errors
glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err)
// if we failed sync, we throw more specific events for why it happened.
// as a result, I question the value of this event.
// TODO: determine if we can remove this in a future release.
// do not include descriptive text that can vary on why it failed, so that in a
// pathological scenario the kubelet does not create enough discrete events to miss
// the default aggregation window.
p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod")
}
p.wrapUp(update.Pod.UID, err)
}
