Skip to content

Commit

Permalink
When a dep transitions from done to dirty on a node in error, defer s…
Browse files Browse the repository at this point in the history
…etting the error regardless of whether or not we "should" fail fast.

If we attempt to fail fast, setting the error causes an `IllegalStateException` because the node has not been signaled by the dep that transitioned from done to dirty (the preconditions check for `isReady()` fails). Instead, avoid preventing new evaluations and follow the same strategy as in the `!shouldFailFast` case: allow the dep to finish and signal the failing node.

Although it seems like we should be able to set the error without waiting for the rewound dep to complete, I felt it was safer to maintain the invariants that a) a node's value is only set after it is signaled by all deps and b) `signalDep` is only called with done deps. I did not want to violate one of these two for such an obscure case.

Cleans up the semantics of `maybeHandleRegisteringNewlyDiscoveredDepsForDoneEntry` to only return true if we are in the "done to dirty" (rewinding) situation. Previously, it was also returning true if all newly discovered deps were not yet done in a `--nokeep_going` build (and thus removed). Some sky functions actually do throw an exception even after all newly requested deps are not done, for example `ConfiguredTargetFunction` (I assume it requests more "optional" deps in an attempt to report as many root causes as possible). Since this case is now distinct from the rewinding case, we continue to fail fast.

PiperOrigin-RevId: 479641479
Change-Id: I52aa52dbe3ee104c7293eaa6857e11e44e6245f2
  • Loading branch information
justinhorvitz authored and Copybara-Service committed Oct 7, 2022
1 parent 971e229 commit 7cc4812
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 18 deletions.
Expand Up @@ -592,6 +592,15 @@ public void run() {
// avoiding non-determinism. It's completely reasonable for SkyFunctions to throw eagerly
// because they do not know if they are in keep-going mode.
if (!evaluatorContext.keepGoing() || !env.valuesMissing()) {
if (maybeHandleRegisteringNewlyDiscoveredDepsForDoneEntry(
skyKey, nodeEntry, oldDeps, env, evaluatorContext.keepGoing())) {
// A newly requested dep transitioned from done to dirty before this node finished.
// It is not safe to set the error because the now-dirty dep has not signaled this
// node. We return (without preventing new evaluations) so that the dep can complete
// and signal this node.
return;
}

boolean shouldFailFast =
!evaluatorContext.keepGoing() || builderException.isCatastrophic();
if (shouldFailFast) {
Expand All @@ -607,18 +616,6 @@ public void run() {
"Aborting evaluation while evaluating %s", skyKey);
}
}

if (maybeHandleRegisteringNewlyDiscoveredDepsForDoneEntry(
skyKey, nodeEntry, oldDeps, env, evaluatorContext.keepGoing())) {
// A newly requested dep transitioned from done to dirty before this node finished.
// If shouldFailFast is true, this node won't be signalled by any such newly dirtied
// dep (because new evaluations have been prevented), and this node is responsible for
// throwing the SchedulerException below.
// Otherwise, this node will be signalled again, and so we should return.
if (!shouldFailFast) {
return;
}
}
boolean isTransitivelyTransient =
reifiedBuilderException.isTransient()
|| env.isAnyDirectDepErrorTransitivelyTransient()
Expand Down Expand Up @@ -1050,17 +1047,17 @@ private boolean maybeHandleRegisteringNewlyDiscoveredDepsForDoneEntry(
SkyFunctionEnvironment env,
boolean keepGoing)
throws InterruptedException {
if (env.getNewlyRequestedDeps().isEmpty()) {
return false;
}

// We don't expect any unfinished deps in a keep-going build.
if (!keepGoing) {
env.removeUndoneNewlyRequestedDeps();
}

env.addTemporaryDirectDepsTo(entry);
Set<SkyKey> newDeps = env.getNewlyRequestedDeps();
if (newDeps.isEmpty()) {
return false;
}

env.addTemporaryDirectDepsTo(entry);
Set<SkyKey> newlyAddedNewDeps;
Set<SkyKey> previouslyRegisteredNewDeps;
if (oldDeps.isEmpty()) {
Expand Down Expand Up @@ -1126,7 +1123,7 @@ private boolean maybeHandleRegisteringNewlyDiscoveredDepsForDoneEntry(
}

checkState(
selfSignalled || dirtyDepFound || newDeps.isEmpty(),
selfSignalled || dirtyDepFound,
"%s %s %s %s",
skyKey,
entry,
Expand Down
Expand Up @@ -211,4 +211,9 @@ public void generatedTransitiveHeaderRewound_lostInActionExecution() throws Exce
skipIfBazel();
helper.runGeneratedTransitiveHeaderRewound_lostInActionExecution_spawnFailed();
}

@Test
public void doneToDirtyDepForNodeInError() throws Exception {
helper.runDoneToDirtyDepForNodeInError();
}
}
Expand Up @@ -2515,6 +2515,73 @@ final void runGeneratedTransitiveHeaderRewound_lostInActionExecution(SpawnShim s
assertThat(dirtiedArtifactOwnerLabels(dirtiedKeys)).containsExactly("//genheader:gen_header");
}

/**
* Regression test for b/242179728.
*
* <p>Exercises a scenario where a failing action depends on another action which is rewound
* between the time that the action fails and the dep is looked up for signaling. The order of
* events in this scenario (synchronized so that they execute sequentially) is:
*
* <ol>
* <li>{@code //foo:dep} executes successfully and produces two outputs, {@code dep.out1} and
* {@code dep.out2}.
* <li>{@code //foo:fail}, which depends on {@code dep.out1}, executes and fails due to a
* regular action execution failure (not a lost input).
* <li>{@code //foo:other}, which depends on {@code dep.out2}, executes and observes a lost
* input. {@code //foo:dep} is rewound.
* <li>{@code //foo:fail} looks up {@code //foo:dep} for {@link Reason#RDEP_ADDITION} and
* observes it to be dirty.
* </ol>
*/
public final void runDoneToDirtyDepForNodeInError() throws Exception {
testCase.write(
"foo/BUILD",
"genrule(name = 'other', srcs = [':dep.out2'], outs = ['other.out'], cmd = 'cp $< $@')",
"genrule(name = 'fail', srcs = [':dep.out1'], outs = ['fail.out'], cmd = 'false')",
"genrule(name = 'dep', outs = ['dep.out1', 'dep.out2'], cmd = 'touch $(OUTS)')");
CountDownLatch depDone = new CountDownLatch(1);
CountDownLatch failExecuting = new CountDownLatch(1);
CountDownLatch depRewound = new CountDownLatch(1);
Label fail = Label.parseAbsoluteUnchecked("//foo:fail");
Label dep = Label.parseAbsoluteUnchecked("//foo:dep");
addSpawnShim(
"Executing genrule //foo:fail",
(spawn, context) -> {
failExecuting.countDown();
return ExecResult.delegate();
});
addSpawnShim(
"Executing genrule //foo:other",
(spawn, context) -> {
ImmutableList<ActionInput> lostInputs =
ImmutableList.of(SpawnInputUtils.getInputWithName(spawn, "dep.out2"));
return createLostInputsExecException(
context, lostInputs, new ActionInputDepOwnerMap(lostInputs));
});
injectListenerAtStartOfNextBuild(
(key, type, order, context) -> {
if (isActionExecutionKey(key, fail) && type == EventType.CREATE_IF_ABSENT) {
awaitUninterruptibly(depDone);
} else if (isActionExecutionKey(key, dep)
&& type == EventType.SET_VALUE
&& order == Order.AFTER) {
depDone.countDown();
} else if (isActionExecutionKey(key, dep)
&& type == EventType.ADD_REVERSE_DEP
&& order == Order.BEFORE
&& isActionExecutionKey(context, fail)) {
awaitUninterruptibly(depRewound);
} else if (isActionExecutionKey(key, dep)
&& type == EventType.MARK_DIRTY
&& order == Order.AFTER) {
depRewound.countDown();
}
});

assertThrows(BuildFailedException.class, () -> testCase.buildTarget("//foo:all"));
testCase.assertContainsError("Executing genrule //foo:fail failed");
}

private static boolean isActionExecutionKey(Object key, Label label) {
return key instanceof ActionLookupData && label.equals(((ActionLookupData) key).getLabel());
}
Expand Down

0 comments on commit 7cc4812

Please sign in to comment.