Skip to content

Commit

Permalink
grpclb: include fallback reason in error status of failing to fallback (
Browse files Browse the repository at this point in the history
#8035)

Enhance the error information reflected in the RPC status when fallback fails (i.e., no fallback addresses are provided by the resolver) by including the original cause of entering fallback. The cases in which we fall back include:

  - When the fallback timer fires before we have received the first response from the balancer.
     - If no fallback addresses are found, RPCs will be failed with status {UNAVAILABLE, description="Unable to fallback, no fallback addresses found\n Timeout waiting for remote balancer", cause=null}
  - When the balancer RPC finishes before receiving any backend addresses
     - If no fallback addresses are found, RPCs will be failed with status {UNAVAILABLE, description="Unable to fallback, no fallback addresses found\n <description from the status of balancer RPC>", cause=<cause from the status of balancer RPC>}
  - When we get an explicit response from the balancer telling us to go into fallback
     - If no fallback addresses are found, RPCs will be failed with status {UNAVAILABLE, description="Unable to fallback, no fallback addresses found\n Fallback requested by balancer", cause=null}
  - When the balancer call has finished *and* we cannot connect to any of the backends in the last response we received from the balancer.
     - Whichever of the two happened last is the reason that triggers entering fallback. If no fallback addresses are found, RPCs will be failed with status {UNAVAILABLE, description="Unable to fallback, no fallback addresses found\n <description from the status of balancer RPC>", cause=<cause from the status of balancer RPC>} or {UNAVAILABLE, description="Unable to fallback, no fallback addresses found\n <description from the status of one of the broken subchannels>", cause=<cause from the status of one of the broken subchannels>}

Note that all RPCs will fail with the UNAVAILABLE status code; the fallback reason will be attached as the description and cause (if any).
  • Loading branch information
voidzcy committed Apr 8, 2021
1 parent 239db4b commit b956f88
Show file tree
Hide file tree
Showing 2 changed files with 202 additions and 25 deletions.
76 changes: 59 additions & 17 deletions grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java
Expand Up @@ -95,8 +95,20 @@ final class GrpclbState {
static final Status NO_AVAILABLE_BACKENDS_STATUS =
Status.UNAVAILABLE.withDescription("LoadBalancer responded without any backends");
@VisibleForTesting
static final Status NO_FALLBACK_BACKENDS_FOUND_STATUS =
static final Status BALANCER_TIMEOUT_STATUS =
Status.UNAVAILABLE.withDescription("Timeout waiting for remote balancer");
@VisibleForTesting
static final Status BALANCER_REQUESTED_FALLBACK_STATUS =
Status.UNAVAILABLE.withDescription("Fallback requested by balancer");
@VisibleForTesting
static final Status NO_FALLBACK_BACKENDS_STATUS =
Status.UNAVAILABLE.withDescription("Unable to fallback, no fallback addresses found");
// This error status should never be propagated to RPC failures, as "no backend or balancer
// addresses found" should be directly handled as a name resolution error. So in cases of no
// balancer address, fallback should never fail.
private static final Status NO_LB_ADDRESS_PROVIDED_STATUS =
Status.UNAVAILABLE.withDescription("No balancer address found");


@VisibleForTesting
static final RoundRobinEntry BUFFER_ENTRY = new RoundRobinEntry() {
Expand Down Expand Up @@ -137,6 +149,10 @@ enum Mode {
private ScheduledHandle fallbackTimer;
private List<EquivalentAddressGroup> fallbackBackendList = Collections.emptyList();
private boolean usingFallbackBackends;
// Reason for falling back; used as the RPC's error message if fallback fails (e.g., no
// fallback addresses found).
@Nullable
private Status fallbackReason;
// True if the current balancer has returned a serverlist. Will be reset to false when lost
// connection to a balancer.
private boolean balancerWorking;
Expand Down Expand Up @@ -239,7 +255,7 @@ void handleAddresses(
// No balancer address: close existing balancer connection and enter fallback mode
// immediately.
shutdownLbComm();
syncContext.execute(new FallbackModeTask());
syncContext.execute(new FallbackModeTask(NO_LB_ADDRESS_PROVIDED_STATUS));
} else {
startLbComm(newLbAddressGroups);
// Avoid creating a new RPC just because the addresses were updated, as it can cause a
Expand All @@ -253,7 +269,8 @@ void handleAddresses(
// Start the fallback timer if it's never started
if (fallbackTimer == null) {
fallbackTimer = syncContext.schedule(
new FallbackModeTask(), FALLBACK_TIMEOUT_MS, TimeUnit.MILLISECONDS, timerService);
new FallbackModeTask(BALANCER_TIMEOUT_STATUS), FALLBACK_TIMEOUT_MS,
TimeUnit.MILLISECONDS, timerService);
}
}
fallbackBackendList = newBackendServers;
Expand All @@ -275,16 +292,21 @@ void requestConnection() {
}

private void maybeUseFallbackBackends() {
if (balancerWorking) {
return;
}
if (usingFallbackBackends) {
if (balancerWorking || usingFallbackBackends) {
return;
}
// Balancer RPC should have either been broken or timed out.
checkState(fallbackReason != null, "no reason to fallback");
for (Subchannel subchannel : subchannels.values()) {
if (subchannel.getAttributes().get(STATE_INFO).get().getState() == READY) {
ConnectivityStateInfo stateInfo = subchannel.getAttributes().get(STATE_INFO).get();
if (stateInfo.getState() == READY) {
return;
}
// If we do have balancer-provided backends, use one of their errors in the error message if
// we fail to fall back.
if (stateInfo.getState() == TRANSIENT_FAILURE) {
fallbackReason = stateInfo.getStatus();
}
}
// Fallback conditions met
useFallbackBackends();
Expand Down Expand Up @@ -401,8 +423,10 @@ void shutdown() {
void propagateError(Status status) {
logger.log(ChannelLogLevel.DEBUG, "[grpclb-<{0}>] Error: {1}", serviceName, status);
if (backendList.isEmpty()) {
Status error =
Status.UNAVAILABLE.withCause(status.getCause()).withDescription(status.getDescription());
maybeUpdatePicker(
TRANSIENT_FAILURE, new RoundRobinPicker(dropList, Arrays.asList(new ErrorEntry(status))));
TRANSIENT_FAILURE, new RoundRobinPicker(dropList, Arrays.asList(new ErrorEntry(error))));
}
}

Expand Down Expand Up @@ -528,8 +552,17 @@ public void onSubchannelState(ConnectivityStateInfo newState) {

@VisibleForTesting
class FallbackModeTask implements Runnable {
private final Status reason;

private FallbackModeTask(Status reason) {
this.reason = reason;
}

@Override
public void run() {
// Timer should have been cancelled if entered fallback early.
checkState(!usingFallbackBackends, "already in fallback");
fallbackReason = reason;
maybeUseFallbackBackends();
maybeUpdatePicker();
}
Expand Down Expand Up @@ -658,7 +691,9 @@ private void handleResponse(LoadBalanceResponse response) {
}

if (typeCase == LoadBalanceResponseTypeCase.FALLBACK_RESPONSE) {
// Force entering fallback requested by balancer.
cancelFallbackTimer();
fallbackReason = BALANCER_REQUESTED_FALLBACK_STATUS;
useFallbackBackends();
maybeUpdatePicker();
return;
Expand Down Expand Up @@ -690,7 +725,7 @@ private void handleResponse(LoadBalanceResponse response) {
} catch (UnknownHostException e) {
propagateError(
Status.UNAVAILABLE
.withDescription("Host for server not found: " + server)
.withDescription("Invalid backend address: " + server)
.withCause(e));
continue;
}
Expand All @@ -701,8 +736,9 @@ private void handleResponse(LoadBalanceResponse response) {
newBackendAddrList.add(new BackendAddressGroup(eag, token));
}
}
// Stop using fallback backends as soon as a new server list is received from the balancer.
// Exit fallback as soon as a new server list is received from the balancer.
usingFallbackBackends = false;
fallbackReason = null;
cancelFallbackTimer();
updateServerList(newDropList, newBackendAddrList, loadRecorder);
maybeUpdatePicker();
Expand All @@ -717,6 +753,8 @@ private void handleStreamClosed(Status error) {
cleanUp();
propagateError(error);
balancerWorking = false;
fallbackReason = error;
cancelFallbackTimer();
maybeUseFallbackBackends();
maybeUpdatePicker();

Expand Down Expand Up @@ -773,15 +811,19 @@ private void maybeUpdatePicker() {
List<RoundRobinEntry> pickList;
ConnectivityState state;
if (backendList.isEmpty()) {
if (balancerWorking) {
pickList =
Collections.<RoundRobinEntry>singletonList(
new ErrorEntry(NO_AVAILABLE_BACKENDS_STATUS));
// Note balancer (is working) may enforce using fallback backends, and that fallback may
// fail. So we should check if currently in fallback first.
if (usingFallbackBackends) {
Status error =
NO_FALLBACK_BACKENDS_STATUS
.withCause(fallbackReason.getCause())
.augmentDescription(fallbackReason.getDescription());
pickList = Collections.<RoundRobinEntry>singletonList(new ErrorEntry(error));
state = TRANSIENT_FAILURE;
} else if (usingFallbackBackends) {
} else if (balancerWorking) {
pickList =
Collections.<RoundRobinEntry>singletonList(
new ErrorEntry(NO_FALLBACK_BACKENDS_FOUND_STATUS));
new ErrorEntry(NO_AVAILABLE_BACKENDS_STATUS));
state = TRANSIENT_FAILURE;
} else { // still waiting for balancer
pickList = Collections.singletonList(BUFFER_ENTRY);
Expand Down

0 comments on commit b956f88

Please sign in to comment.