Skip to content

Commit

Permalink
grpclb: keep RR Subchannel state in TRANSIENT_FAILURE until becoming …
Browse files Browse the repository at this point in the history
…READY

If all RR servers are unhealthy, it's possible that at least one
connection is CONNECTING at every moment which causes RR to stay in
CONNECTING. It's better to keep the TRANSIENT_FAILURE state in that
case so that fail-fast RPCs can fail fast.

The same changes have been made for RoundRobinLoadBalancer in grpc#6657
  • Loading branch information
zhangkun83 committed Jan 15, 2021
1 parent 458b0e4 commit d6f7e57
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 3 deletions.
17 changes: 14 additions & 3 deletions grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java
Expand Up @@ -203,9 +203,20 @@ void handleSubchannelState(Subchannel subchannel, ConnectivityStateInfo newState
if (config.getMode() == Mode.ROUND_ROBIN && newState.getState() == IDLE) {
subchannel.requestConnection();
}
subchannel.getAttributes().get(STATE_INFO).set(newState);
maybeUseFallbackBackends();
maybeUpdatePicker();
AtomicReference<ConnectivityStateInfo> stateInfoRef =
subchannel.getAttributes().get(STATE_INFO);
// If all RR servers are unhealthy, it's possible that at least one connection is CONNECTING at
// every moment which causes RR to stay in CONNECTING. It's better to keep the TRANSIENT_FAILURE
// state in that case so that fail-fast RPCs can fail fast.
boolean keepState =
config.getMode() == Mode.ROUND_ROBIN
&& stateInfoRef.get().getState() == TRANSIENT_FAILURE
&& (newState.getState() == CONNECTING || newState.getState() == IDLE);
if (!keepState) {
stateInfoRef.set(newState);
maybeUseFallbackBackends();
maybeUpdatePicker();
}
}

/**
Expand Down
43 changes: 43 additions & 0 deletions grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java
Expand Up @@ -1104,6 +1104,49 @@ public void grpclbWorking() {
verify(subchannelPool).clear();
}

@Test
public void roundRobinMode_subchannelStayTransientFailureUntilReady() {
InOrder inOrder = inOrder(helper);
List<EquivalentAddressGroup> grpclbBalancerList = createResolvedBalancerAddresses(1);
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(), grpclbBalancerList);
verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture());
StreamObserver<LoadBalanceResponse> lbResponseObserver = lbResponseObserverCaptor.getValue();

// Simulate receiving LB response
List<ServerEntry> backends1 = Arrays.asList(
new ServerEntry("127.0.0.1", 2000, "token0001"),
new ServerEntry("127.0.0.1", 2010, "token0002"));
lbResponseObserver.onNext(buildInitialResponse());
lbResponseObserver.onNext(buildLbResponse(backends1));
assertEquals(2, mockSubchannels.size());
Subchannel subchannel1 = mockSubchannels.poll();
Subchannel subchannel2 = mockSubchannels.poll();

deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING));
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class));

// Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too.
Status error = Status.UNAVAILABLE.withDescription("error1");
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error));
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new ErrorEntry(error));

// Switch subchannel1 to IDLE, then to CONNECTING, which are ignored since the previous
// subchannel state is TRANSIENT_FAILURE. General state is unchanged.
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(IDLE));
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verifyNoMoreInteractions();

// Switch subchannel1 to READY, which will affect the general state
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(READY));
inOrder.verify(helper).updateBalancingState(eq(READY), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new BackendEntry(subchannel1, getLoadRecorder(), "token0001"));
inOrder.verifyNoMoreInteractions();
}

@Test
public void grpclbFallback_initialTimeout_serverListReceivedBeforeTimerExpires() {
subtestGrpclbFallbackInitialTimeout(false);
Expand Down

0 comments on commit d6f7e57

Please sign in to comment.