From d6f7e573276676f708a98bace6a20be352cea89e Mon Sep 17 00:00:00 2001 From: Kun Zhang Date: Fri, 15 Jan 2021 13:48:43 -0800 Subject: [PATCH] grpclb: keep RR Subchannel state in TRANSIENT_FAILURE until becoming READY If all RR servers are unhealthy, it's possible that at least one connection is CONNECTING at every moment which causes RR to stay in CONNECTING. It's better to keep the TRANSIENT_FAILURE state in that case so that fail-fast RPCs can fail fast. The same changes have been made for RoundRobinLoadBalancer in #6657 --- .../main/java/io/grpc/grpclb/GrpclbState.java | 17 ++++++-- .../grpc/grpclb/GrpclbLoadBalancerTest.java | 43 +++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java b/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java index 2ee6befb8e2..dc53be584cf 100644 --- a/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java +++ b/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java @@ -203,9 +203,20 @@ void handleSubchannelState(Subchannel subchannel, ConnectivityStateInfo newState if (config.getMode() == Mode.ROUND_ROBIN && newState.getState() == IDLE) { subchannel.requestConnection(); } - subchannel.getAttributes().get(STATE_INFO).set(newState); - maybeUseFallbackBackends(); - maybeUpdatePicker(); + AtomicReference stateInfoRef = + subchannel.getAttributes().get(STATE_INFO); + // If all RR servers are unhealthy, it's possible that at least one connection is CONNECTING at + // every moment which causes RR to stay in CONNECTING. It's better to keep the TRANSIENT_FAILURE + // state in that case so that fail-fast RPCs can fail fast. + boolean keepState = + config.getMode() == Mode.ROUND_ROBIN + && stateInfoRef.get().getState() == TRANSIENT_FAILURE + && (newState.getState() == CONNECTING || newState.getState() == IDLE); + if (!keepState) { + stateInfoRef.set(newState); + maybeUseFallbackBackends(); + maybeUpdatePicker(); + } } /** diff --git a/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java b/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java index dbbf9f8dab2..abc90743383 100644 --- a/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java +++ b/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java @@ -1104,6 +1104,49 @@ public void grpclbWorking() { verify(subchannelPool).clear(); } + @Test + public void roundRobinMode_subchannelStayTransientFailureUntilReady() { + InOrder inOrder = inOrder(helper); + List grpclbBalancerList = createResolvedBalancerAddresses(1); + deliverResolvedAddresses(Collections.emptyList(), grpclbBalancerList); + verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture()); + StreamObserver lbResponseObserver = lbResponseObserverCaptor.getValue(); + + // Simulate receiving LB response + List backends1 = Arrays.asList( + new ServerEntry("127.0.0.1", 2000, "token0001"), + new ServerEntry("127.0.0.1", 2010, "token0002")); + lbResponseObserver.onNext(buildInitialResponse()); + lbResponseObserver.onNext(buildLbResponse(backends1)); + assertEquals(2, mockSubchannels.size()); + Subchannel subchannel1 = mockSubchannels.poll(); + Subchannel subchannel2 = mockSubchannels.poll(); + + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING)); + deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING)); + inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class)); + + // Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too. + Status error = Status.UNAVAILABLE.withDescription("error1"); + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error)); + inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); + assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList) + .containsExactly(new ErrorEntry(error)); + + // Switch subchannel1 to IDLE, then to CONNECTING, which are ignored since the previous + // subchannel state is TRANSIENT_FAILURE. General state is unchanged. + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(IDLE)); + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING)); + inOrder.verifyNoMoreInteractions(); + + // Switch subchannel1 to READY, which will affect the general state + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(READY)); + inOrder.verify(helper).updateBalancingState(eq(READY), pickerCaptor.capture()); + assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList) + .containsExactly(new BackendEntry(subchannel1, getLoadRecorder(), "token0001")); + inOrder.verifyNoMoreInteractions(); + } + @Test public void grpclbFallback_initialTimeout_serverListReceivedBeforeTimerExpires() { subtestGrpclbFallbackInitialTimeout(false);