From a4334eb5c30c40419701835378b357fc0d2bcd27 Mon Sep 17 00:00:00 2001 From: ZHANG Dapeng Date: Mon, 29 Nov 2021 08:48:51 -0800 Subject: [PATCH] census: fix NPE in calling recordFinishedAttempt() (#8706) Fix the NPE as shown in the following stacktrace: ``` Caused by: java.lang.RuntimeException: java.lang.NullPointerException with message: null at io.grpc.census.CensusStatsModule$ClientTracer.recordFinishedAttempt(CensusStatsModule.java:388) ~[grpc-census-1.42.0.jar:1.42.0] at io.grpc.census.CensusStatsModule$CallAttemptsTracerFactory.recordFinishedCall(CensusStatsModule.java:525) ~[grpc-census-1.42.0.jar:1.42.0] at io.grpc.census.CensusStatsModule$CallAttemptsTracerFactory.attemptEnded(CensusStatsModule.java:492) ~[grpc-census-1.42.0.jar:1.42.0] at io.grpc.census.CensusStatsModule$ClientTracer.streamClosed(CensusStatsModule.java:345) ~[grpc-census-1.42.0.jar:1.42.0] at io.grpc.internal.StatsTraceContext.streamClosed(StatsTraceContext.java:155) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState.closeListener(AbstractClientStream.java:458) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState.access$400(AbstractClientStream.java:221) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState$1.run(AbstractClientStream.java:442) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState.deframerClosed(AbstractClientStream.java:278) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.Http2ClientStreamTransportState.deframerClosed(Http2ClientStreamTransportState.java:31) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.MessageDeframer.close(MessageDeframer.java:233) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.MessageDeframer.closeWhenComplete(MessageDeframer.java:191) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractStream$TransportState.closeDeframer(AbstractStream.java:200) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState.transportReportStatus(AbstractClientStream.java:445) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState.transportReportStatus(AbstractClientStream.java:401) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.AbstractClientStream$TransportState.inboundTrailersReceived(AbstractClientStream.java:384) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.internal.Http2ClientStreamTransportState.transportTrailersReceived(Http2ClientStreamTransportState.java:183) ~[grpc-core-1.42.0.jar:1.42.0] at io.grpc.netty.shaded.io.grpc.netty.NettyClientStream$TransportState.transportHeadersReceived(NettyClientStream.java:334) ~[grpc-netty-shaded-1.42.0.jar:1.42.0] at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler.onHeadersRead(NettyClientHandler.java:372) ~[grpc-netty-shaded-1.42.0.jar:1.42.0] at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler.access$1200(NettyClientHandler.java:91) ~[grpc-netty-shaded-1.42.0.jar:1.42.0] at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler$FrameListener.onHeadersRead(NettyClientHandler.java:934) ~[grpc-netty-shaded-1.42.0.jar:1.42.0] ``` The NPE can happen when `ClientCall.Listener.onClose()` and `StatsTraceContext.streamClosed()` (or `ClientStreamListener.closed()`) are invoked concurrently in different threads. Note that `CensusStatsModule$CallAttemptsTracerFactory.attemptEnded()` in the above stack trace would observe `callEnded==true` in such a race condition. The following are the possible scenarios that the race between `ClientCall.Listener.onClose()` and `ClientStreamListener.closed()` can happen: - Deadline exceeded but the underlying real stream is [not committed](https://github.com/grpc/grpc-java/blob/v1.42.0/core/src/main/java/io/grpc/internal/RetriableStream.java#L486-L495), the `ClientCall.Listener` may be closed earlier than the stream listener. (This is the case of the above stack trace, in which the inbound end-of-stream is received observing `callEnded==true`. Even if nothing inbound is received, there is still a chance that the NPE can happen.) - DelayedClientTransport.PendingStream has created a realStream but `setStream(realStream)` is [not called yet](https://github.com/grpc/grpc-java/blob/v1.42.0/core/src/main/java/io/grpc/internal/DelayedClientTransport.java#L366-L372), when deadline exceeded. (This has little chance to happen, only for the very first RPC on the channel.) - Hedging case. In deadline-exceeded cases, the shorter the deadline is, the more likely the race can happen. --- census/src/main/java/io/grpc/census/CensusStatsModule.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/census/src/main/java/io/grpc/census/CensusStatsModule.java b/census/src/main/java/io/grpc/census/CensusStatsModule.java index 4b2832c5989..487e49e5005 100644 --- a/census/src/main/java/io/grpc/census/CensusStatsModule.java +++ b/census/src/main/java/io/grpc/census/CensusStatsModule.java @@ -342,7 +342,6 @@ public void outboundMessage(int seqNo) { @Override public void streamClosed(Status status) { - attemptsState.attemptEnded(); stopwatch.stop(); roundtripNanos = stopwatch.elapsed(TimeUnit.NANOSECONDS); Deadline deadline = info.getCallOptions().getDeadline(); @@ -355,6 +354,7 @@ public void streamClosed(Status status) { statusCode = Code.DEADLINE_EXCEEDED; } } + attemptsState.attemptEnded(); if (inboundReceivedOrClosed.compareAndSet(false, true)) { if (module.recordFinishedRpcs) { // Stream is closed early. So no need to record metrics for any inbound events after this @@ -522,6 +522,8 @@ void recordFinishedCall() { tracer.statusCode = status.getCode(); tracer.recordFinishedAttempt(); } else if (inboundMetricTracer != null) { + // activeStreams has been decremented to 0 by attemptEnded(), + // so inboundMetricTracer.statusCode is guaranteed to be assigned already. inboundMetricTracer.recordFinishedAttempt(); } if (!module.recordRetryMetrics) {