Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce the flakiness of the OkHttp tests (though not eliminate). #10973

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -106,6 +106,7 @@ final class OkHttpServerTransport implements ServerTransport,
private final KeepAliveEnforcer keepAliveEnforcer;

private final Object lock = new Object();
private final Object handShakeLock = new Object();
@GuardedBy("lock")
private boolean abruptShutdown;
@GuardedBy("lock")
Expand Down Expand Up @@ -164,15 +165,17 @@ private void startIo(SerializingExecutor serializingExecutor) {
// The socket implementation is lazily initialized, but had broken thread-safety
// for that laziness https://bugs.openjdk.org/browse/JDK-8278326.
// As a workaround, we lock to synchronize initialization with shutdown().
HandshakerSocketFactory.HandshakeResult result;
synchronized (lock) {
socket.setTcpNoDelay(true);
}
HandshakerSocketFactory.HandshakeResult result =
config.handshakerSocketFactory.handshake(socket, Attributes.EMPTY);
synchronized (handShakeLock) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does nothing. This method is only called once ever per handShakeLock instance, and the lock isn't used anywhere else.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  Write of size 4 at 0x00008dc081dc by thread T73 (mutexes: write M0):
    #0 sun.security.ssl.SSLSessionImpl.invalidate()V SSLSessionImpl.java:781 
    #1 sun.security.ssl.TransportContext.fatal(Lsun/security/ssl/Alert;Ljava/lang/String;ZLjava/lang/Throwable;)Ljavax/net/ssl/SSLException; TransportContext.java:386 
    #2 sun.security.ssl.TransportContext.fatal(Lsun/security/ssl/Alert;Ljava/lang/String;Ljava/lang/Throwable;)Ljavax/net/ssl/SSLException; TransportContext.java:303 
    #3 sun.security.ssl.TransportContext.fatal(Lsun/security/ssl/Alert;Ljava/lang/Throwable;)Ljavax/net/ssl/SSLException; TransportContext.java:298 
    #4 sun.security.ssl.SSLSocketImpl.handleException(Ljava/lang/Exception;)V SSLSocketImpl.java:1686 
    #5 sun.security.ssl.SSLSocketImpl.startHandshake(Z)V SSLSocketImpl.java:466 
    #6 sun.security.ssl.SSLSocketImpl.startHandshake()V SSLSocketImpl.java:427 
    #7 io.grpc.okhttp.OkHttpProtocolNegotiator.negotiate(Ljavax/net/ssl/SSLSocket;Ljava/lang/String;Ljava/util/List;)Ljava/lang/String; OkHttpProtocolNegotiator.java:100 
    #8 io.grpc.okhttp.TlsServerHandshakerSocketFactory.handshake(Ljava/net/Socket;Lio/grpc/Attributes;)Lio/grpc/okhttp/HandshakerSocketFactory$HandshakeResult; TlsServerHandshakerSocketFactory.java:58 
    #9 io.grpc.okhttp.OkHttpServerTransport.startIo(Lio/grpc/internal/SerializingExecutor;)V OkHttpServerTransport.java:171 
    #10 io.grpc.okhttp.OkHttpServerTransport.lambda$start$0(Lio/grpc/internal/SerializingExecutor;)V OkHttpServerTransport.java:159 
    #11 io.grpc.okhttp.OkHttpServerTransport$$Lambda$90.run()V ?? 

  Previous read of size 4 at 0x00008dc081dc by thread T76:
    #0 sun.security.ssl.SSLSessionImpl.setContext(Lsun/security/ssl/SSLSessionContextImpl;)V SSLSessionImpl.java:768 
    #1 sun.security.ssl.SSLSessionContextImpl.put(Lsun/security/ssl/SSLSessionImpl;)V SSLSessionContextImpl.java:196 
    #2 sun.security.ssl.Finished$T12FinishedConsumer.onConsumeFinished(Lsun/security/ssl/ServerHandshakeContext;Ljava/nio/ByteBuffer;)V Finished.java:606 
    #3 sun.security.ssl.Finished$T12FinishedConsumer.consume(Lsun/security/ssl/ConnectionContext;Ljava/nio/ByteBuffer;)V Finished.java:527 
    #4 sun.security.ssl.SSLHandshake.consume(Lsun/security/ssl/ConnectionContext;Ljava/nio/ByteBuffer;)V SSLHandshake.java:392 
    #5 sun.security.ssl.HandshakeContext.dispatch(BLjava/nio/ByteBuffer;)V HandshakeContext.java:443 
    #6 sun.security.ssl.HandshakeContext.dispatch(BLsun/security/ssl/Plaintext;)V HandshakeContext.java:421 
    #7 sun.security.ssl.TransportContext.dispatch(Lsun/security/ssl/Plaintext;)V TransportContext.java:183 
    #8 sun.security.ssl.SSLTransport.decode(Lsun/security/ssl/TransportContext;[Ljava/nio/ByteBuffer;II[Ljava/nio/ByteBuffer;II)Lsun/security/ssl/Plaintext; SSLTransport.java:172 
    #9 sun.security.ssl.SSLSocketImpl.decode(Ljava/nio/ByteBuffer;)Lsun/security/ssl/Plaintext; SSLSocketImpl.java:1511 
    #10 sun.security.ssl.SSLSocketImpl.readHandshakeRecord()I SSLSocketImpl.java:1421 
    #11 sun.security.ssl.SSLSocketImpl.startHandshake(Z)V SSLSocketImpl.java:456 
    #12 sun.security.ssl.SSLSocketImpl.startHandshake()V SSLSocketImpl.java:427 
    #13 io.grpc.okhttp.OkHttpProtocolNegotiator.negotiate(Ljavax/net/ssl/SSLSocket;Ljava/lang/String;Ljava/util/List;)Ljava/lang/String; OkHttpProtocolNegotiator.java:100 
    #14 io.grpc.okhttp.TlsServerHandshakerSocketFactory.handshake(Ljava/net/Socket;Lio/grpc/Attributes;)Lio/grpc/okhttp/HandshakerSocketFactory$HandshakeResult; TlsServerHandshakerSocketFactory.java:58 
    #15 io.grpc.okhttp.OkHttpServerTransport.startIo(Lio/grpc/internal/SerializingExecutor;)V OkHttpServerTransport.java:171 
    #16 io.grpc.okhttp.OkHttpServerTransport.lambda$start$0(Lio/grpc/internal/SerializingExecutor;)V OkHttpServerTransport.java:159 
    #17 io.grpc.okhttp.OkHttpServerTransport$$Lambda$90.run()V ?? 

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a JDK bug. I don't know what JDK was used when TSAN failed, but based on the line numbers it looks to be JDK 11 or earlier. And it adds the new session to the session cache before setting the context, which does seem racy.
https://github.com/openjdk/jdk/blob/da75f3c4ad5bdf25167a3ed80e51f567ab3dbd01/src/java.base/share/classes/sun/security/ssl/SSLSessionContextImpl.java#L175

JDK master does the same, without any extra synchronization. So I think we need to report this and get it fixed.

result = config.handshakerSocketFactory.handshake(socket, Attributes.EMPTY);
}
synchronized (lock) {
this.socket = result.socket;
this.attributes = result.attributes;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did TSAN show this was needed? This field shouldn't be looked at until listener.transportReady(). I'd be interested to see what TSAN found.

}
this.attributes = result.attributes;

int maxQueuedControlFrames = 10000;
AsyncSink asyncSink = AsyncSink.sink(serializingExecutor, this, maxQueuedControlFrames);
Expand Down
Expand Up @@ -18,6 +18,7 @@

import static com.google.common.base.Charsets.UTF_8;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth.assertWithMessage;
import static io.grpc.internal.ClientStreamListener.RpcProgress.MISCARRIED;
import static io.grpc.internal.ClientStreamListener.RpcProgress.PROCESSED;
import static io.grpc.internal.ClientStreamListener.RpcProgress.REFUSED;
Expand Down Expand Up @@ -293,7 +294,7 @@ public void close() throws SecurityException {
Buffer buffer = createMessageFrame(message);
frameHandler().data(false, 3, buffer, (int) buffer.size(),
(int) buffer.size());
assertThat(logs).hasSize(1);
assertWithMessage("log messages: " + logs).that(logs).hasSize(1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this help? I tried it out and it seems to change:

io.grpc.okhttp.OkHttpClientTransportTest > testClientHandlerFrameLogger FAILED
    value of    : iterable.size()
    expected    : 2
    but was     : 1
    iterable was: [java.util.logging.LogRecord@7f12d356]

into:

io.grpc.okhttp.OkHttpClientTransportTest > testClientHandlerFrameLogger FAILED
    log messages: [java.util.logging.LogRecord@45026a28]
    value of    : iterable.size()
    expected    : 2
    but was     : 1
    iterable was: [java.util.logging.LogRecord@45026a28]

The point of Truth is that it prints the value of what your testing. Why print it twice?

log = logs.remove(0);
assertThat(log.getMessage()).startsWith(Direction.INBOUND + " DATA: streamId=" + 3);
assertThat(log.getLevel()).isEqualTo(Level.FINE);
Expand Down
8 changes: 8 additions & 0 deletions okhttp/src/test/java/io/grpc/okhttp/TlsTest.java
Expand Up @@ -117,6 +117,7 @@ public void mtls_succeeds() throws Exception {
ManagedChannel channel = grpcCleanupRule.register(clientChannel(server, channelCreds));

SimpleServiceGrpc.newBlockingStub(channel).unaryRpc(SimpleRequest.getDefaultInstance());
server.shutdown();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do these do? The Cleanup rule will shut down the server just after this line. Do we have to shut down the server before the client?

It isn't like OkHttpServer.acceptConnecitons() is spinning tightly in its loop. accept() blocks until there is a connection.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But because the client shuts down before the server, after it falls out of the accept it goes through the logic that is skipped when the shutdown flag is set.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried this and I saw no change in failure rate. (1000 runs)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Darn reality not matching the theory .... :(

}

@Test
Expand Down Expand Up @@ -144,6 +145,7 @@ public void untrustedClient_fails() throws Exception {
ManagedChannel channel = grpcCleanupRule.register(clientChannel(server, channelCreds));

assertRpcFails(channel);
server.shutdown();
}

@Test
Expand All @@ -168,6 +170,7 @@ public void missingOptionalClientCert_succeeds() throws Exception {
ManagedChannel channel = grpcCleanupRule.register(clientChannel(server, channelCreds));

SimpleServiceGrpc.newBlockingStub(channel).unaryRpc(SimpleRequest.getDefaultInstance());
server.shutdown();
}

@Test
Expand All @@ -192,6 +195,7 @@ public void missingRequiredClientCert_fails() throws Exception {
ManagedChannel channel = grpcCleanupRule.register(clientChannel(server, channelCreds));

assertRpcFails(channel);
server.shutdown();
}

@Test
Expand All @@ -208,6 +212,7 @@ public void untrustedServer_fails() throws Exception {
ManagedChannel channel = grpcCleanupRule.register(clientChannel(server, channelCreds));

assertRpcFails(channel);
server.shutdown();
}

@Test
Expand All @@ -231,6 +236,7 @@ public void unmatchedServerSubjectAlternativeNames_fails() throws Exception {
.build());

assertRpcFails(channel);
server.shutdown();
}

@Test
Expand All @@ -255,6 +261,7 @@ public void hostnameVerifierTrusts_succeeds()
.build());

SimpleServiceGrpc.newBlockingStub(channel).unaryRpc(SimpleRequest.getDefaultInstance());
server.shutdown();
}

@Test
Expand All @@ -280,6 +287,7 @@ public void hostnameVerifierFails_fails()

Status status = assertRpcFails(channel);
assertThat(status.getCause()).isInstanceOf(SSLPeerUnverifiedException.class);
server.shutdown();
}

private static Server server(ServerCredentials creds) throws IOException {
Expand Down