From 5fafb7a7e8963993ff4801851ab6cb155910cbfb Mon Sep 17 00:00:00 2001 From: Waldemar Quevedo Date: Fri, 18 Mar 2022 22:55:57 -0700 Subject: [PATCH 1/4] Add adr on implementation js.Publish retries Signed-off-by: Waldemar Quevedo --- adr/ADR-22.md | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 adr/ADR-22.md diff --git a/adr/ADR-22.md b/adr/ADR-22.md new file mode 100644 index 0000000..015ce75 --- /dev/null +++ b/adr/ADR-22.md @@ -0,0 +1,101 @@ +# JetStream Publish Retries on No Responders + +| Metadata | Value | +|----------|---------------------------| +| Date | 2022-03-18 | +| Author | wallyqs | +| Status | Partially Implemented | +| Tags | jetstream, client | + +## Motivation + +When the NATS Server is running with JetStream on cluster mode, there +can be occasional blips in leadership due which can result in a number +of `no responders available` errors during the election. In order to +try to mitigate these failures, retries can be added into JetStream +enabled clients to attempt to publish the message to JetStream once it +is ready again. + +## Implementation + +A `no responders available` error uses the 503 status header to signal +a client that there was no one available to serve the published +request. A synchronous `Publish` request when using the JetStream +context internally uses a `Request` to produce a message and if the +JetStream service was not ready at the moment of publishing, the +server will send to the requestor a 503 status message right away. + +To improve robustness of producing messages to JetStream, a client can +back off for a bit and then try to send the message again later. +By default, the Go client waits for `250ms` and will retry 2 times +sending the message (so that in total it would have attempted to send +the message 3 times). 
+ +Below can be found an example implementation using the `Request` API +from the Go client: + +```go +// Stream that persists messages sent to 'foo' +js.AddStream(&nats.StreamConfig{Name: "foo"}) + +var ( + retryWait = 250 * time.Millisecond + maxAttempts = 2 + i = 0 +) + +// Loop to publish a message every 100ms +for range time.NewTicker(100 * time.Millisecond).C { + subject := "foo" + msg := fmt.Sprintf("i:%d", i) + _, err := nc.Request(subject, []byte(msg), 1*time.Second) + if err != nil && err == nats.ErrNoResponders { + for attempts := 0; attempts < maxAttempts; attempts++ { + // Backoff before retrying + time.Sleep(retryWait) + + // Next attempt + _, err := nc.Request(subject, []byte(msg), 1*time.Second) + if err != nil && err == nats.ErrNoResponders { + // Retry again + continue + } + } + } + i++ +} +``` + +## Errors + +After exhausting the number of attempts, the result should either be a timeout error +in case the deadline expired or a `nats: no response from stream` error +if the error from the last attempt was still a `no responders available` error. + +## Examples + +### Customizing retries with `RetryWait` and `RetryAttempts` + +Two options are added to customize the retry logic from the defaults: + +```go +_, err := js.Publish("foo", []byte("bar"), nats.RetryWait(250*time.Millisecond), nats.RetryAttempts(10)) +if err != nil { + log.Println("Pub Error", err) +} +``` + +### Make Publish retry as needed until context deadline + +In Go when using the context package, it can be possible to set the maximum deadline of the retries +so that the client can retry as needed. 
In the example below a client will attempt to publish for 10 seconds +backing off `250ms` as needed until the service is available again: + +```go +ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) +defer cancel() +_, err := js.Publish("foo", []byte("bar"), nats.Context(ctx), nats.RetryWait(250*time.Millisecond), nats.RetryAttempts(-1)) +if err != nil { + log.Println("Pub Error", err) +} +``` From a41998b1b422c5551e32839bdca22dabdb337f56 Mon Sep 17 00:00:00 2001 From: Waldemar Quevedo Date: Sun, 20 Mar 2022 15:26:38 -0700 Subject: [PATCH 2/4] Update README Signed-off-by: Waldemar Quevedo --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e0b0ff0..7e28395 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ This repo is used to capture architectural and design decisions as a reference o |[ADR-19](adr/ADR-19.md)|jetstream, client, kv, objectstore|API prefixes for materialized JetStream views:| |[ADR-20](adr/ADR-20.md)|jetstream, client, objectstore|JetStream based Object Stores| |[ADR-21](adr/ADR-21.md)|client|NATS Configuration Contexts| +|[ADR-22](adr/ADR-22.md)|jetstream, client|JetStream Publish Retries on No Responders| ## Jetstream @@ -44,6 +45,7 @@ This repo is used to capture architectural and design decisions as a reference o |[ADR-17](adr/ADR-17.md)|jetstream, client|Ordered Consumer| |[ADR-19](adr/ADR-19.md)|jetstream, client, kv, objectstore|API prefixes for materialized JetStream views:| |[ADR-20](adr/ADR-20.md)|jetstream, client, objectstore|JetStream based Object Stores| +|[ADR-22](adr/ADR-22.md)|jetstream, client|JetStream Publish Retries on No Responders| ## Kv From 6e9efed480e92366b42077b1fa4ca797a0698ae6 Mon Sep 17 00:00:00 2001 From: Waldemar Quevedo Date: Mon, 21 Mar 2022 10:01:52 -0700 Subject: [PATCH 3/4] Update adr/ADR-22.md - fix typo --- adr/ADR-22.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adr/ADR-22.md b/adr/ADR-22.md index 015ce75..e3cbf4a 
100644 --- a/adr/ADR-22.md +++ b/adr/ADR-22.md @@ -10,7 +10,7 @@ ## Motivation When the NATS Server is running with JetStream on cluster mode, there -can be occasional blips in leadership due which can result in a number +can be occasional blips in leadership which can result in a number of `no responders available` errors during the election. In order to try to mitigate these failures, retries can be added into JetStream enabled clients to attempt to publish the message to JetStream once it From 380756a33635d69d8504e9e7c8d94f8f1d2fd359 Mon Sep 17 00:00:00 2001 From: Waldemar Quevedo Date: Mon, 21 Mar 2022 13:03:23 -0700 Subject: [PATCH 4/4] ADR-22 Add note on non context usage Signed-off-by: Waldemar Quevedo --- adr/ADR-22.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/adr/ADR-22.md b/adr/ADR-22.md index e3cbf4a..5cfcfd2 100644 --- a/adr/ADR-22.md +++ b/adr/ADR-22.md @@ -85,16 +85,24 @@ if err != nil { } ``` -### Make Publish retry as needed until context deadline +### Make Publish retry as needed until deadline -In Go when using the context package, it can be possible to set the maximum deadline of the retries -so that the client can retry as needed. In the example below a client will attempt to publish for 10 seconds -backing off `250ms` as needed until the service is available again: +It can be possible to set the maximum deadline of the retries so that the client can retry as needed. 
+In the example below a client will attempt to publish for up to 10 seconds, waiting for an ack response +from the server and backing off `250ms` as needed until the service is available again: ```go +// Using Go context package ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() _, err := js.Publish("foo", []byte("bar"), nats.Context(ctx), nats.RetryWait(250*time.Millisecond), nats.RetryAttempts(-1)) +if err != nil { + log.Println("Pub Error", err) + +} + +// Custom AckWait +_, err := js.Publish("foo", []byte("bar"), nats.AckWait(10*time.Second), nats.RetryWait(250*time.Millisecond), nats.RetryAttempts(-1)) if err != nil { log.Println("Pub Error", err) }