Skip to content

Commit

Permalink
Do not fail healthz in single server mode on failed snapshot restore. (#4100)
Browse files Browse the repository at this point in the history

In single server mode, healthz could mistake a snapshot staging
directory, present during a restore, for an account.
If the restore took a long time, stalled, or was aborted, this would cause
healthz to fail.

Signed-off-by: Derek Collison <derek@nats.io>
  • Loading branch information
derekcollison committed Apr 25, 2023
2 parents c2649be + 47c6bfd commit e25f89d
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 2 deletions.
34 changes: 34 additions & 0 deletions server/jetstream_test.go
Expand Up @@ -19968,3 +19968,37 @@ func TestJetStreamKVHistoryRegression(t *testing.T) {
})
}
}

// TestJetStreamSnapshotRestoreStallAndHealthz checks that healthz keeps
// reporting "ok" on a basic JetStream server when a snapshot staging
// directory exists at the root of the JetStream storage directory.
func TestJetStreamSnapshotRestoreStallAndHealthz(t *testing.T) {
	srv := RunBasicJetStreamServer(t)
	defer srv.Shutdown()

	nc, js := jsClientConnect(t, srv)
	defer nc.Close()

	cfg := &nats.StreamConfig{
		Name:     "ORDERS",
		Subjects: []string{"orders.*"},
	}
	_, err := js.AddStream(cfg)
	require_NoError(t, err)

	// Put some data in the stream before probing health.
	const numMsgs = 1000
	for sent := 0; sent < numMsgs; sent++ {
		sendStreamMsg(t, nc, "orders.created", "new order")
	}

	// requireHealthy fails the test unless healthz reports "ok" with no error.
	requireHealthy := func() {
		t.Helper()
		status := srv.healthz(nil)
		if status.Status == "ok" && status.Error == _EMPTY_ {
			return
		}
		t.Fatalf("Expected health to be ok, got %+v", status)
	}

	// Baseline: the server is healthy before any staging directory exists.
	requireHealthy()

	// Simulate the staging directory used for restores. It is normally cleaned
	// up, but because it sits at the root of the storage directory, healthz
	// must not be affected by its presence.
	storeDir := srv.getJetStream().config.StoreDir
	stagingDir := filepath.Join(storeDir, snapStagingDir)
	require_NoError(t, os.MkdirAll(stagingDir, defaultDirPerms))

	// Health must still be ok with the staging directory in place.
	requireHealthy()
}
3 changes: 3 additions & 0 deletions server/monitor.go
Expand Up @@ -3083,6 +3083,9 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus {
// Whip through account folders and pull each stream name.
fis, _ := os.ReadDir(sdir)
for _, fi := range fis {
if fi.Name() == snapStagingDir {
continue
}
acc, err := s.LookupAccount(fi.Name())
if err != nil {
health.Status = na
Expand Down
4 changes: 2 additions & 2 deletions server/norace_test.go
Expand Up @@ -6397,8 +6397,8 @@ func TestNoRaceJetStreamConsumerCreateTimeNumPending(t *testing.T) {
case <-time.After(5 * time.Second):
}

// Should stay under 5ms now, but for Travis variability say 25ms.
threshold := 25 * time.Millisecond
// Should stay under 5ms now, but for Travis variability say 50ms.
threshold := 50 * time.Millisecond

start := time.Now()
_, err = js.PullSubscribe("events.*", "dlc")
Expand Down

0 comments on commit e25f89d

Please sign in to comment.