[FIXED] Concurrent map access for preAcks. (#4012)
Also includes some raft and interest-based consumer improvements.
Adds some I/O guards around removal of files.

Signed-off-by: Derek Collison <derek@nats.io>
derekcollison committed Apr 2, 2023
2 parents fbaeaf9 + 58ca525 commit d62dbb7
Showing 9 changed files with 111 additions and 47 deletions.
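The headline fix serializes every access to the stream's preAcks map behind the stream lock; the new clearAllPreAcksBelowFloor calls in processSnapshotDeletes and processSnapshot further down run with mset.mu held. Below is a minimal sketch of that locking pattern, assuming a map keyed by stream sequence; the value type and the helper body are illustrative, not the code from this commit.

```go
package main

import (
	"fmt"
	"sync"
)

// Illustrative stream shape: preAcks tracks acks that arrive before the
// corresponding message is stored, keyed by stream sequence. The value type
// is an assumption made for this sketch.
type stream struct {
	mu      sync.RWMutex
	preAcks map[uint64]map[string]struct{}
}

// clearAllPreAcksBelowFloor drops pre-ack state for sequences below the new
// first sequence. The caller must hold mu; funneling every read and write of
// preAcks through that lock is what removes the concurrent map access.
func (s *stream) clearAllPreAcksBelowFloor(floor uint64) {
	for seq := range s.preAcks {
		if seq < floor {
			delete(s.preAcks, seq)
		}
	}
}

func main() {
	s := &stream{preAcks: map[uint64]map[string]struct{}{
		3: {"C1": {}},
		7: {"C2": {}},
	}}
	s.mu.Lock()
	s.clearAllPreAcksBelowFloor(5) // e.g. after Compact moved FirstSeq to 5
	s.mu.Unlock()
	fmt.Println(len(s.preAcks)) // 1
}
```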
9 changes: 4 additions & 5 deletions server/consumer.go
@@ -2533,8 +2533,7 @@ func (o *consumer) needAck(sseq uint64, subj string) bool {
state, err := o.store.BorrowState()
if err != nil || state == nil {
// Fall back to what we track internally for now.
needAck := sseq > o.asflr && !o.isFiltered()
return needAck
return sseq > o.asflr && !o.isFiltered()
}
// If loading state as here, the osseq is +1.
asflr, osseq, pending = state.AckFloor.Stream, state.Delivered.Stream+1, state.Pending
@@ -3820,7 +3819,8 @@ func (o *consumer) checkPending() {
o.mu.RLock()
mset := o.mset
// On stop, mset and timer will be nil.
if mset == nil || o.ptmr == nil {
if o.closed || mset == nil || o.ptmr == nil {
stopAndClearTimer(&o.ptmr)
o.mu.RUnlock()
return
}
@@ -4377,7 +4377,7 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error {
n.Delete()
} else {
// Try to install snapshot on clean exit
if o.store != nil && n.NeedSnapshot() {
if o.store != nil && (o.retention != LimitsPolicy || n.NeedSnapshot()) {
if snap, err := o.store.EncodedState(); err == nil {
n.InstallSnapshot(snap)
}
@@ -4574,7 +4574,6 @@ func (o *consumer) checkStateForInterestStream() {
o.mu.Unlock()
return
}

state, err := o.store.State()
o.mu.Unlock()

12 changes: 9 additions & 3 deletions server/filestore.go
@@ -312,8 +312,11 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim
if err != nil {
return nil, fmt.Errorf("storage directory is not writable")
}

tmpfile.Close()
<-dios
os.Remove(tmpfile.Name())
dios <- struct{}{}

fs := &fileStore{
fcfg: fcfg,
@@ -1148,9 +1151,11 @@ func (fs *fileStore) recoverMsgs() error {

// Check for any left over purged messages.
pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
<-dios
if _, err := os.Stat(pdir); err == nil {
os.RemoveAll(pdir)
}
dios <- struct{}{}

mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
fis, err := os.ReadDir(mdir)
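The two filestore hunks above wrap the os.Remove / os.RemoveAll calls in the dios guard, which appears to act as a channel-based semaphore limiting concurrent disk I/O. A sketch of that pattern follows, under the assumption that dios is a buffered channel pre-filled with tokens; the capacity and the helper function are illustrative.

```go
package main

import "os"

// dios is a counting semaphore that caps how many disk-heavy calls run at
// once; the capacity here is an assumption for the sketch.
var dios = make(chan struct{}, 4)

func init() {
	// Pre-fill with tokens so a receive only blocks when all are in use.
	for i := 0; i < cap(dios); i++ {
		dios <- struct{}{}
	}
}

// removeGuarded mirrors the inline pattern in the hunks above: take a token,
// perform the removal, return the token.
func removeGuarded(path string) error {
	<-dios
	defer func() { dios <- struct{}{} }()
	return os.RemoveAll(path)
}

func main() {
	_ = removeGuarded(os.TempDir() + "/example-purge-dir")
}
```

A pre-filled buffered channel keeps the guard allocation-free and makes the acquire and release sites easy to spot in the diff.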
@@ -6680,15 +6685,16 @@ func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
if o.cfg.AckPolicy == AckNone {
return ErrNoAckPolicy
}
if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
return ErrStoreMsgNotFound
}

// On restarts the old leader may get a replay from the raft logs that are old.
if dseq <= o.state.AckFloor.Consumer {
return nil
}

if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
return ErrStoreMsgNotFound
}

// Check for AckAll here.
if o.cfg.AckPolicy == AckAll {
sgap := sseq - o.state.AckFloor.Stream
57 changes: 40 additions & 17 deletions server/jetstream_cluster.go
@@ -1937,14 +1937,22 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps
return
}

// Make sure to stop the raft group on exit to prevent accidental memory bloat.
defer n.Stop()

// Make sure only one is running.
if mset != nil {
if mset.checkInMonitor() {
return
}
defer mset.clearMonitorRunning()
}

qch, lch, aq, uch, ourPeerId := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID()

s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())

// Make sure to stop the raft group on exit to prevent accidental memory bloat.
defer n.Stop()

// Make sure we do not leave the apply channel to fill up and block the raft layer.
defer func() {
if n.State() == Closed {
@@ -3524,7 +3532,12 @@ func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember,
mset.monitorWg.Wait()
err = mset.stop(true, wasLeader)
stopped = true
} else if isMember {
s.Warnf("JetStream failed to lookup running stream while removing stream '%s > %s' from this server",
sa.Client.serviceAccount(), sa.Config.Name)
}
} else if isMember {
s.Warnf("JetStream failed to lookup account while removing stream '%s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name)
}

// Always delete the node if present.
@@ -3537,11 +3550,16 @@
// 2) node was nil (and couldn't be deleted)
if !stopped || node == nil {
if sacc := s.SystemAccount(); sacc != nil {
os.RemoveAll(filepath.Join(js.config.StoreDir, sacc.GetName(), defaultStoreDirName, sa.Group.Name))
saccName := sacc.GetName()
os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, sa.Group.Name))
// cleanup dependent consumer groups
if !stopped {
for _, ca := range sa.consumers {
os.RemoveAll(filepath.Join(js.config.StoreDir, sacc.GetName(), defaultStoreDirName, ca.Group.Name))
// Make sure we cleanup any possible running nodes for the consumers.
if isMember && ca.Group != nil && ca.Group.node != nil {
ca.Group.node.Delete()
}
os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, ca.Group.Name))
}
}
}
@@ -3796,6 +3814,7 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state
if ca.Config.MemoryStorage {
storage = MemoryStorage
}
// No-op if R1.
js.createRaftGroup(accName, rg, storage)
} else {
// If we are clustered update the known peers.
@@ -4158,6 +4177,9 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
return
}

// Make sure to stop the raft group on exit to prevent accidental memory bloat.
defer n.Stop()

// Make sure only one is running.
if o.checkInMonitor() {
return
@@ -4169,9 +4191,6 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
s.Debugf("Starting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group())
defer s.Debugf("Exiting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group())

// Make sure to stop the raft group on exit to prevent accidental memory bloat.
defer n.Stop()

const (
compactInterval = 2 * time.Minute
compactSizeMin = 64 * 1024 // What is stored here is always small for consumers.
@@ -4192,17 +4211,17 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
var lastSnap []byte
var lastSnapTime time.Time

doSnapshot := func() {
doSnapshot := func(force bool) {
// Bail if trying too fast and not in a forced situation.
if time.Since(lastSnapTime) < minSnapDelta {
if !force && time.Since(lastSnapTime) < minSnapDelta {
return
}

// Check several things to see if we need a snapshot.
ne, nb := n.Size()
if !n.NeedSnapshot() {
// Check if we should compact etc. based on size of log.
if ne < compactNumMin && nb < compactSizeMin {
if !force && ne < compactNumMin && nb < compactSizeMin {
return
}
}
@@ -4260,7 +4279,7 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
if ce == nil {
recovering = false
if n.NeedSnapshot() {
doSnapshot()
doSnapshot(true)
}
// Check our state if we are under an interest based stream.
o.checkStateForInterestStream()
@@ -4270,7 +4289,7 @@
ne, nb := n.Applied(ce.Index)
// If we have at least min entries to compact, go ahead and snapshot/compact.
if nb > 0 && ne >= compactNumMin || nb > compactSizeMin {
doSnapshot()
doSnapshot(false)
}
} else {
s.Warnf("Error applying consumer entries to '%s > %s'", ca.Client.serviceAccount(), ca.Name)
@@ -4284,7 +4303,7 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {

// Process the change.
if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader {
doSnapshot()
doSnapshot(true)
}

// We may receive a leader change after the consumer assignment which would cancel us
@@ -4373,7 +4392,7 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
}

case <-t.C:
doSnapshot()
doSnapshot(false)
}
}
}
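In monitorConsumer, doSnapshot now takes a force flag: recovery completion and leadership changes snapshot immediately, while the periodic timer and apply-loop paths stay subject to the size and minimum-delay checks. A standalone sketch of that shape follows; the interface mirrors only the calls the closure needs, and the thresholds are placeholders.

```go
package main

import "time"

// raftNode captures only the calls this sketch needs; it mirrors a small
// subset of the server's raft node surface.
type raftNode interface {
	Size() (entries, bytes uint64)
	NeedSnapshot() bool
	InstallSnapshot(snap []byte) error
}

// Placeholder thresholds; the real values live alongside the monitor loop.
const (
	compactNumMin  = 1024
	compactSizeMin = 64 * 1024
	minSnapDelta   = 10 * time.Second
)

// makeDoSnapshot returns a closure with the same shape as the diff: forced
// calls skip both the rate limit and the size heuristics.
func makeDoSnapshot(n raftNode, encodeState func() ([]byte, error)) func(force bool) {
	var lastSnapTime time.Time
	return func(force bool) {
		// Bail if trying too fast and not in a forced situation.
		if !force && time.Since(lastSnapTime) < minSnapDelta {
			return
		}
		// Without force, only snapshot when the log is worth compacting.
		if ne, nb := n.Size(); !force && !n.NeedSnapshot() &&
			ne < compactNumMin && nb < compactSizeMin {
			return
		}
		if snap, err := encodeState(); err == nil && n.InstallSnapshot(snap) == nil {
			lastSnapTime = time.Now()
		}
	}
}

func main() {}
```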
@@ -4506,6 +4525,7 @@ func (o *consumer) processReplicatedAck(dseq, sseq uint64) {

// Update activity.
o.lat = time.Now()

// Do actual ack update to store.
o.store.UpdateAcks(dseq, sseq)

@@ -7186,16 +7206,18 @@ func (mset *stream) calculateSyncRequest(state *StreamState, snap *streamSnapsho
// processSnapshotDeletes will update our current store based on the snapshot
// but only processing deletes and new FirstSeq / purges.
func (mset *stream) processSnapshotDeletes(snap *streamSnapshot) {
mset.mu.Lock()
var state StreamState
mset.store.FastState(&state)

// Always adjust if FirstSeq has moved beyond our state.
if snap.FirstSeq > state.FirstSeq {
mset.store.Compact(snap.FirstSeq)
mset.store.FastState(&state)
mset.setLastSeq(state.LastSeq)
mset.lseq = state.LastSeq
mset.clearAllPreAcksBelowFloor(state.FirstSeq)
}
mset.mu.Unlock()

// Range the deleted and delete if applicable.
for _, dseq := range snap.Deleted {
if dseq > state.FirstSeq && dseq <= state.LastSeq {
@@ -7396,6 +7418,7 @@ func (mset *stream) processSnapshot(snap *streamSnapshot) (e error) {
mset.store.FastState(&state)
// Make sure last is also correct in case this also moved.
mset.lseq = state.LastSeq
mset.clearAllPreAcksBelowFloor(state.FirstSeq)
didReset = true
}
mset.mu.Unlock()
4 changes: 2 additions & 2 deletions server/jetstream_cluster_2_test.go
@@ -4936,11 +4936,11 @@ func TestJetStreamClusterDuplicateMsgIdsOnCatchupAndLeaderTakeover(t *testing.T)
// Now restart
sr = c.restartServer(sr)
c.waitOnStreamCurrent(sr, "$G", "TEST")
c.waitOnStreamLeader("$G", "TEST")

// Now make them the leader.
for sr != c.streamLeader("$G", "TEST") {
_, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second)
require_NoError(t, err)
nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second)
c.waitOnStreamLeader("$G", "TEST")
}

16 changes: 9 additions & 7 deletions server/monitor.go
@@ -3068,14 +3068,13 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus {

// Clustered JetStream
js.mu.RLock()
defer js.mu.RUnlock()

cc := js.cluster
js.mu.RUnlock()

const na = "unavailable"

// Currently single server we make sure the streams were recovered.
if cc == nil || cc.meta == nil {
if cc == nil {
sdir := js.config.StoreDir
// Whip through account folders and pull each stream name.
fis, _ := os.ReadDir(sdir)
@@ -3100,17 +3099,19 @@
}

// If we are here we want to check for any assets assigned to us.
meta := cc.meta
ourID := meta.ID()
var meta RaftNode
js.mu.RLock()
meta = cc.meta
js.mu.RUnlock()

// If no meta leader.
if meta.GroupLeader() == _EMPTY_ {
if meta == nil || meta.GroupLeader() == _EMPTY_ {
health.Status = na
health.Error = "JetStream has not established contact with a meta leader"
return health
}
// If we are not current with the meta leader.
if !meta.Current() {
if !meta.Healthy() {
health.Status = na
health.Error = "JetStream is not current with the meta leader"
return health
@@ -3123,6 +3124,7 @@

// Range across all accounts, the streams assigned to them, and the consumers.
// If they are assigned to this server check their status.
ourID := meta.ID()
for acc, asa := range cc.streams {
for stream, sa := range asa {
if sa.Group.isMember(ourID) {
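The healthz change above narrows the js.mu read-lock scope instead of deferring the unlock across the whole check, tolerates a nil meta node, and switches the currency test from Current() to Healthy(). A self-contained sketch of that lock-scoping pattern, with illustrative types standing in for the server's:

```go
package main

import (
	"errors"
	"sync"
)

// metaNode mirrors only the two calls the health check makes; illustrative.
type metaNode interface {
	GroupLeader() string
	Healthy() bool
}

type jsCluster struct{ meta metaNode }

type jetStream struct {
	mu      sync.RWMutex
	cluster *jsCluster
}

// metaHealth copies the shared pointers under short read locks and releases
// them before calling into the raft layer, rather than holding js.mu for the
// duration of the health check.
func (js *jetStream) metaHealth() error {
	js.mu.RLock()
	cc := js.cluster
	js.mu.RUnlock()
	if cc == nil {
		return nil // single-server path is handled separately
	}

	js.mu.RLock()
	meta := cc.meta
	js.mu.RUnlock()

	if meta == nil || meta.GroupLeader() == "" {
		return errors.New("JetStream has not established contact with a meta leader")
	}
	if !meta.Healthy() {
		return errors.New("JetStream is not current with the meta leader")
	}
	return nil
}

func main() {}
```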
8 changes: 6 additions & 2 deletions server/norace_test.go
@@ -7139,7 +7139,7 @@ func TestNoRaceJetStreamClusterInterestStreamConsistencyAfterRollingRestart(t *t
if err != nil {
continue
}
// Shuffleraf
// Shuffle
rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] })
for _, m := range msgs {
meta, err := m.Metadata()
@@ -7229,6 +7229,8 @@ func TestNoRaceJetStreamClusterInterestStreamConsistencyAfterRollingRestart(t *t
for _, s := range c.servers {
t.Logf("Shutdown %v\n", s)
s.Shutdown()
s.WaitForShutdown()
time.Sleep(20 * time.Second)
t.Logf("Restarting %v\n", s)
s = c.restartServer(s)
c.waitOnServerHealthz(s)
@@ -7277,7 +7279,7 @@
}
}
if len(acks) > 1 {
t.Fatalf("Multiple acks for %d which is not expected: %+v", seq, acks)
t.Logf("Multiple acks for %d which is not expected: %+v", seq, acks)
}
}
}
@@ -7293,6 +7295,8 @@
t.Logf("\nBAD STATE DETECTED FOR %q, CHECKING OTHER SERVERS! ACK %d vs %+v LEADER %v, CL FOR %q %v\n",
stream, maf, si.State, c.streamLeader(globalAccountName, stream), consumer, c.consumerLeader(globalAccountName, stream, consumer))

t.Logf("TEST ACKS %+v\n", ackMap)

checkStreamAcks(stream)

for _, s := range c.servers {
