
Commit 24acb45

Cherry-picks for 2.10.23-RC.5 (#6171)

Includes the following: #5661, #5666, #5671, #5344, #5684, #5689, #5691, #5714, #5717, #5707, #5792, #5912, #5957, #5700, #5975, #5991, #5987, #6027, #6038, #6053, #5848, #6055, #6056, #6060, #6061, #6072, #5832, #6073, #6107

Signed-off-by: Neil Twigg <[email protected]>

2 parents: ba4b34f + 4606175

9 files changed: +1646 −737 lines

server/consumer.go

Lines changed: 13 additions & 7 deletions
```diff
@@ -1563,6 +1563,16 @@ func (o *consumer) updateDeliveryInterest(localInterest bool) bool {
 	return false
 }
 
+const (
+	defaultConsumerNotActiveStartInterval = 30 * time.Second
+	defaultConsumerNotActiveMaxInterval   = 5 * time.Minute
+)
+
+var (
+	consumerNotActiveStartInterval = defaultConsumerNotActiveStartInterval
+	consumerNotActiveMaxInterval   = defaultConsumerNotActiveMaxInterval
+)
+
 func (o *consumer) deleteNotActive() {
 	o.mu.Lock()
 	if o.mset == nil {
@@ -1628,12 +1638,8 @@ func (o *consumer) deleteNotActive() {
 	// Check to make sure we went away.
 	// Don't think this needs to be a monitored go routine.
 	go func() {
-		const (
-			startInterval = 30 * time.Second
-			maxInterval   = 5 * time.Minute
-		)
-		jitter := time.Duration(rand.Int63n(int64(startInterval)))
-		interval := startInterval + jitter
+		jitter := time.Duration(rand.Int63n(int64(consumerNotActiveStartInterval)))
+		interval := consumerNotActiveStartInterval + jitter
 		ticker := time.NewTicker(interval)
 		defer ticker.Stop()
 		for range ticker.C {
@@ -1648,7 +1654,7 @@ func (o *consumer) deleteNotActive() {
 			if nca != nil && nca == ca {
 				s.Warnf("Consumer assignment for '%s > %s > %s' not cleaned up, retrying", acc, stream, name)
 				meta.ForwardProposal(removeEntry)
-				if interval < maxInterval {
+				if interval < consumerNotActiveMaxInterval {
 					interval *= 2
 					ticker.Reset(interval)
 				}
```
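The retry loop in `deleteNotActive` is a jittered exponential backoff: start at a randomized interval, double after every failed cleanup attempt, and cap at a maximum. Promoting the intervals to package-level vars (with the defaults kept in consts) is what lets tests shorten them, as `jetstream_cluster_3_test.go` does below. A minimal standalone sketch of the same pattern; `retryWithBackoff` and its parameters are illustrative, not part of the server:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// retryWithBackoff retries try() on a ticker, starting from start plus a
// random jitter and doubling the interval after each failure up to max --
// the same shape as the deleteNotActive retry goroutine above.
func retryWithBackoff(start, max time.Duration, try func() bool) {
	jitter := time.Duration(rand.Int63n(int64(start)))
	interval := start + jitter
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for range ticker.C {
		if try() {
			return
		}
		if interval < max {
			interval *= 2
			ticker.Reset(interval)
		}
	}
}

func main() {
	attempts := 0
	// Short intervals for demonstration; the server defaults are 30s and 5m.
	retryWithBackoff(50*time.Millisecond, time.Second, func() bool {
		attempts++
		fmt.Println("cleanup attempt", attempts)
		return attempts == 3
	})
}
```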

server/filestore.go

Lines changed: 12 additions & 4 deletions
```diff
@@ -10023,14 +10023,22 @@ func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
 // sets O_SYNC on the open file if SyncAlways is set. The dios semaphore is
 // handled automatically by this function, so don't wrap calls to it in dios.
 func (fs *fileStore) writeFileWithOptionalSync(name string, data []byte, perm fs.FileMode) error {
+	if fs.fcfg.SyncAlways {
+		return writeFileWithSync(name, data, perm)
+	}
 	<-dios
 	defer func() {
 		dios <- struct{}{}
 	}()
-	flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC
-	if fs.fcfg.SyncAlways {
-		flags |= os.O_SYNC
-	}
+	return os.WriteFile(name, data, perm)
+}
+
+func writeFileWithSync(name string, data []byte, perm fs.FileMode) error {
+	<-dios
+	defer func() {
+		dios <- struct{}{}
+	}()
+	flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC | os.O_SYNC
 	f, err := os.OpenFile(name, flags, perm)
 	if err != nil {
 		return err
```
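The `dios` channel used here is the file store's I/O semaphore: a buffered channel whose tokens are drained to acquire a slot and pushed back to release it. Splitting the `O_SYNC` path into its own `writeFileWithSync` keeps the common path on a plain `os.WriteFile`. A minimal sketch of the semaphore idiom, assuming an illustrative capacity of 4 (the server sizes and fills its channel differently):

```go
package main

import (
	"fmt"
	"os"
)

// dios is a counting semaphore built from a buffered channel; each token
// represents one available disk-I/O slot. The capacity here is illustrative.
var dios = make(chan struct{}, 4)

func init() {
	// Fill the channel so that cap(dios) writers can proceed concurrently.
	for i := 0; i < cap(dios); i++ {
		dios <- struct{}{}
	}
}

// writeFile holds an I/O slot for the duration of the write, mirroring the
// acquire/release pattern of writeFileWithOptionalSync above.
func writeFile(name string, data []byte, perm os.FileMode) error {
	<-dios // acquire a slot
	defer func() {
		dios <- struct{}{} // release it
	}()
	return os.WriteFile(name, data, perm)
}

func main() {
	if err := writeFile("example.txt", []byte("hello"), 0644); err != nil {
		fmt.Println("write failed:", err)
	}
}
```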

server/jetstream_cluster.go

Lines changed: 28 additions & 34 deletions
```diff
@@ -1431,10 +1431,6 @@ func (js *jetStream) monitorCluster() {
 			aq.recycle(&ces)
 
 		case isLeader = <-lch:
-			// For meta layer synchronize everyone to our state on becoming leader.
-			if isLeader && n.ApplyQ().len() == 0 {
-				n.SendSnapshot(js.metaSnapshot())
-			}
 			// Process the change.
 			js.processLeaderChange(isLeader)
 			if isLeader {
```
```diff
@@ -2129,8 +2125,32 @@ func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage Stor
 	}
 
 	// Check if we already have this assigned.
+retry:
 	if node := s.lookupRaftNode(rg.Name); node != nil {
+		if node.State() == Closed {
+			// We're waiting for this node to finish shutting down before we replace it.
+			js.mu.Unlock()
+			node.WaitForStop()
+			js.mu.Lock()
+			goto retry
+		}
 		s.Debugf("JetStream cluster already has raft group %q assigned", rg.Name)
+		// Check and see if the group has the same peers. If not then we
+		// will update the known peers, which will send a peerstate if leader.
+		groupPeerIDs := append([]string{}, rg.Peers...)
+		var samePeers bool
+		if nodePeers := node.Peers(); len(rg.Peers) == len(nodePeers) {
+			nodePeerIDs := make([]string, 0, len(nodePeers))
+			for _, n := range nodePeers {
+				nodePeerIDs = append(nodePeerIDs, n.ID)
+			}
+			slices.Sort(groupPeerIDs)
+			slices.Sort(nodePeerIDs)
+			samePeers = slices.Equal(groupPeerIDs, nodePeerIDs)
+		}
+		if !samePeers {
+			node.UpdateKnownPeers(groupPeerIDs)
+		}
 		rg.node = node
 		js.mu.Unlock()
 		return nil
```
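The peer check added above compares the two ID sets order-insensitively by sorting copies before `slices.Equal`, so a mere reordering of peers does not trigger `UpdateKnownPeers`. The same check, extracted into a self-contained sketch (`samePeerSet` is an illustrative name, not a server function):

```go
package main

import (
	"fmt"
	"slices"
)

// samePeerSet reports whether a and b contain the same peer IDs regardless
// of order. It sorts clones so the callers' slices are left untouched.
func samePeerSet(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	as, bs := slices.Clone(a), slices.Clone(b)
	slices.Sort(as)
	slices.Sort(bs)
	return slices.Equal(as, bs)
}

func main() {
	fmt.Println(samePeerSet([]string{"n1", "n2", "n3"}, []string{"n3", "n1", "n2"})) // true
	fmt.Println(samePeerSet([]string{"n1", "n2"}, []string{"n1", "n4"}))             // false
}
```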
```diff
@@ -8959,17 +8979,6 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) {
 	// mset.store never changes after being set, don't need lock.
 	mset.store.FastState(&state)
 
-	// Reset notion of first if this request wants sequences before our starting sequence
-	// and we would have nothing to send. If we have partial messages still need to send skips for those.
-	// We will keep sreq's first sequence to not create sequence mismatches on the follower, but we extend the last to our current state.
-	if sreq.FirstSeq < state.FirstSeq && state.FirstSeq > sreq.LastSeq {
-		s.Debugf("Catchup for stream '%s > %s' resetting request first sequence from %d to %d",
-			mset.account(), mset.name(), sreq.FirstSeq, state.FirstSeq)
-		if state.LastSeq > sreq.LastSeq {
-			sreq.LastSeq = state.LastSeq
-		}
-	}
-
 	// Setup sequences to walk through.
 	seq, last := sreq.FirstSeq, sreq.LastSeq
 	mset.setCatchupPeer(sreq.Peer, last-seq)
```
```diff
@@ -9133,25 +9142,10 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) {
 			if drOk && dr.First > 0 {
 				sendDR()
 			}
-			// Check for a condition where our state's first is now past the last that we could have sent.
-			// If so reset last and continue sending.
-			var state StreamState
-			mset.mu.RLock()
-			mset.store.FastState(&state)
-			mset.mu.RUnlock()
-			if last < state.FirstSeq {
-				last = state.LastSeq
-			}
-			// Recheck our exit condition.
-			if seq == last {
-				if drOk && dr.First > 0 {
-					sendDR()
-				}
-				s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name())
-				// EOF
-				s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
-				return false
-			}
+			s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name())
+			// EOF
+			s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
+			return false
 		}
 		select {
 		case <-remoteQuitCh:
```

server/jetstream_cluster_2_test.go

Lines changed: 16 additions & 4 deletions
```diff
@@ -6647,12 +6647,24 @@ func TestJetStreamClusterSnapshotBeforePurgeAndCatchup(t *testing.T) {
 		return nil
 	})
 
-	// Make sure we only sent 1002 sync catchup msgs.
-	// This is for the new messages, the delete range, and the EOF.
+	// Make sure we only sent 2 sync catchup msgs.
+	// This is for the delete range, and the EOF.
 	nmsgs, _, _ := sub.Pending()
-	if nmsgs != 1002 {
-		t.Fatalf("Expected only 1002 sync catchup msgs to be sent signaling eof, but got %d", nmsgs)
+	if nmsgs != 2 {
+		t.Fatalf("Expected only 2 sync catchup msgs to be sent signaling eof, but got %d", nmsgs)
 	}
+
+	msg, err := sub.NextMsg(0)
+	require_NoError(t, err)
+	mbuf := msg.Data[1:]
+	dr, err := decodeDeleteRange(mbuf)
+	require_NoError(t, err)
+	require_Equal(t, dr.First, 1001)
+	require_Equal(t, dr.Num, 1000)
+
+	msg, err = sub.NextMsg(0)
+	require_NoError(t, err)
+	require_Equal(t, len(msg.Data), 0)
 }
 
 func TestJetStreamClusterStreamResetWithLargeFirstSeq(t *testing.T) {
```
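The two expected messages are the delete range followed by the zero-length EOF marker. Going by the assertions on `dr.First` and `dr.Num`, a delete range carries a starting sequence and a count, letting one message stand in for a thousand per-sequence skips. A sketch of that shape; the server's actual wire encoding behind `decodeDeleteRange` is not reproduced here:

```go
package main

import "fmt"

// DeleteRange describes a contiguous run of missing sequences: starting at
// First and spanning Num sequences. The test above expects a single range
// with First=1001 and Num=1000 instead of a thousand skip messages.
type DeleteRange struct {
	First, Num uint64
}

func main() {
	dr := DeleteRange{First: 1001, Num: 1000}
	fmt.Printf("skip sequences %d..%d\n", dr.First, dr.First+dr.Num-1)
}
```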

server/jetstream_cluster_3_test.go

Lines changed: 12 additions & 2 deletions
```diff
@@ -1600,6 +1600,11 @@ func TestJetStreamClusterParallelConsumerCreation(t *testing.T) {
 }
 
 func TestJetStreamClusterGhostEphemeralsAfterRestart(t *testing.T) {
+	consumerNotActiveStartInterval = time.Second * 5
+	defer func() {
+		consumerNotActiveStartInterval = defaultConsumerNotActiveStartInterval
+	}()
+
 	c := createJetStreamClusterExplicit(t, "R3S", 3)
 	defer c.shutdown()
 
@@ -1632,6 +1637,7 @@ func TestJetStreamClusterGhostEphemeralsAfterRestart(t *testing.T) {
 	time.Sleep(2 * time.Second)
 
 	// Restart first and wait so that we know it will try cleanup without a metaleader.
+	// It will fail as there's no metaleader at that time, it should keep retrying on an interval.
 	c.restartServer(rs)
 	time.Sleep(time.Second)
 
@@ -1643,8 +1649,9 @@ func TestJetStreamClusterGhostEphemeralsAfterRestart(t *testing.T) {
 	defer nc.Close()
 
 	subj := fmt.Sprintf(JSApiConsumerListT, "TEST")
-	checkFor(t, 10*time.Second, 200*time.Millisecond, func() error {
-		m, err := nc.Request(subj, nil, time.Second)
+	checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
+		// Request will take at most 4 seconds if some consumers can't be found.
+		m, err := nc.Request(subj, nil, 5*time.Second)
 		if err != nil {
 			return err
 		}
@@ -3910,6 +3917,7 @@ func TestJetStreamClusterStreamNodeShutdownBugOnStop(t *testing.T) {
 	node.InstallSnapshot(mset.stateSnapshot())
 	// Stop the stream
 	mset.stop(false, false)
+	node.WaitForStop()
 
 	if numNodes := s.numRaftNodes(); numNodes != numNodesStart-1 {
 		t.Fatalf("RAFT nodes after stream stop incorrect: %d vs %d", numNodesStart, numNodes)
```
```diff
@@ -5801,6 +5809,8 @@ func TestJetStreamClusterDetectOrphanNRGs(t *testing.T) {
 
 	// Should only be meta NRG left.
 	require_True(t, s.numRaftNodes() == 1)
+	s.rnMu.RLock()
+	defer s.rnMu.RUnlock()
 	require_True(t, s.lookupRaftNode(sgn) == nil)
 	require_True(t, s.lookupRaftNode(ogn) == nil)
 }
```
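The added `rnMu` read lock makes the `lookupRaftNode` assertions safe against concurrent access to the server's raft node map. A toy model of that locking discipline; only `rnMu`'s role matches the server, the types are illustrative:

```go
package main

import (
	"fmt"
	"sync"
)

type raftNode struct{ name string }

// registry guards a name-to-node map with an RWMutex, as the server guards
// its raft node map with rnMu; lookups take the read lock so they are safe
// against concurrent registration and removal.
type registry struct {
	mu    sync.RWMutex
	nodes map[string]*raftNode
}

func (r *registry) lookup(name string) *raftNode {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.nodes[name]
}

func main() {
	r := &registry{nodes: make(map[string]*raftNode)}
	fmt.Println(r.lookup("orphan-group") == nil) // true: the group is gone
}
```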
