Skip to content

Commit fee0522

Browse files
NRG (2.11): Ignore AEs from older terms (#5661)
Many of the follower resets that we see happen because append entries from previous terms can be in-flight during leadership transfers. This problem is worsened when high or variable latency is involved. When a follower receives an AE from an old term today, they will reset and nuke their WAL and then run a catch-up regardless of the integrity of the log. However this situation ignores the fact that the node might have been otherwise functioning normally and network latency may be worse on some links than others. Silently dropping the AEs from previous terms and restricting the reset behaviour to the last log term and last log index reduces the number of resets considerably whilst maintaining log consistency. Co-authored-by: Reuben Ninan <[email protected]> Signed-off-by: Neil Twigg <[email protected]>
2 parents 3305fc5 + be84a56 commit fee0522

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

server/raft.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3271,6 +3271,10 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) {
32713271
n.debug("Term higher than ours and we are not a follower: %v, stepping down to %q", n.State(), ae.leader)
32723272
n.stepdown.push(ae.leader)
32733273
}
3274+
} else if ae.term < n.term && !catchingUp && isNew {
3275+
n.debug("Ignoring AppendEntry from a leader (%s) with term %d which is less than ours", ae.leader, ae.term)
3276+
n.Unlock()
3277+
return
32743278
}
32753279

32763280
if isNew && n.leader != ae.leader && n.State() == Follower {

server/raft_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,53 @@ func TestNRGObserverMode(t *testing.T) {
220220
}
221221
}
222222

223+
func TestNRGAEFromOldLeader(t *testing.T) {
224+
c := createJetStreamClusterExplicit(t, "R3S", 3)
225+
defer c.shutdown()
226+
227+
nc, _ := jsClientConnect(t, c.leader(), nats.UserInfo("admin", "s3cr3t!"))
228+
defer nc.Close()
229+
230+
rg := c.createMemRaftGroup("TEST", 3, newStateAdder)
231+
rg.waitOnLeader()
232+
233+
// Listen out for catchup requests.
234+
ch := make(chan *nats.Msg, 16)
235+
_, err := nc.ChanSubscribe(fmt.Sprintf(raftCatchupReply, ">"), ch)
236+
require_NoError(t, err)
237+
238+
// Start next term so that we can reuse term 1 in the next step.
239+
leader := rg.leader().node().(*raft)
240+
leader.StepDown()
241+
time.Sleep(time.Millisecond * 100)
242+
rg.waitOnLeader()
243+
require_Equal(t, leader.Term(), 2)
244+
leader = rg.leader().node().(*raft)
245+
246+
// Send an append entry with an outdated term. Beforehand, doing
247+
// so would have caused a WAL reset and then would have triggered
248+
// a Raft-level catchup.
249+
ae := &appendEntry{
250+
term: 1,
251+
pindex: 0,
252+
leader: leader.id,
253+
reply: nc.NewRespInbox(),
254+
}
255+
payload, err := ae.encode(nil)
256+
require_NoError(t, err)
257+
resp, err := nc.Request(leader.asubj, payload, time.Second)
258+
require_NoError(t, err)
259+
260+
// Wait for the response, the server should have rejected it.
261+
ar := leader.decodeAppendEntryResponse(resp.Data)
262+
require_NotNil(t, ar)
263+
require_Equal(t, ar.success, false)
264+
265+
// No catchup should happen at this point because no reset should
266+
// have happened.
267+
require_NoChanRead(t, ch, time.Second*2)
268+
}
269+
223270
// TestNRGSimpleElection tests that a simple election succeeds. It is
224271
// simple because the group hasn't processed any entries and hasn't
225272
// suffered any interruptions of any kind, therefore there should be

0 commit comments

Comments
 (0)