Skip to content

Commit 69ff59e

Browse files
NRG: Campaign early unless already candidate (#6511)
If an outdated server wants to become leader, other servers can recognize they have a more up-to-date and start campaigning early to speed up the leader election process. However, if two servers would both become candidate at the same time due to this, and one of the candidate's log is slightly more ahead it would again start campaigning early. This can essentially short-circuit the voting process, starting a new leader election/term while that wouldn't be necessary to do. That ahead candidate could just reject the vote for the other candidate, and wait for successful votes for itself to come in. Signed-off-by: Maurice van Veen <[email protected]>
2 parents f148459 + 42d7651 commit 69ff59e

File tree

2 files changed

+58
-5
lines changed

2 files changed

+58
-5
lines changed

server/raft.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4138,11 +4138,10 @@ func (n *raft) processVoteRequest(vr *voteRequest) error {
41384138
n.vote = vr.candidate
41394139
n.writeTermVote()
41404140
n.resetElectionTimeout()
4141-
} else {
4142-
if vr.term >= n.term && n.vote == noVote {
4143-
n.term = vr.term
4144-
n.resetElect(randCampaignTimeout())
4145-
}
4141+
} else if n.vote == noVote && n.State() != Candidate {
4142+
// We have a more up-to-date log, and haven't voted yet.
4143+
// Start campaigning earlier, but only if not candidate already, as that would short-circuit us.
4144+
n.resetElect(randCampaignTimeout())
41464145
}
41474146

41484147
// Term might have changed, make sure response has the most current

server/raft_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,60 @@ func TestNRGUnsuccessfulVoteRequestDoesntResetElectionTimer(t *testing.T) {
526526
require_True(t, followerEqual)
527527
}
528528

529+
func TestNRGUnsuccessfulVoteRequestCampaignEarly(t *testing.T) {
530+
n, cleanup := initSingleMemRaftNode(t)
531+
defer cleanup()
532+
533+
nats0 := "S1Nunr6R" // "nats-0"
534+
n.etlr = time.Time{}
535+
536+
// Simple case: we are follower and vote for a candidate.
537+
require_NoError(t, n.processVoteRequest(&voteRequest{term: 1, lastTerm: 0, lastIndex: 0, candidate: nats0}))
538+
require_Equal(t, n.term, 1)
539+
require_Equal(t, n.vote, nats0)
540+
require_NotEqual(t, n.etlr, time.Time{}) // Resets election timer as it voted.
541+
n.etlr = time.Time{}
542+
543+
// We are follower and deny vote for outdated candidate.
544+
n.pterm, n.pindex = 1, 100
545+
require_NoError(t, n.processVoteRequest(&voteRequest{term: 2, lastTerm: 1, lastIndex: 2, candidate: nats0}))
546+
require_Equal(t, n.term, 2)
547+
require_Equal(t, n.vote, noVote)
548+
require_NotEqual(t, n.etlr, time.Time{}) // Resets election timer as it starts campaigning.
549+
n.etlr = time.Time{}
550+
551+
// Switch to candidate.
552+
n.pterm, n.pindex = 2, 200
553+
n.switchToCandidate()
554+
require_Equal(t, n.term, 3)
555+
require_Equal(t, n.State(), Candidate)
556+
require_NotEqual(t, n.etlr, time.Time{}) // Resets election timer as part of switching state.
557+
n.etlr = time.Time{}
558+
559+
// We are candidate and deny vote for outdated candidate. But they were on a more recent term, restart campaign.
560+
require_NoError(t, n.processVoteRequest(&voteRequest{term: 4, lastTerm: 1, lastIndex: 2, candidate: nats0}))
561+
require_Equal(t, n.term, 4)
562+
require_Equal(t, n.vote, noVote)
563+
require_NotEqual(t, n.etlr, time.Time{}) // Resets election timer as it restarts campaigning.
564+
n.etlr = time.Time{}
565+
566+
// Switch to candidate.
567+
n.pterm, n.pindex = 4, 400
568+
n.switchToCandidate()
569+
require_Equal(t, n.term, 5)
570+
require_Equal(t, n.State(), Candidate)
571+
require_NotEqual(t, n.etlr, time.Time{}) // Resets election timer as part of switching state.
572+
n.etlr = time.Time{}
573+
574+
// We are candidate and deny vote for outdated candidate. Don't start campaigning early.
575+
require_NoError(t, n.processVoteRequest(&voteRequest{term: 5, lastTerm: 1, lastIndex: 2, candidate: nats0}))
576+
require_Equal(t, n.term, 5)
577+
require_Equal(t, n.vote, noVote)
578+
// Election timer must NOT be updated as that would mean another candidate that we don't vote
579+
// for can short-circuit us by making us restart elections, denying us the ability to become leader.
580+
require_Equal(t, n.etlr, time.Time{})
581+
}
582+
529583
func TestNRGInvalidTAVDoesntPanic(t *testing.T) {
530584
c := createJetStreamClusterExplicit(t, "R3S", 3)
531585
defer c.shutdown()

0 commit comments

Comments
 (0)