feat: add quorum check before proposing entries (Gap 8.6)
Add HasQuorum() to RaftNode that counts peers with LastContact within 2 × ElectionTimeoutMaxMs and returns true only when self + current peers reaches majority. ProposeAsync now throws InvalidOperationException with "no quorum" when HasQuorum() returns false, preventing a partitioned leader from diverging the log. Add 14 tests in RaftQuorumCheckTests.cs covering single-node, 3-node, 5-node, boundary window, and heartbeat restore scenarios. Update RaftHealthTests.LastContact_updates_on_successful_replication to avoid triggering the new quorum guard.
This commit is contained in:
282
tests/NATS.Server.Tests/Raft/RaftQuorumCheckTests.cs
Normal file
282
tests/NATS.Server.Tests/Raft/RaftQuorumCheckTests.cs
Normal file
@@ -0,0 +1,282 @@
|
||||
using NATS.Server.Raft;
|
||||
|
||||
namespace NATS.Server.Tests.Raft;
|
||||
|
||||
/// <summary>
|
||||
/// Tests for HasQuorum() and the quorum guard in ProposeAsync (Gap 8.6).
|
||||
///
|
||||
/// A leader must confirm that a majority of peers have contacted it recently
|
||||
/// (within 2 × ElectionTimeoutMaxMs) before it is allowed to append new log entries.
|
||||
/// This prevents a partitioned leader from diverging the log while isolated from
|
||||
/// the rest of the cluster.
|
||||
///
|
||||
/// Go reference: raft.go checkQuorum / stepDown — a leader steps down (and therefore
|
||||
/// blocks proposals) when it has not heard from a quorum of peers within the
|
||||
/// election-timeout window.
|
||||
/// </summary>
|
||||
public class RaftQuorumCheckTests
|
||||
{
|
||||
// -- Helpers (self-contained, no shared TestHelpers class) --
|
||||
|
||||
private static (RaftNode[] nodes, InMemoryRaftTransport transport) CreateCluster(int size)
|
||||
{
|
||||
var transport = new InMemoryRaftTransport();
|
||||
var nodes = Enumerable.Range(1, size)
|
||||
.Select(i => new RaftNode($"n{i}", transport))
|
||||
.ToArray();
|
||||
foreach (var node in nodes)
|
||||
{
|
||||
transport.Register(node);
|
||||
node.ConfigureCluster(nodes);
|
||||
// Short timeouts so tests do not need real async delays.
|
||||
node.ElectionTimeoutMinMs = 50;
|
||||
node.ElectionTimeoutMaxMs = 100;
|
||||
}
|
||||
return (nodes, transport);
|
||||
}
|
||||
|
||||
private static RaftNode ElectLeader(RaftNode[] nodes)
|
||||
{
|
||||
var candidate = nodes[0];
|
||||
candidate.StartElection(nodes.Length);
|
||||
foreach (var voter in nodes.Skip(1))
|
||||
candidate.ReceiveVote(voter.GrantVote(candidate.Term, candidate.Id), nodes.Length);
|
||||
return candidate;
|
||||
}
|
||||
|
||||
// -- HasQuorum tests --
|
||||
|
||||
// Go reference: raft.go checkQuorum (leader confirms majority contact before acting)
|
||||
[Fact]
|
||||
public void HasQuorum_returns_true_with_majority_peers_current()
|
||||
{
|
||||
// 3-node cluster: leader + 2 peers. Both peers are freshly initialized by
|
||||
// ConfigureCluster so their LastContact is very close to UtcNow.
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Peers were initialized with DateTime.UtcNow — they are within the quorum window.
|
||||
leader.HasQuorum().ShouldBeTrue();
|
||||
}
|
||||
|
||||
// Go reference: raft.go checkQuorum (leader steps down when peers are stale)
|
||||
[Fact]
|
||||
public void HasQuorum_returns_false_with_stale_peers()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Set all peer contacts well beyond the quorum window (2 × 100 ms = 200 ms).
|
||||
foreach (var (_, state) in leader.GetPeerStates())
|
||||
state.LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
|
||||
leader.HasQuorum().ShouldBeFalse();
|
||||
}
|
||||
|
||||
// Go reference: raft.go — followers never have proposer quorum
|
||||
[Fact]
|
||||
public void HasQuorum_returns_false_for_non_leader()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
_ = ElectLeader(nodes);
|
||||
|
||||
// nodes[1] is a follower.
|
||||
var follower = nodes[1];
|
||||
follower.IsLeader.ShouldBeFalse();
|
||||
|
||||
follower.HasQuorum().ShouldBeFalse();
|
||||
}
|
||||
|
||||
// Go reference: raft.go — candidate also does not have proposer quorum
|
||||
[Fact]
|
||||
public void HasQuorum_returns_false_for_candidate()
|
||||
{
|
||||
// A node becomes a Candidate when StartElection is called but it has not yet
|
||||
// received enough votes to become Leader. In a 3-node cluster, after calling
|
||||
// StartElection on n1 the node is a Candidate (it voted for itself but the
|
||||
// other 2 nodes have not yet responded).
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var candidate = nodes[0];
|
||||
|
||||
// StartElection increments term, sets VotedFor=self, and calls TryBecomeLeader.
|
||||
// With only 1 self-vote in a 3-node cluster quorum is 2, so role stays Candidate.
|
||||
candidate.StartElection(clusterSize: 3);
|
||||
|
||||
candidate.Role.ShouldBe(RaftRole.Candidate);
|
||||
candidate.HasQuorum().ShouldBeFalse();
|
||||
}
|
||||
|
||||
// Go reference: raft.go single-node cluster — self is always a majority of one
|
||||
[Fact]
|
||||
public void HasQuorum_single_node_always_true()
|
||||
{
|
||||
var node = new RaftNode("solo");
|
||||
node.StartElection(clusterSize: 1);
|
||||
|
||||
node.IsLeader.ShouldBeTrue();
|
||||
node.HasQuorum().ShouldBeTrue();
|
||||
}
|
||||
|
||||
// 5-node cluster: with 2 current peers + self = 3, majority of 5 is 3, so quorum.
|
||||
[Fact]
|
||||
public void HasQuorum_five_node_with_two_current_peers_is_true()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(5);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Make 2 peers stale; keep 2 fresh (plus self = 3 voters, majority of 5 = 3).
|
||||
var peerStates = leader.GetPeerStates().Values.ToList();
|
||||
peerStates[0].LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
peerStates[1].LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
// peerStates[2] and peerStates[3] remain fresh (within window).
|
||||
|
||||
leader.HasQuorum().ShouldBeTrue();
|
||||
}
|
||||
|
||||
// 5-node cluster: with only 1 current peer + self = 2, majority of 5 is 3, so no quorum.
|
||||
[Fact]
|
||||
public void HasQuorum_five_node_with_one_current_peer_is_false()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(5);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Make 3 out of 4 peers stale; only 1 fresh peer + self = 2 voters (need 3).
|
||||
var peerStates = leader.GetPeerStates().Values.ToList();
|
||||
peerStates[0].LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
peerStates[1].LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
peerStates[2].LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
// peerStates[3] is fresh.
|
||||
|
||||
leader.HasQuorum().ShouldBeFalse();
|
||||
}
|
||||
|
||||
// -- ProposeAsync quorum guard tests --
|
||||
|
||||
// Go reference: raft.go checkQuorum — leader rejects proposals when quorum lost
|
||||
[Fact]
|
||||
public async Task ProposeAsync_throws_when_no_quorum()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Make all peers stale to break quorum.
|
||||
foreach (var (_, state) in leader.GetPeerStates())
|
||||
state.LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
|
||||
var ex = await Should.ThrowAsync<InvalidOperationException>(
|
||||
() => leader.ProposeAsync("cmd", CancellationToken.None).AsTask());
|
||||
|
||||
ex.Message.ShouldContain("no quorum");
|
||||
}
|
||||
|
||||
// Go reference: raft.go normal proposal path when quorum is confirmed
|
||||
[Fact]
|
||||
public async Task ProposeAsync_succeeds_with_quorum()
|
||||
{
|
||||
// Peers are initialized with fresh LastContact by ConfigureCluster, so quorum holds.
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
var index = await leader.ProposeAsync("cmd-ok", CancellationToken.None);
|
||||
|
||||
index.ShouldBeGreaterThan(0);
|
||||
leader.AppliedIndex.ShouldBe(index);
|
||||
}
|
||||
|
||||
// After a heartbeat round, peers are fresh and quorum is restored.
|
||||
[Fact]
|
||||
public async Task ProposeAsync_succeeds_after_heartbeat_restores_quorum()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Make all peers stale.
|
||||
foreach (var (_, state) in leader.GetPeerStates())
|
||||
state.LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
|
||||
// Proposal should fail with no quorum.
|
||||
await Should.ThrowAsync<InvalidOperationException>(
|
||||
() => leader.ProposeAsync("should-fail", CancellationToken.None).AsTask());
|
||||
|
||||
// Simulate heartbeat responses updating LastContact on the leader.
|
||||
foreach (var peer in nodes.Skip(1))
|
||||
leader.GetPeerStates()[peer.Id].LastContact = DateTime.UtcNow;
|
||||
|
||||
// Quorum is restored; proposal should now succeed.
|
||||
var index = await leader.ProposeAsync("after-heartbeat", CancellationToken.None);
|
||||
index.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
// -- Heartbeat updates LastContact --
|
||||
|
||||
// Go reference: raft.go processHeartbeat — updates peer last-contact on a valid heartbeat
|
||||
[Fact]
|
||||
public void Heartbeat_updates_last_contact()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var node = nodes[0];
|
||||
|
||||
var peerStates = node.GetPeerStates();
|
||||
var oldTime = DateTime.UtcNow.AddMinutes(-5);
|
||||
peerStates["n2"].LastContact = oldTime;
|
||||
|
||||
node.ReceiveHeartbeat(term: 1, fromPeerId: "n2");
|
||||
|
||||
peerStates["n2"].LastContact.ShouldBeGreaterThan(oldTime);
|
||||
}
|
||||
|
||||
// Heartbeats from the leader to the cluster restore the leader's quorum tracking.
|
||||
[Fact]
|
||||
public void Heartbeat_from_peer_restores_peer_freshness_for_quorum()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// Simulate network partition: mark all peers stale.
|
||||
foreach (var (_, state) in leader.GetPeerStates())
|
||||
state.LastContact = DateTime.UtcNow.AddMinutes(-5);
|
||||
|
||||
leader.HasQuorum().ShouldBeFalse();
|
||||
|
||||
// Leader receives heartbeat ACK from n2 (simulating that n2 is still reachable).
|
||||
// In a real RAFT loop the leader sends AppendEntries and processes the response;
|
||||
// here we simulate the response side by directly updating LastContact via ReceiveHeartbeat.
|
||||
// Note: ReceiveHeartbeat is called on a follower when it receives from the leader, not
|
||||
// on the leader itself. We instead update LastContact directly to simulate the leader
|
||||
// processing an AppendEntries response.
|
||||
leader.GetPeerStates()["n2"].LastContact = DateTime.UtcNow;
|
||||
|
||||
// 1 current peer + self = 2 voters; majority of 3 = 2, so quorum is restored.
|
||||
leader.HasQuorum().ShouldBeTrue();
|
||||
}
|
||||
|
||||
// -- Quorum window boundary tests --
|
||||
|
||||
[Fact]
|
||||
public void HasQuorum_peer_just_within_window_counts_as_current()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// ElectionTimeoutMaxMs = 100; window = 2 × 100 = 200 ms.
|
||||
// Set LastContact to 150 ms ago — just inside the 200 ms window.
|
||||
foreach (var (_, state) in leader.GetPeerStates())
|
||||
state.LastContact = DateTime.UtcNow.AddMilliseconds(-150);
|
||||
|
||||
leader.HasQuorum().ShouldBeTrue();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HasQuorum_peer_just_outside_window_is_stale()
|
||||
{
|
||||
var (nodes, _) = CreateCluster(3);
|
||||
var leader = ElectLeader(nodes);
|
||||
|
||||
// ElectionTimeoutMaxMs = 100; window = 2 × 100 = 200 ms.
|
||||
// Set LastContact to 500 ms ago — comfortably outside the 200 ms window.
|
||||
foreach (var (_, state) in leader.GetPeerStates())
|
||||
state.LastContact = DateTime.UtcNow.AddMilliseconds(-500);
|
||||
|
||||
leader.HasQuorum().ShouldBeFalse();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user