Add HasQuorum() to RaftNode that counts peers with LastContact within 2 × ElectionTimeoutMaxMs and returns true only when self + current peers reaches majority. ProposeAsync now throws InvalidOperationException with "no quorum" when HasQuorum() returns false, preventing a partitioned leader from diverging the log. Add 14 tests in RaftQuorumCheckTests.cs covering single-node, 3-node, 5-node, boundary window, and heartbeat restore scenarios. Update RaftHealthTests.LastContact_updates_on_successful_replication to avoid triggering the new quorum guard.
283 lines
10 KiB
C#
283 lines
10 KiB
C#
using NATS.Server.Raft;
|
||
|
||
namespace NATS.Server.Tests.Raft;
|
||
|
||
/// <summary>
/// Tests for HasQuorum() and the quorum guard in ProposeAsync (Gap 8.6).
///
/// A leader must confirm that a majority of peers have contacted it recently
/// (within 2 × ElectionTimeoutMaxMs) before it is allowed to append new log entries.
/// This prevents a partitioned leader from diverging the log while isolated from
/// the rest of the cluster.
///
/// Go reference: raft.go checkQuorum / stepDown — a leader steps down (and therefore
/// blocks proposals) when it has not heard from a quorum of peers within the
/// election-timeout window.
/// </summary>
|
||
public class RaftQuorumCheckTests
|
||
{
|
||
// -- Helpers (self-contained, no shared TestHelpers class) --
|
||
|
||
// Builds an in-memory cluster of `size` nodes named n1..n{size} that share a single
// transport. Each node is registered with the transport, handed the full membership,
// and given a 50-100 ms election timeout range.
private static (RaftNode[] nodes, InMemoryRaftTransport transport) CreateCluster(int size)
{
    var transport = new InMemoryRaftTransport();

    // Construct every node first so that each one can be given the complete membership.
    var members = Array.ConvertAll(
        Enumerable.Range(1, size).ToArray(),
        ordinal => new RaftNode($"n{ordinal}", transport));

    foreach (var member in members)
    {
        transport.Register(member);
        member.ConfigureCluster(members);

        // Short timeouts so tests do not need real async delays.
        member.ElectionTimeoutMinMs = 50;
        member.ElectionTimeoutMaxMs = 100;
    }

    return (members, transport);
}
|
||
|
||
// Runs an election for nodes[0]: it starts the election, then every other node's
// vote response is fed back to it. Returns nodes[0].
private static RaftNode ElectLeader(RaftNode[] nodes)
{
    var leader = nodes[0];
    leader.StartElection(nodes.Length);

    // Index 0 is the candidate itself; collect a vote response from each remaining node.
    for (var i = 1; i < nodes.Length; i++)
    {
        var vote = nodes[i].GrantVote(leader.Term, leader.Id);
        leader.ReceiveVote(vote, nodes.Length);
    }

    return leader;
}
|
||
|
||
// -- HasQuorum tests --
|
||
|
||
// Go reference: raft.go checkQuorum (leader confirms majority contact before acting)
//
// 3-node cluster (leader + 2 peers). No LastContact value is touched here, so the test
// relies on ConfigureCluster having seeded each peer with a timestamp close to UtcNow,
// which keeps both peers inside the quorum window.
[Fact]
public void HasQuorum_returns_true_with_majority_peers_current()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    var hasQuorum = leader.HasQuorum();

    hasQuorum.ShouldBeTrue();
}
|
||
|
||
// Go reference: raft.go checkQuorum (leader steps down when peers are stale)
[Fact]
public void HasQuorum_returns_false_with_stale_peers()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    // Push every peer's last contact five minutes into the past, far beyond the
    // quorum window (2 × 100 ms = 200 ms).
    foreach (var peerState in leader.GetPeerStates().Values)
    {
        peerState.LastContact = DateTime.UtcNow - TimeSpan.FromMinutes(5);
    }

    var hasQuorum = leader.HasQuorum();

    hasQuorum.ShouldBeFalse();
}
|
||
|
||
// Go reference: raft.go — followers never have proposer quorum
[Fact]
public void HasQuorum_returns_false_for_non_leader()
{
    var (nodes, _) = CreateCluster(3);

    // nodes[0] wins the election; the returned leader is not needed here.
    ElectLeader(nodes);

    // nodes[1] only granted a vote, so it must not report itself as leader.
    var follower = nodes[1];
    var followerIsLeader = follower.IsLeader;
    followerIsLeader.ShouldBeFalse();

    var hasQuorum = follower.HasQuorum();

    hasQuorum.ShouldBeFalse();
}
|
||
|
||
// Go reference: raft.go — candidate also does not have proposer quorum
//
// Only StartElection is called; no vote responses are delivered. In a 3-node cluster
// the node's own vote is not a majority, so it is expected to remain a Candidate —
// the Role assertion below verifies that precondition before HasQuorum is checked.
[Fact]
public void HasQuorum_returns_false_for_candidate()
{
    var (nodes, _) = CreateCluster(3);
    var candidate = nodes[0];

    candidate.StartElection(clusterSize: nodes.Length);

    var role = candidate.Role;
    role.ShouldBe(RaftRole.Candidate);

    var hasQuorum = candidate.HasQuorum();
    hasQuorum.ShouldBeFalse();
}
|
||
|
||
// Go reference: raft.go single-node cluster — self is always a majority of one
[Fact]
public void HasQuorum_single_node_always_true()
{
    // A node created without a transport or peers, electing itself in a cluster of one.
    var solo = new RaftNode("solo");
    solo.StartElection(clusterSize: 1);

    var isLeader = solo.IsLeader;
    isLeader.ShouldBeTrue();

    var hasQuorum = solo.HasQuorum();
    hasQuorum.ShouldBeTrue();
}
|
||
|
||
// 5-node cluster: 2 current peers + self = 3 voters, and a majority of 5 is 3, so the
// leader still has quorum.
[Fact]
public void HasQuorum_five_node_with_two_current_peers_is_true()
{
    const int stalePeerCount = 2;

    var (nodes, _) = CreateCluster(5);
    var leader = ElectLeader(nodes);

    // Age the first two peers past the window; the remaining two are left untouched
    // and therefore keep whatever fresh timestamp they already had.
    var peers = leader.GetPeerStates().Values.ToList();
    for (var i = 0; i < stalePeerCount; i++)
    {
        peers[i].LastContact = DateTime.UtcNow - TimeSpan.FromMinutes(5);
    }

    var hasQuorum = leader.HasQuorum();

    hasQuorum.ShouldBeTrue();
}
|
||
|
||
// 5-node cluster: 1 current peer + self = 2 voters, but a majority of 5 is 3, so the
// leader has no quorum.
[Fact]
public void HasQuorum_five_node_with_one_current_peer_is_false()
{
    const int stalePeerCount = 3;

    var (nodes, _) = CreateCluster(5);
    var leader = ElectLeader(nodes);

    // Age three of the four peers past the window; only the last one is left untouched.
    var peers = leader.GetPeerStates().Values.ToList();
    for (var i = 0; i < stalePeerCount; i++)
    {
        peers[i].LastContact = DateTime.UtcNow - TimeSpan.FromMinutes(5);
    }

    var hasQuorum = leader.HasQuorum();

    hasQuorum.ShouldBeFalse();
}
|
||
|
||
// -- ProposeAsync quorum guard tests --
|
||
|
||
// Go reference: raft.go checkQuorum — leader rejects proposals when quorum lost
[Fact]
public async Task ProposeAsync_throws_when_no_quorum()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    // Break quorum by ageing every peer's last contact by five minutes.
    foreach (var peerState in leader.GetPeerStates().Values)
    {
        peerState.LastContact = DateTime.UtcNow - TimeSpan.FromMinutes(5);
    }

    Func<Task> propose = () => leader.ProposeAsync("cmd", CancellationToken.None).AsTask();

    var rejection = await Should.ThrowAsync<InvalidOperationException>(propose);

    // The message identifies the quorum guard as the reason for the rejection.
    rejection.Message.ShouldContain("no quorum");
}
|
||
|
||
// Go reference: raft.go normal proposal path when quorum is confirmed
//
// No LastContact value is modified, so the test relies on ConfigureCluster having
// seeded the peers with fresh timestamps for quorum to hold.
[Fact]
public async Task ProposeAsync_succeeds_with_quorum()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    var proposedIndex = await leader.ProposeAsync("cmd-ok", CancellationToken.None);

    // A positive index was assigned, and the leader reports that same index as applied.
    proposedIndex.ShouldBeGreaterThan(0);
    leader.AppliedIndex.ShouldBe(proposedIndex);
}
|
||
|
||
// Once the peers' LastContact values are refreshed (standing in for a heartbeat round),
// quorum is restored and a previously rejected leader can propose again.
[Fact]
public async Task ProposeAsync_succeeds_after_heartbeat_restores_quorum()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    // Make all peers stale.
    foreach (var (_, state) in leader.GetPeerStates())
    {
        state.LastContact = DateTime.UtcNow.AddMinutes(-5);
    }

    // The proposal must be rejected by the quorum guard specifically. Checking the
    // message keeps this test from passing on an unrelated InvalidOperationException,
    // which would make the "restored" half of the test meaningless.
    var rejection = await Should.ThrowAsync<InvalidOperationException>(
        () => leader.ProposeAsync("should-fail", CancellationToken.None).AsTask());
    rejection.Message.ShouldContain("no quorum");

    // Simulate heartbeat responses updating LastContact on the leader.
    var peerStates = leader.GetPeerStates();
    foreach (var peer in nodes.Skip(1))
    {
        peerStates[peer.Id].LastContact = DateTime.UtcNow;
    }

    // Quorum is restored; proposal should now succeed.
    var index = await leader.ProposeAsync("after-heartbeat", CancellationToken.None);
    index.ShouldBeGreaterThan(0);
}
|
||
|
||
// -- Heartbeat updates LastContact --
|
||
|
||
// Go reference: raft.go processHeartbeat — updates peer last-contact on a valid heartbeat
[Fact]
public void Heartbeat_updates_last_contact()
{
    var (nodes, _) = CreateCluster(3);
    var receiver = nodes[0];

    // Start n2 off with a last-contact timestamp five minutes in the past.
    var staleContact = DateTime.UtcNow - TimeSpan.FromMinutes(5);
    var peerStates = receiver.GetPeerStates();
    peerStates["n2"].LastContact = staleContact;

    receiver.ReceiveHeartbeat(term: 1, fromPeerId: "n2");

    // Look the entry up again after the heartbeat rather than reusing a cached state object.
    var refreshedContact = peerStates["n2"].LastContact;
    refreshedContact.ShouldBeGreaterThan(staleContact);
}
|
||
|
||
// Refreshing a single peer's LastContact is enough to restore quorum in a 3-node cluster.
//
// ReceiveHeartbeat is not invoked in this test. The leader processing a response from
// n2 is simulated by writing n2's LastContact directly on the leader's peer state.
[Fact]
public void Heartbeat_from_peer_restores_peer_freshness_for_quorum()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    // Simulate a network partition: every peer's last contact is five minutes old.
    foreach (var peerState in leader.GetPeerStates().Values)
    {
        peerState.LastContact = DateTime.UtcNow - TimeSpan.FromMinutes(5);
    }

    var quorumWhilePartitioned = leader.HasQuorum();
    quorumWhilePartitioned.ShouldBeFalse();

    // n2 becomes reachable again.
    leader.GetPeerStates()["n2"].LastContact = DateTime.UtcNow;

    // 1 current peer + self = 2 voters; majority of 3 = 2, so quorum is restored.
    var quorumAfterContact = leader.HasQuorum();
    quorumAfterContact.ShouldBeTrue();
}
|
||
|
||
// -- Quorum window boundary tests --
|
||
|
||
// Boundary check, inside the window: ElectionTimeoutMaxMs is 100 (set by CreateCluster),
// so the quorum window is 2 × 100 = 200 ms. A contact 150 ms old is still current.
[Fact]
public void HasQuorum_peer_just_within_window_counts_as_current()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    // 150 ms ago leaves 50 ms of slack before the 200 ms window closes.
    foreach (var peerState in leader.GetPeerStates().Values)
    {
        peerState.LastContact = DateTime.UtcNow - TimeSpan.FromMilliseconds(150);
    }

    var hasQuorum = leader.HasQuorum();

    hasQuorum.ShouldBeTrue();
}
|
||
|
||
// Boundary check, outside the window: ElectionTimeoutMaxMs is 100 (set by CreateCluster),
// so the quorum window is 2 × 100 = 200 ms. A contact 500 ms old is stale.
[Fact]
public void HasQuorum_peer_just_outside_window_is_stale()
{
    var (nodes, _) = CreateCluster(3);
    var leader = ElectLeader(nodes);

    // 500 ms ago is comfortably beyond the 200 ms window.
    foreach (var peerState in leader.GetPeerStates().Values)
    {
        peerState.LastContact = DateTime.UtcNow - TimeSpan.FromMilliseconds(500);
    }

    var hasQuorum = leader.HasQuorum();

    hasQuorum.ShouldBeFalse();
}
|
||
}
|