Files
natsdotnet/tests/NATS.Server.Tests/Raft/RaftQuorumCheckTests.cs
Joseph Doherty 5a62100397 feat: add quorum check before proposing entries (Gap 8.6)
Add HasQuorum() to RaftNode that counts peers with LastContact within
2 × ElectionTimeoutMaxMs and returns true only when self + current peers
reaches majority. ProposeAsync now throws InvalidOperationException with
"no quorum" when HasQuorum() returns false, preventing a partitioned
leader from diverging the log. Add 14 tests in RaftQuorumCheckTests.cs
covering single-node, 3-node, 5-node, boundary window, and heartbeat
restore scenarios. Update RaftHealthTests.LastContact_updates_on_successful_replication
to avoid triggering the new quorum guard.
2026-02-25 08:26:37 -05:00

283 lines
10 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using NATS.Server.Raft;
namespace NATS.Server.Tests.Raft;
/// <summary>
/// Tests for <c>HasQuorum()</c> and the quorum guard in <c>ProposeAsync</c> (Gap 8.6).
///
/// A leader must confirm that a majority of the cluster (itself plus the peers that
/// contacted it within 2 * ElectionTimeoutMaxMs) is reachable before it is allowed to
/// append new log entries. This prevents a partitioned leader from diverging the log
/// while isolated from the rest of the cluster.
///
/// Go reference: raft.go checkQuorum / stepDown - a leader steps down (and therefore
/// blocks proposals) when it has not heard from a quorum of peers within the
/// election-timeout window.
/// </summary>
public class RaftQuorumCheckTests
{
    // -- Test configuration --

    // Short timeouts so tests do not need real async delays.
    private const int ElectionTimeoutMinMs = 50;
    private const int ElectionTimeoutMaxMs = 100;

    // Peers count towards quorum when their LastContact is within this window.
    private const int QuorumWindowMs = 2 * ElectionTimeoutMaxMs;

    // Age used to push a peer far outside the quorum window.
    private static readonly TimeSpan StaleAge = TimeSpan.FromMinutes(5);

    // -- Helpers (self-contained, no shared TestHelpers class) --

    /// <summary>
    /// Builds <paramref name="size"/> nodes named n1..nN on one in-memory transport,
    /// registers each node with the transport, configures it with the full membership
    /// and applies the short election timeouts used by this test class.
    /// </summary>
    /// <param name="size">Number of nodes in the cluster.</param>
    /// <returns>The created nodes (n1 first) and the transport they share.</returns>
    private static (RaftNode[] nodes, InMemoryRaftTransport transport) CreateCluster(int size)
    {
        var transport = new InMemoryRaftTransport();
        var nodes = Enumerable.Range(1, size)
            .Select(i => new RaftNode($"n{i}", transport))
            .ToArray();

        foreach (var node in nodes)
        {
            transport.Register(node);
            node.ConfigureCluster(nodes);
            node.ElectionTimeoutMinMs = ElectionTimeoutMinMs;
            node.ElectionTimeoutMaxMs = ElectionTimeoutMaxMs;
        }

        return (nodes, transport);
    }

    /// <summary>
    /// Runs an election for the first node and feeds it the vote of every other node.
    /// </summary>
    /// <param name="nodes">All cluster members; <c>nodes[0]</c> is the candidate.</param>
    /// <returns>The first node, verified to be the leader.</returns>
    private static RaftNode ElectLeader(RaftNode[] nodes)
    {
        var candidate = nodes[0];
        candidate.StartElection(nodes.Length);

        foreach (var voter in nodes.Skip(1))
        {
            candidate.ReceiveVote(voter.GrantVote(candidate.Term, candidate.Id), nodes.Length);
        }

        // HasQuorum() is false for any non-leader, so without this check every
        // "no quorum" test below would pass vacuously if the election setup broke.
        candidate.IsLeader.ShouldBeTrue("ElectLeader must return a node that actually won the election");
        return candidate;
    }

    /// <summary>
    /// Moves the LastContact of every peer tracked by <paramref name="leader"/> far
    /// outside the quorum window.
    /// </summary>
    private static void MarkAllPeersStale(RaftNode leader)
    {
        var staleContact = DateTime.UtcNow - StaleAge;
        foreach (var (_, state) in leader.GetPeerStates())
        {
            state.LastContact = staleContact;
        }
    }

    /// <summary>
    /// Moves the LastContact of the first <paramref name="count"/> tracked peers far
    /// outside the quorum window and leaves the remaining peers untouched.
    /// </summary>
    private static void MarkPeersStale(RaftNode leader, int count)
    {
        var peerStates = leader.GetPeerStates().Values.ToList();

        // Fail loudly if the cluster is smaller than the scenario assumes.
        peerStates.Count.ShouldBeGreaterThanOrEqualTo(count);

        var staleContact = DateTime.UtcNow - StaleAge;
        foreach (var state in peerStates.Take(count))
        {
            state.LastContact = staleContact;
        }
    }

    // -- HasQuorum tests --

    // Go reference: raft.go checkQuorum (leader confirms majority contact before acting)
    [Fact]
    public void HasQuorum_returns_true_with_majority_peers_current()
    {
        // 3-node cluster: leader + 2 peers. The peers' LastContact is left exactly as
        // ConfigureCluster set it up, which is expected to be within the quorum window.
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);

        leader.HasQuorum().ShouldBeTrue();
    }

    // Go reference: raft.go checkQuorum (leader steps down when peers are stale)
    [Fact]
    public void HasQuorum_returns_false_with_stale_peers()
    {
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);

        // Push every peer contact well beyond the quorum window.
        MarkAllPeersStale(leader);

        leader.HasQuorum().ShouldBeFalse();
    }

    // Go reference: raft.go - followers never have proposer quorum
    [Fact]
    public void HasQuorum_returns_false_for_non_leader()
    {
        var (nodes, _) = CreateCluster(3);
        _ = ElectLeader(nodes);

        // nodes[1] only granted a vote, so it is a follower.
        var follower = nodes[1];

        follower.IsLeader.ShouldBeFalse();
        follower.HasQuorum().ShouldBeFalse();
    }

    // Go reference: raft.go - candidate also does not have proposer quorum
    [Fact]
    public void HasQuorum_returns_false_for_candidate()
    {
        // In a 3-node cluster a node that has only voted for itself is one vote short
        // of the majority of 2, so after StartElection it must still be a Candidate.
        var (nodes, _) = CreateCluster(3);
        var candidate = nodes[0];

        candidate.StartElection(clusterSize: 3);

        candidate.Role.ShouldBe(RaftRole.Candidate);
        candidate.HasQuorum().ShouldBeFalse();
    }

    // Go reference: raft.go single-node cluster - self is always a majority of one
    [Fact]
    public void HasQuorum_single_node_always_true()
    {
        var node = new RaftNode("solo");

        node.StartElection(clusterSize: 1);

        node.IsLeader.ShouldBeTrue();
        node.HasQuorum().ShouldBeTrue();
    }

    // 5-node cluster: 2 current peers + self = 3, majority of 5 is 3, so quorum.
    [Fact]
    public void HasQuorum_five_node_with_two_current_peers_is_true()
    {
        var (nodes, _) = CreateCluster(5);
        var leader = ElectLeader(nodes);

        // 2 of the 4 peers go stale; the other 2 stay fresh.
        MarkPeersStale(leader, count: 2);

        leader.HasQuorum().ShouldBeTrue();
    }

    // 5-node cluster: 1 current peer + self = 2, majority of 5 is 3, so no quorum.
    [Fact]
    public void HasQuorum_five_node_with_one_current_peer_is_false()
    {
        var (nodes, _) = CreateCluster(5);
        var leader = ElectLeader(nodes);

        // 3 of the 4 peers go stale; only 1 stays fresh.
        MarkPeersStale(leader, count: 3);

        leader.HasQuorum().ShouldBeFalse();
    }

    // -- ProposeAsync quorum guard tests --

    // Go reference: raft.go checkQuorum - leader rejects proposals when quorum lost
    [Fact]
    public async Task ProposeAsync_throws_when_no_quorum()
    {
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);
        MarkAllPeersStale(leader);

        var ex = await Should.ThrowAsync<InvalidOperationException>(
            () => leader.ProposeAsync("cmd", CancellationToken.None).AsTask());

        ex.Message.ShouldContain("no quorum");
    }

    // Go reference: raft.go normal proposal path when quorum is confirmed
    [Fact]
    public async Task ProposeAsync_succeeds_with_quorum()
    {
        // Peer contacts are left as ConfigureCluster set them up, so quorum holds.
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);

        var index = await leader.ProposeAsync("cmd-ok", CancellationToken.None);

        index.ShouldBeGreaterThan(0);
        leader.AppliedIndex.ShouldBe(index);
    }

    // Once peer contacts are refreshed, quorum is restored and proposals work again.
    [Fact]
    public async Task ProposeAsync_succeeds_after_heartbeat_restores_quorum()
    {
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);
        MarkAllPeersStale(leader);

        // The proposal must be rejected specifically by the quorum guard, not by some
        // unrelated InvalidOperationException.
        var rejected = await Should.ThrowAsync<InvalidOperationException>(
            () => leader.ProposeAsync("should-fail", CancellationToken.None).AsTask());
        rejected.Message.ShouldContain("no quorum");

        // Simulate heartbeat responses by refreshing LastContact on the leader's
        // view of every peer.
        foreach (var peer in nodes.Skip(1))
        {
            leader.GetPeerStates()[peer.Id].LastContact = DateTime.UtcNow;
        }

        var index = await leader.ProposeAsync("after-heartbeat", CancellationToken.None);

        index.ShouldBeGreaterThan(0);
    }

    // -- Heartbeat updates LastContact --

    // Go reference: raft.go processHeartbeat - updates peer last-contact on a valid heartbeat
    [Fact]
    public void Heartbeat_updates_last_contact()
    {
        var (nodes, _) = CreateCluster(3);
        var node = nodes[0];
        var peerStates = node.GetPeerStates();
        var oldTime = DateTime.UtcNow - StaleAge;
        peerStates["n2"].LastContact = oldTime;

        node.ReceiveHeartbeat(term: 1, fromPeerId: "n2");

        peerStates["n2"].LastContact.ShouldBeGreaterThan(oldTime);
    }

    // A single peer becoming fresh again is enough to restore quorum in a 3-node cluster.
    [Fact]
    public void Heartbeat_from_peer_restores_peer_freshness_for_quorum()
    {
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);

        // Simulate a network partition: every peer is stale.
        MarkAllPeersStale(leader);
        leader.HasQuorum().ShouldBeFalse();

        // Simulate the leader processing a successful response from n2 by refreshing
        // n2's LastContact directly. ReceiveHeartbeat is deliberately not used here:
        // it models a node receiving a heartbeat, not a leader handling a response.
        leader.GetPeerStates()["n2"].LastContact = DateTime.UtcNow;

        // 1 current peer + self = 2 voters; majority of 3 is 2, so quorum is restored.
        leader.HasQuorum().ShouldBeTrue();
    }

    // -- Quorum window boundary tests --

    [Fact]
    public void HasQuorum_peer_just_within_window_counts_as_current()
    {
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);

        // 150 ms ago with a 200 ms window: inside the window, with 50 ms of slack so
        // the time between this assignment and the HasQuorum() call cannot tip it over.
        var insideWindow = DateTime.UtcNow.AddMilliseconds(-(QuorumWindowMs - 50));
        foreach (var (_, state) in leader.GetPeerStates())
        {
            state.LastContact = insideWindow;
        }

        leader.HasQuorum().ShouldBeTrue();
    }

    [Fact]
    public void HasQuorum_peer_just_outside_window_is_stale()
    {
        var (nodes, _) = CreateCluster(3);
        var leader = ElectLeader(nodes);

        // 500 ms ago with a 200 ms window: comfortably outside the window.
        var outsideWindow = DateTime.UtcNow.AddMilliseconds(-(QuorumWindowMs + 300));
        foreach (var (_, state) in leader.GetPeerStates())
        {
            state.LastContact = outsideWindow;
        }

        leader.HasQuorum().ShouldBeFalse();
    }
}