Files
natsdotnet/tests/NATS.E2E.Cluster.Tests/RaftConsensusTests.cs
Joseph Doherty f64b7103f4 test: add gateway failover E2E tests and fix SW003/SW004 violations across cluster tests
Replace all Task.Delay-based interest propagation waits with active probe loops
(PeriodicTimer + publish-and-read) in GatewayFailoverTests, LeafNodeFailoverTests,
JetStreamClusterTests, and RaftConsensusTests. Fix SW003 empty-catch violations in
ClusterResilienceTests by adding _ = e discard statements. Correct State.Messages
type from ulong to long to match the NATS.Client.JetStream API.
2026-03-12 23:38:18 -04:00

189 lines
6.5 KiB
C#

using NATS.Client.Core;
using NATS.Client.JetStream;
using NATS.Client.JetStream.Models;
using NATS.E2E.Cluster.Tests.Infrastructure;
namespace NATS.E2E.Cluster.Tests;
// Go reference: server/raft_test.go — TestNRGLeaderElection, TestNRGStepDown,
// TestNRGAppendEntry, TestNRGCatchup
public class RaftConsensusTests(JetStreamClusterFixture fixture) : IClassFixture<JetStreamClusterFixture>
{
    /// <summary>
    /// Polls until the stream on the given node reports at least <paramref name="minMessages"/>,
    /// or until <paramref name="ct"/> is cancelled (surfacing <see cref="OperationCanceledException"/>).
    /// Probes immediately on entry, then paces subsequent probes at 200 ms intervals, so callers
    /// do not pay a full timer tick when the condition already holds.
    /// </summary>
    /// <param name="js">JetStream context bound to the node being probed.</param>
    /// <param name="streamName">Name of the stream to inspect.</param>
    /// <param name="minMessages">Minimum message count that satisfies the wait.</param>
    /// <param name="ct">Deadline token; cancellation aborts the poll loop.</param>
    private static async Task WaitForStreamMessagesAsync(
        NatsJSContext js,
        string streamName,
        long minMessages,
        CancellationToken ct)
    {
        using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(200));
        while (true)
        {
            try
            {
                var info = await js.GetStreamAsync(streamName, cancellationToken: ct);
                if (info.Info.State.Messages >= minMessages)
                    return;
            }
            catch (NatsJSApiException ex)
            {
                // Stream not yet available on this node — keep polling.
                // SW003: explicit discard instead of an empty catch body.
                // NOTE(review): transport-level failures during failover (e.g. no responders)
                // are not caught here — confirm the client retries those internally.
                _ = ex;
            }

            // Pace the next probe; a false return means the timer was disposed,
            // which mirrors the original loop's silent-exit condition.
            if (!await timer.WaitForNextTickAsync(ct).ConfigureAwait(false))
                return;
        }
    }

    // Go ref: server/raft_test.go TestNRGLeaderElection
    /// <summary>
    /// Creating an R3 stream requires the meta-group to have elected a leader;
    /// a successful create therefore proves cluster-wide leader election worked.
    /// </summary>
    [Fact]
    public async Task LeaderElection_ClusterFormsLeader()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        var ct = cts.Token;
        await using var client = fixture.CreateClient(0);
        var js = new NatsJSContext(client);

        var stream = await js.CreateStreamAsync(
            new StreamConfig("RAFT_LEADER", ["raft.leader.>"])
            {
                NumReplicas = 3
            },
            ct);

        stream.Info.Config.Name.ShouldBe("RAFT_LEADER");
        stream.Info.State.ShouldNotBeNull();
    }

    // Go ref: server/raft_test.go TestNRGStepDown
    /// <summary>
    /// Kills the node that served the initial writes and verifies a surviving node
    /// can still serve the stream with all messages — i.e. a new leader was elected.
    /// </summary>
    [Fact]
    public async Task LeaderDies_NewLeaderElected()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        var ct = cts.Token;
        await using var client0 = fixture.CreateClient(0);
        var js0 = new NatsJSContext(client0);
        await js0.CreateStreamAsync(
            new StreamConfig("RAFT_FAILOVER", ["raft.failover.>"])
            {
                NumReplicas = 3
            },
            ct);

        // Publish 5 messages on node 0.
        for (var i = 0; i < 5; i++)
        {
            await js0.PublishAsync($"raft.failover.{i}", $"msg{i}", cancellationToken: ct);
        }

        // Kill node 0 to trigger RAFT leader re-election.
        await fixture.KillNode(0);

        // Connect to node 1 and poll until the stream is accessible with the expected
        // messages — this confirms a new RAFT leader was elected and the stream is available.
        await using var client1 = fixture.CreateClient(1);
        var js1 = new NatsJSContext(client1);
        await WaitForStreamMessagesAsync(js1, "RAFT_FAILOVER", minMessages: 5, ct);
        var info = await js1.GetStreamAsync("RAFT_FAILOVER", cancellationToken: ct);
        info.Info.State.Messages.ShouldBeGreaterThanOrEqualTo(5L);

        // Restore node 0 and wait for the full mesh to reform so later tests see 3 nodes.
        await fixture.RestartNode(0);
        await fixture.WaitForFullMeshAsync();
    }

    // Go ref: server/raft_test.go TestNRGAppendEntry
    /// <summary>
    /// Publishes to an R3 stream and verifies every node reports the same message
    /// count, proving the RAFT log was replicated to all replicas.
    /// </summary>
    [Fact]
    public async Task LogReplication_AllReplicasHaveData()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        var ct = cts.Token;
        await using var client = fixture.CreateClient(0);
        var js = new NatsJSContext(client);
        await js.CreateStreamAsync(
            new StreamConfig("RAFT_REPL", ["raft.repl.>"])
            {
                NumReplicas = 3
            },
            ct);

        // Publish 10 messages.
        for (var i = 0; i < 10; i++)
        {
            await js.PublishAsync($"raft.repl.{i}", $"msg{i}", cancellationToken: ct);
        }

        // Query stream info from each node, polling until all replicas report 10 messages.
        for (var nodeIndex = 0; nodeIndex < 3; nodeIndex++)
        {
            await using var nodeClient = fixture.CreateClient(nodeIndex);
            var nodeJs = new NatsJSContext(nodeClient);
            await WaitForStreamMessagesAsync(nodeJs, "RAFT_REPL", minMessages: 10, ct);
            var info = await nodeJs.GetStreamAsync("RAFT_REPL", cancellationToken: ct);
            info.Info.State.Messages.ShouldBe(10L,
                $"node {nodeIndex} should have 10 messages after replication");
        }
    }

    // Go ref: server/raft_test.go TestNRGCatchup
    /// <summary>
    /// Kills the original leader, writes while it is down, restarts it, and verifies
    /// it catches up to the full log — i.e. it rejoined the group as a follower.
    /// </summary>
    [Fact]
    public async Task LeaderRestart_RejoinsAsFollower()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        var ct = cts.Token;
        await using var client0 = fixture.CreateClient(0);
        var js0 = new NatsJSContext(client0);
        await js0.CreateStreamAsync(
            new StreamConfig("RAFT_REJOIN", ["raft.rejoin.>"])
            {
                NumReplicas = 3
            },
            ct);

        // Publish 5 messages on node 0.
        for (var i = 0; i < 5; i++)
        {
            await js0.PublishAsync($"raft.rejoin.{i}", $"msg{i}", cancellationToken: ct);
        }

        // Kill node 0 — it drops out of the RAFT group.
        await fixture.KillNode(0);

        // Connect to node 1 and poll until a new leader is serving the stream,
        // then publish 5 more messages while node 0 is down.
        await using var client1 = fixture.CreateClient(1);
        var js1 = new NatsJSContext(client1);
        await WaitForStreamMessagesAsync(js1, "RAFT_REJOIN", minMessages: 5, ct);
        for (var i = 5; i < 10; i++)
        {
            await js1.PublishAsync($"raft.rejoin.{i}", $"msg{i}", cancellationToken: ct);
        }

        // Restart node 0 — it should rejoin as a follower and catch up via the RAFT log.
        await fixture.RestartNode(0);
        await fixture.WaitForFullMeshAsync();

        // Poll node 0 directly until it has caught up with all 10 messages.
        await using var client0Restarted = fixture.CreateClient(0);
        var js0Restarted = new NatsJSContext(client0Restarted);
        await WaitForStreamMessagesAsync(js0Restarted, "RAFT_REJOIN", minMessages: 10, ct);
        var info = await js0Restarted.GetStreamAsync("RAFT_REJOIN", cancellationToken: ct);
        info.Info.State.Messages.ShouldBe(10L,
            "node 0 should have all 10 messages after rejoining and catching up");
    }
}