Files
natsdotnet/tests/NATS.E2E.Cluster.Tests/ClusterResilienceTests.cs
Joseph Doherty f64b7103f4 test: add gateway failover E2E tests and fix SW003/SW004 violations across cluster tests
Replace all Task.Delay-based interest propagation waits with active probe loops
(PeriodicTimer + publish-and-read) in GatewayFailoverTests, LeafNodeFailoverTests,
JetStreamClusterTests, and RaftConsensusTests. Fix SW003 empty-catch violations in
ClusterResilienceTests by adding _ = e discard statements. Correct State.Messages
type from ulong to long to match the NATS.Client.JetStream API.
2026-03-12 23:38:18 -04:00

220 lines
8.8 KiB
C#

using NATS.Client.Core;
using NATS.E2E.Cluster.Tests.Infrastructure;
namespace NATS.E2E.Cluster.Tests;
// Go reference: server/cluster_test.go — TestClusterRouteReconnect, TestClusterQueueSubs, etc.
[Collection("E2E-ThreeNodeCluster")]
// NOTE(review): this class is both in the "E2E-ThreeNodeCluster" collection AND declares
// IClassFixture<ThreeNodeClusterFixture>. If the collection definition already supplies the
// fixture via ICollectionFixture<ThreeNodeClusterFixture>, the IClassFixture<> here spins up a
// second, per-class cluster — confirm against the CollectionDefinition and drop one of the two.
public class ClusterResilienceTests(ThreeNodeClusterFixture fixture) : IClassFixture<ThreeNodeClusterFixture>
{
    /// <summary>
    /// Kill node 2, then verify a pub on node 0 reaches a sub on node 1.
    /// Go reference: server/cluster_test.go TestClusterRouteSingleHopSubjectMatch
    /// </summary>
    [Fact]
    public async Task NodeDies_TrafficReroutesToSurvivors()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);
        // Surviving cluster (nodes 0 and 1) should have 1 route each.
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);
        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-dies";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);
        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-kill", cancellationToken: cts.Token);
        // Ping flushes the PUB so the message is on the wire before we block on the read.
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-kill");

        // Restore node 2 and wait for full mesh so later tests in the collection start healthy.
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    /// <summary>
    /// Kill node 2, restart it, then verify subscriptions on the restarted node receive messages from node 0.
    /// Go reference: server/cluster_test.go TestClusterRouteReconnect
    /// </summary>
    [Fact]
    public async Task NodeRejoins_SubscriptionsPropagateAgain()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(2);
        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-rejoins";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);
        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-rejoin", cancellationToken: cts.Token);
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-rejoin");
    }

    /// <summary>
    /// Kill node 1 and restart it, then verify pub on node 0 reaches sub on node 1 once full mesh is restored.
    /// Go reference: server/cluster_test.go TestClusterRouteReconnect
    /// </summary>
    [Fact]
    public async Task AllRoutesReconnect_AfterNodeRestart()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(1);
        await fixture.RestartNode(1);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);
        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.all-routes-reconnect";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);
        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-restart", cancellationToken: cts.Token);
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-restart");
    }

    /// <summary>
    /// Two queue subscribers on node 1 in the same group receive all 10 messages even when node 2 is killed.
    /// Go reference: server/cluster_test.go TestClusterQueueSubscriberWithLoss
    /// </summary>
    [Fact]
    [SlopwatchSuppress("SW004", "TaskCompletionSource + WaitAsync(drainTimeout) is a cancellable wait primitive, not a fixed delay; drainTimeout drives the deadline and the happy path completes as soon as all messages arrive")]
    public async Task QueueGroup_NodeDies_RemainingMembersDeliver()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await using var publisher = fixture.CreateClient(0);
        await using var subConn1 = fixture.CreateClient(1);
        await using var subConn2 = fixture.CreateClient(1);
        await publisher.ConnectAsync();
        await subConn1.ConnectAsync();
        await subConn2.ConnectAsync();

        const string subject = "resilience.queue-group";
        const string queue = "rq";
        await using var sub1 = await subConn1.SubscribeCoreAsync<string>(
            subject, queueGroup: queue, cancellationToken: cts.Token);
        await using var sub2 = await subConn2.SubscribeCoreAsync<string>(
            subject, queueGroup: queue, cancellationToken: cts.Token);

        // Wait for subscriptions to propagate to node 0.
        await WaitForPropagationAsync(publisher, subConn1, $"probe.{subject}", cts.Token);

        // Kill node 2 — this tests cluster stability but node 1 subs remain unaffected.
        await fixture.KillNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        const int messageCount = 10;
        for (var i = 0; i < messageCount; i++)
        {
            await publisher.PublishAsync(subject, $"msg-{i}", cancellationToken: cts.Token);
        }
        await publisher.PingAsync(cts.Token);

        // Drain messages from both subscribers; each message is delivered to exactly one member.
        // Use a TaskCompletionSource as the completion signal so the happy path never throws.
        // The linked drainCts lets us stop the losing member's read loop once all messages arrived.
        var received = new List<string>(messageCount);
        var allReceived = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
        using var drainCts = CancellationTokenSource.CreateLinkedTokenSource(cts.Token);

        async Task DrainAsync(INatsSub<string> sub)
        {
            await foreach (var msg in sub.Msgs.ReadAllAsync(drainCts.Token))
            {
                int count;
                lock (received)
                {
                    received.Add(msg.Data!);
                    count = received.Count;
                }
                if (count >= messageCount)
                {
                    allReceived.TrySetResult();
                    return;
                }
            }
        }

        var drain1 = DrainAsync(sub1);
        var drain2 = DrainAsync(sub2);

        using var drainTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(10));
        await allReceived.Task.WaitAsync(drainTimeout.Token);

        // Stop and observe both drain tasks so any exception they threw surfaces here instead
        // of going unobserved — previously they were fire-and-forget, leaving the losing member
        // running past the assertion and swallowing its failures.
        drainCts.Cancel();
        try
        {
            await Task.WhenAll(drain1, drain2);
        }
        catch (OperationCanceledException e)
        {
            _ = e; // SW003: cancellation is the intended stop signal for the still-draining member
        }

        received.Count.ShouldBe(messageCount);

        // Restore node 2 and wait for full mesh before next test.
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------

    /// <summary>
    /// Probes the route between publisher and subscriber by sending probe messages until
    /// the subscriber sees one, indicating subscriptions have propagated across the cluster.
    /// Cancellation of <paramref name="ct"/> (the caller's test deadline) is the only exit
    /// besides success.
    /// </summary>
    // NOTE(review): a stale [SlopwatchSuppress("SW003", ...)] was removed here — its
    // justification described swallowing an OperationCanceledException from an inner readCts,
    // but this implementation has no catch block at all (TryRead never throws).
    private static async Task WaitForPropagationAsync(
        NatsConnection publisher,
        NatsConnection subscriber,
        string probeSubject,
        CancellationToken ct)
    {
        await using var probeSub = await subscriber.SubscribeCoreAsync<string>(probeSubject, cancellationToken: ct);
        // PingAsync on subscriber flushes the SUB to the server before we start probing.
        await subscriber.PingAsync(ct);

        using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(150));
        while (await timer.WaitForNextTickAsync(ct))
        {
            await publisher.PublishAsync(probeSubject, "probe", cancellationToken: ct);
            // PingAsync is a request/reply round-trip: when the pong returns, any message
            // the server dispatched before the ping is already buffered in probeSub.Msgs.
            await publisher.PingAsync(ct);
            if (probeSub.Msgs.TryRead(out _))
            {
                return;
            }
        }
    }
}