using NATS.Client.Core;
using NATS.E2E.Cluster.Tests.Infrastructure;

namespace NATS.E2E.Cluster.Tests;

// Go reference: server/cluster_test.go — TestClusterRouteReconnect, TestClusterQueueSubs, etc.
[Collection("E2E-ThreeNodeCluster")]
public class ClusterResilienceTests(ThreeNodeClusterFixture fixture) : IClassFixture<ThreeNodeClusterFixture>
{
    /// <summary>
    /// Kill node 2, then verify a pub on node 0 reaches a sub on node 1.
    /// Go reference: server/cluster_test.go TestClusterRouteSingleHopSubjectMatch
    /// </summary>
    [Fact]
    public async Task NodeDies_TrafficReroutesToSurvivors()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);

        // Surviving cluster (nodes 0 and 1) should have 1 route each
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);
        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-dies";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        // Subscriptions must propagate across the route before the real publish, or it is lost.
        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-kill", cancellationToken: cts.Token);
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-kill");

        // Restore node 2 and wait for full mesh before next test
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    /// <summary>
    /// Kill node 2, restart it, then verify subscriptions on the restarted node receive messages from node 0.
/// Go reference: server/cluster_test.go TestClusterRouteReconnect /// [Fact] public async Task NodeRejoins_SubscriptionsPropagateAgain() { using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60)); await fixture.KillNode(2); await fixture.RestartNode(2); await fixture.WaitForFullMeshAsync(expectedRoutes: 2); await using var publisher = fixture.CreateClient(0); await using var subscriber = fixture.CreateClient(2); await publisher.ConnectAsync(); await subscriber.ConnectAsync(); const string subject = "resilience.node-rejoins"; await using var sub = await subscriber.SubscribeCoreAsync(subject, cancellationToken: cts.Token); await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token); await publisher.PublishAsync(subject, "hello-after-rejoin", cancellationToken: cts.Token); await publisher.PingAsync(cts.Token); var msg = await sub.Msgs.ReadAsync(cts.Token); msg.Data.ShouldBe("hello-after-rejoin"); } /// /// Kill node 1 and restart it, then verify pub on node 0 reaches sub on node 1 once full mesh is restored. 
/// Go reference: server/cluster_test.go TestClusterRouteReconnect /// [Fact] public async Task AllRoutesReconnect_AfterNodeRestart() { using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60)); await fixture.KillNode(1); await fixture.RestartNode(1); await fixture.WaitForFullMeshAsync(expectedRoutes: 2); await using var publisher = fixture.CreateClient(0); await using var subscriber = fixture.CreateClient(1); await publisher.ConnectAsync(); await subscriber.ConnectAsync(); const string subject = "resilience.all-routes-reconnect"; await using var sub = await subscriber.SubscribeCoreAsync(subject, cancellationToken: cts.Token); await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token); await publisher.PublishAsync(subject, "hello-after-restart", cancellationToken: cts.Token); await publisher.PingAsync(cts.Token); var msg = await sub.Msgs.ReadAsync(cts.Token); msg.Data.ShouldBe("hello-after-restart"); } /// /// Two queue subscribers on node 1 in the same group receive all 10 messages even when node 2 is killed. 
/// Go reference: server/cluster_test.go TestClusterQueueSubscriberWithLoss /// [Fact] [SlopwatchSuppress("SW004", "Task.Delay(Timeout.Infinite) is used purely as a cancellable wait primitive alongside WhenAny — it never actually delays; drainTimeout drives the deadline")] public async Task QueueGroup_NodeDies_RemainingMembersDeliver() { using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60)); await using var publisher = fixture.CreateClient(0); await using var subConn1 = fixture.CreateClient(1); await using var subConn2 = fixture.CreateClient(1); await publisher.ConnectAsync(); await subConn1.ConnectAsync(); await subConn2.ConnectAsync(); const string subject = "resilience.queue-group"; const string queue = "rq"; await using var sub1 = await subConn1.SubscribeCoreAsync( subject, queueGroup: queue, cancellationToken: cts.Token); await using var sub2 = await subConn2.SubscribeCoreAsync( subject, queueGroup: queue, cancellationToken: cts.Token); // Wait for subscriptions to propagate to node 0 await WaitForPropagationAsync(publisher, subConn1, $"probe.{subject}", cts.Token); // Kill node 2 — this tests cluster stability but node 1 subs remain unaffected await fixture.KillNode(2); await fixture.WaitForFullMeshAsync(expectedRoutes: 1); const int messageCount = 10; for (var i = 0; i < messageCount; i++) { await publisher.PublishAsync(subject, $"msg-{i}", cancellationToken: cts.Token); } await publisher.PingAsync(cts.Token); // Drain messages from both subscribers; each message is delivered to exactly one member. // Use a TaskCompletionSource as the completion signal so the happy path never throws. 
var received = new List(messageCount); var allReceived = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); async Task DrainAsync(INatsSub sub) { await foreach (var msg in sub.Msgs.ReadAllAsync(cts.Token)) { int count; lock (received) { received.Add(msg.Data!); count = received.Count; } if (count >= messageCount) { allReceived.TrySetResult(); return; } } } var drain1 = DrainAsync(sub1); var drain2 = DrainAsync(sub2); using var drainTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(10)); await allReceived.Task.WaitAsync(drainTimeout.Token); received.Count.ShouldBe(messageCount); // Restore node 2 and wait for full mesh before next test await fixture.RestartNode(2); await fixture.WaitForFullMeshAsync(expectedRoutes: 2); } // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- /// /// Probes the route between publisher and subscriber by sending probe messages until /// the subscriber sees one, indicating subscriptions have propagated across the cluster. /// [SlopwatchSuppress("SW003", "OperationCanceledException from the inner readCts timeout is the expected retry signal; swallowing it is intentional so the polling loop continues to the next tick")] private static async Task WaitForPropagationAsync( NatsConnection publisher, NatsConnection subscriber, string probeSubject, CancellationToken ct) { await using var probeSub = await subscriber.SubscribeCoreAsync(probeSubject, cancellationToken: ct); // PingAsync on subscriber flushes the SUB to the server before we start probing. 
await subscriber.PingAsync(ct); using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(150)); while (await timer.WaitForNextTickAsync(ct)) { await publisher.PublishAsync(probeSubject, "probe", cancellationToken: ct); // PingAsync is a request/reply round-trip: when the pong returns, any message // the server dispatched before the ping is already buffered in probeSub.Msgs. await publisher.PingAsync(ct); if (probeSub.Msgs.TryRead(out _)) return; } } }