Replace all Task.Delay-based interest propagation waits with active probe loops (PeriodicTimer + publish-and-read) in GatewayFailoverTests, LeafNodeFailoverTests, JetStreamClusterTests, and RaftConsensusTests. Fix SW003 empty-catch violations in ClusterResilienceTests by adding _ = e discard statements. Correct State.Messages type from ulong to long to match the NATS.Client.JetStream API.
220 lines · 8.8 KiB · C#
using NATS.Client.Core;
using NATS.E2E.Cluster.Tests.Infrastructure;

namespace NATS.E2E.Cluster.Tests;

// Go reference: server/cluster_test.go — TestClusterRouteReconnect, TestClusterQueueSubs, etc.
[Collection("E2E-ThreeNodeCluster")]
public class ClusterResilienceTests(ThreeNodeClusterFixture fixture) : IClassFixture<ThreeNodeClusterFixture>
{
    /// <summary>
    /// Kill node 2, then verify a pub on node 0 reaches a sub on node 1.
    /// Go reference: server/cluster_test.go TestClusterRouteSingleHopSubjectMatch
    /// </summary>
    [Fact]
    public async Task NodeDies_TrafficReroutesToSurvivors()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);

        // Surviving cluster (nodes 0 and 1) should have 1 route each.
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);

        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-dies";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await PublishAndExpectAsync(publisher, sub, subject, "hello-after-kill", cts.Token);

        // Restore node 2 and wait for full mesh so the next test in the collection
        // starts from a healthy three-node cluster.
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    /// <summary>
    /// Kill node 2, restart it, then verify subscriptions on the restarted node receive messages from node 0.
    /// Go reference: server/cluster_test.go TestClusterRouteReconnect
    /// </summary>
    [Fact]
    public async Task NodeRejoins_SubscriptionsPropagateAgain()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(2);

        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-rejoins";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await PublishAndExpectAsync(publisher, sub, subject, "hello-after-rejoin", cts.Token);
    }

    /// <summary>
    /// Kill node 1 and restart it, then verify pub on node 0 reaches sub on node 1 once full mesh is restored.
    /// Go reference: server/cluster_test.go TestClusterRouteReconnect
    /// </summary>
    [Fact]
    public async Task AllRoutesReconnect_AfterNodeRestart()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(1);
        await fixture.RestartNode(1);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);

        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.all-routes-reconnect";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await PublishAndExpectAsync(publisher, sub, subject, "hello-after-restart", cts.Token);
    }

    /// <summary>
    /// Two queue subscribers on node 1 in the same group receive all 10 messages even when node 2 is killed.
    /// Go reference: server/cluster_test.go TestClusterQueueSubscriberWithLoss
    /// </summary>
    [Fact]
    public async Task QueueGroup_NodeDies_RemainingMembersDeliver()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await using var publisher = fixture.CreateClient(0);
        await using var subConn1 = fixture.CreateClient(1);
        await using var subConn2 = fixture.CreateClient(1);

        await publisher.ConnectAsync();
        await subConn1.ConnectAsync();
        await subConn2.ConnectAsync();

        const string subject = "resilience.queue-group";
        const string queue = "rq";

        await using var sub1 = await subConn1.SubscribeCoreAsync<string>(
            subject, queueGroup: queue, cancellationToken: cts.Token);
        await using var sub2 = await subConn2.SubscribeCoreAsync<string>(
            subject, queueGroup: queue, cancellationToken: cts.Token);

        // Wait for subscriptions to propagate to node 0.
        await WaitForPropagationAsync(publisher, subConn1, $"probe.{subject}", cts.Token);

        // Kill node 2 — this tests cluster stability but node 1 subs remain unaffected.
        await fixture.KillNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        const int messageCount = 10;
        for (var i = 0; i < messageCount; i++)
        {
            await publisher.PublishAsync(subject, $"msg-{i}", cancellationToken: cts.Token);
        }

        await publisher.PingAsync(cts.Token);

        // Drain messages from both subscribers; each message is delivered to exactly one member.
        // Use a TaskCompletionSource as the completion signal so the happy path never throws.
        var received = new List<string>(messageCount);
        var allReceived = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);

        async Task DrainAsync(INatsSub<string> sub)
        {
            await foreach (var msg in sub.Msgs.ReadAllAsync(cts.Token))
            {
                int count;
                lock (received)
                {
                    received.Add(msg.Data!);
                    count = received.Count;
                }

                if (count >= messageCount)
                {
                    allReceived.TrySetResult();
                    return;
                }
            }
        }

        var drain1 = DrainAsync(sub1);
        var drain2 = DrainAsync(sub2);

        using var drainTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(10));

        // Race the completion signal against both drain loops so a faulted drain surfaces
        // its exception immediately instead of being masked as an opaque 10s timeout
        // (and instead of going unobserved entirely). Awaiting the winner re-throws any
        // drain exception; it is a no-op when allReceived wins.
        var first = await Task.WhenAny(allReceived.Task, drain1, drain2).WaitAsync(drainTimeout.Token);
        await first;

        // Snapshot under the lock: the losing drain task may still be running.
        int total;
        lock (received)
        {
            total = received.Count;
        }

        total.ShouldBe(messageCount);

        // Restore node 2 and wait for full mesh before next test.
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------

    /// <summary>
    /// Publishes <paramref name="payload"/> on <paramref name="subject"/>, flushes the
    /// publisher via a ping round-trip, then asserts the subscription receives exactly
    /// that payload. Shared by the single-subscriber resilience tests above.
    /// </summary>
    private static async Task PublishAndExpectAsync(
        NatsConnection publisher,
        INatsSub<string> sub,
        string subject,
        string payload,
        CancellationToken ct)
    {
        await publisher.PublishAsync(subject, payload, cancellationToken: ct);
        await publisher.PingAsync(ct);

        var msg = await sub.Msgs.ReadAsync(ct);
        msg.Data.ShouldBe(payload);
    }

    /// <summary>
    /// Probes the route between publisher and subscriber by sending probe messages until
    /// the subscriber sees one, indicating subscriptions have propagated across the cluster.
    /// Cancellation of <paramref name="ct"/> propagates out of
    /// <see cref="PeriodicTimer.WaitForNextTickAsync"/> — nothing is swallowed here.
    /// </summary>
    private static async Task WaitForPropagationAsync(
        NatsConnection publisher,
        NatsConnection subscriber,
        string probeSubject,
        CancellationToken ct)
    {
        await using var probeSub = await subscriber.SubscribeCoreAsync<string>(probeSubject, cancellationToken: ct);

        // PingAsync on subscriber flushes the SUB to the server before we start probing.
        await subscriber.PingAsync(ct);

        using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(150));
        while (await timer.WaitForNextTickAsync(ct))
        {
            await publisher.PublishAsync(probeSubject, "probe", cancellationToken: ct);

            // PingAsync is a request/reply round-trip: when the pong returns, any message
            // the server dispatched before the ping is already buffered in probeSub.Msgs.
            await publisher.PingAsync(ct);

            if (probeSub.Msgs.TryRead(out _))
                return;
        }
    }
}
|