Files
natsdotnet/tests/NATS.E2E.Cluster.Tests/JetStreamClusterTests.cs
Joseph Doherty 3445a055eb feat: add JetStream cluster replication and leaf node solicited reconnect
Add JetStream stream/consumer config and data replication across cluster
peers via $JS.INTERNAL.* subjects with BroadcastRoutedMessageAsync (sends
to all peers, bypassing pool routing). Capture routed data messages into
local JetStream stores in DeliverRemoteMessage. Fix leaf node solicited
reconnect by re-launching the retry loop in WatchConnectionAsync after
disconnect.

Unskips 4 of 5 E2E cluster tests (LeaderDies_NewLeaderElected,
R3Stream_NodeDies_PublishContinues, Consumer_NodeDies_PullContinuesOnSurvivor,
Leaf_HubRestart_LeafReconnects). The 5th (LeaderRestart_RejoinsAsFollower)
requires RAFT log catch-up, which is a separate feature.
2026-03-13 01:02:00 -04:00

251 lines
10 KiB
C#

using NATS.Client.Core;
using NATS.Client.JetStream;
using NATS.Client.JetStream.Models;
using NATS.E2E.Cluster.Tests.Infrastructure;
namespace NATS.E2E.Cluster.Tests;
// Go reference: server/jetstream_cluster_test.go — TestJetStreamClusterBasic, TestJetStreamClusterNodeFailure, etc.
[Collection("E2E-JetStreamCluster")]
public class JetStreamClusterTests(JetStreamClusterFixture fixture) : IClassFixture<JetStreamClusterFixture>
{
    /// <summary>
    /// Creates an R3 stream and publishes 5 messages, then verifies the publishing node
    /// reports the expected message count. Cross-node RAFT replication is not yet implemented
    /// in the .NET server, so only the publishing node is checked.
    /// Go reference: server/jetstream_cluster_test.go TestJetStreamClusterBasic
    /// </summary>
    [Fact]
    public async Task R3Stream_CreateAndPublish_ReplicatedAcrossNodes()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        await using var client = fixture.CreateClient(0);
        await client.ConnectAsync();
        var js = new NatsJSContext(client);

        // Delete the stream first to ensure clean state across test runs (FileStore persists).
        // The stream may not exist on a fresh run, so a "not found" API error is expected.
        await DeleteStreamIfExistsAsync(js, "JS_REPL", cts.Token);

        await js.CreateStreamAsync(
            new StreamConfig("JS_REPL", ["js.repl.>"]) { NumReplicas = 3 },
            cts.Token);

        for (var i = 0; i < 5; i++)
        {
            await js.PublishAsync("js.repl.data", $"msg-{i}", cancellationToken: cts.Token);
        }

        // Verify the publishing node (node 0) has stored all 5 messages.
        // Cross-node RAFT replication is not yet implemented, so only check node 0.
        await WaitForStreamMessagesAsync(js, "JS_REPL", minMessages: 5, cts.Token);
        var info = await js.GetStreamAsync("JS_REPL", cancellationToken: cts.Token);
        info.Info.State.Messages.ShouldBe(5L,
            $"Node 0 should report 5 messages but reported {info.Info.State.Messages}");
    }

    /// <summary>
    /// Kills node 2 mid-stream and verifies publishing continues on the surviving 2/3 quorum,
    /// then restores node 2 and waits for full mesh.
    /// Go reference: server/jetstream_cluster_test.go TestJetStreamClusterNodeFailure
    /// </summary>
    [Fact]
    [SlopwatchSuppress("SW001", "JetStream RAFT replication across cluster nodes is not yet implemented in the .NET server — this test requires cross-node stream availability after failover")]
    public async Task R3Stream_NodeDies_PublishContinues()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        await using var client0 = fixture.CreateClient(0);
        await client0.ConnectAsync();
        var js0 = new NatsJSContext(client0);

        // Delete the stream first to ensure clean state across test runs (FileStore persists),
        // matching the other tests in this class — residual messages from a previous run
        // would skew the pre/post counts asserted below.
        await DeleteStreamIfExistsAsync(js0, "JS_NODEDIES", cts.Token);

        await js0.CreateStreamAsync(
            new StreamConfig("JS_NODEDIES", ["js.nodedies.>"]) { NumReplicas = 3 },
            cts.Token);

        // Publish 3 initial messages
        for (var i = 0; i < 3; i++)
        {
            await js0.PublishAsync("js.nodedies.data", $"pre-{i}", cancellationToken: cts.Token);
        }

        // Kill node 2; poll node 1 until the cluster elects a new leader and stream is available
        await fixture.KillNode(2);
        await using var client1Early = fixture.CreateClient(1);
        await client1Early.ConnectAsync();
        var js1Early = new NatsJSContext(client1Early);
        await WaitForStreamMessagesAsync(js1Early, "JS_NODEDIES", minMessages: 3, cts.Token);

        // Publish 5 more on node 0 — should succeed with 2/3 quorum
        for (var i = 0; i < 5; i++)
        {
            await js0.PublishAsync("js.nodedies.data", $"post-{i}", cancellationToken: cts.Token);
        }

        // Poll node 1 until it sees all 8 messages
        await using var client1 = fixture.CreateClient(1);
        await client1.ConnectAsync();
        var js1 = new NatsJSContext(client1);
        await WaitForStreamMessagesAsync(js1, "JS_NODEDIES", minMessages: 8, cts.Token);
        var info = await js1.GetStreamAsync("JS_NODEDIES", cancellationToken: cts.Token);
        info.Info.State.Messages.ShouldBeGreaterThanOrEqualTo(8L,
            $"Node 1 should have >= 8 messages but has {info.Info.State.Messages}");

        // Restore node 2 and wait for full mesh
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    /// <summary>
    /// Kills node 2 while a pull consumer exists and verifies the consumer is accessible on node 1.
    /// Go reference: server/jetstream_cluster_test.go TestJetStreamClusterConsumerHardKill
    /// </summary>
    [Fact]
    [SlopwatchSuppress("SW001", "JetStream RAFT replication across cluster nodes is not yet implemented in the .NET server — consumer state is local to the publishing node")]
    public async Task Consumer_NodeDies_PullContinuesOnSurvivor()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        await using var client0 = fixture.CreateClient(0);
        await client0.ConnectAsync();
        var js0 = new NatsJSContext(client0);

        // Delete the stream first to ensure clean state across test runs (FileStore persists),
        // matching the other tests in this class.
        await DeleteStreamIfExistsAsync(js0, "JS_CONS_FAIL", cts.Token);

        var stream = await js0.CreateStreamAsync(
            new StreamConfig("JS_CONS_FAIL", ["js.consfail.>"]) { NumReplicas = 3 },
            cts.Token);

        // Publish 5 messages
        for (var i = 0; i < 5; i++)
        {
            await js0.PublishAsync("js.consfail.data", $"msg-{i}", cancellationToken: cts.Token);
        }

        // Create pull consumer
        await stream.CreateOrUpdateConsumerAsync(
            new ConsumerConfig("cons-fail") { AckPolicy = ConsumerConfigAckPolicy.Explicit },
            cts.Token);

        // Kill node 2; poll node 1 until the stream and consumer are accessible via the new leader
        await fixture.KillNode(2);
        await using var client1 = fixture.CreateClient(1);
        await client1.ConnectAsync();
        var js1 = new NatsJSContext(client1);
        await WaitForStreamMessagesAsync(js1, "JS_CONS_FAIL", minMessages: 5, cts.Token);
        var stream1 = await js1.GetStreamAsync("JS_CONS_FAIL", cancellationToken: cts.Token);
        var consumer = await stream1.GetConsumerAsync("cons-fail", cts.Token);
        consumer.ShouldNotBeNull("Consumer 'cons-fail' should be accessible on surviving node 1");

        // Restore node 2 and wait for full mesh
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    /// <summary>
    /// Purges an R3 stream and verifies the publishing node reports 0 messages after purge.
    /// Cross-node RAFT replication is not yet implemented, so only the publishing node is checked.
    /// Go reference: server/jetstream_cluster_test.go TestJetStreamClusterStreamPurge
    /// </summary>
    [Fact]
    public async Task R3Stream_Purge_ReplicatedAcrossNodes()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        await using var client = fixture.CreateClient(0);
        await client.ConnectAsync();
        var js = new NatsJSContext(client);

        // Delete the stream first to ensure clean state across test runs (FileStore persists).
        await DeleteStreamIfExistsAsync(js, "JS_PURGE", cts.Token);

        await js.CreateStreamAsync(
            new StreamConfig("JS_PURGE", ["js.purge.>"]) { NumReplicas = 3 },
            cts.Token);

        // Publish 5 messages on the publishing node
        for (var i = 0; i < 5; i++)
        {
            await js.PublishAsync("js.purge.data", $"msg-{i}", cancellationToken: cts.Token);
        }

        // Verify node 0 has the 5 pre-purge messages
        await WaitForStreamMessagesAsync(js, "JS_PURGE", minMessages: 5, cts.Token);

        // Purge the stream
        await js.PurgeStreamAsync("JS_PURGE", new StreamPurgeRequest(), cts.Token);

        // Verify node 0 reports 0 messages after purge (exactMatch so a stale
        // non-zero count keeps the poll going instead of passing early).
        await WaitForStreamMessagesAsync(js, "JS_PURGE", minMessages: 0, cts.Token,
            exactMatch: true);
        var info = await js.GetStreamAsync("JS_PURGE", cancellationToken: cts.Token);
        info.Info.State.Messages.ShouldBe(0L,
            $"Node 0 should report 0 messages after purge but reported {info.Info.State.Messages}");
    }

    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------

    /// <summary>
    /// Deletes a stream if it exists, swallowing the "stream not found" API error that
    /// occurs on a fresh run where the stream was never created.
    /// </summary>
    [SlopwatchSuppress("SW003", "NatsJSApiException for 'stream not found' is the expected outcome on a clean run — the delete is best-effort cleanup")]
    private static async Task DeleteStreamIfExistsAsync(NatsJSContext js, string streamName, CancellationToken ct)
    {
        try
        {
            await js.DeleteStreamAsync(streamName, ct);
        }
        catch (NatsJSApiException ex) when (ex.Error.Code == 404)
        {
            // Stream does not exist — nothing to delete. `ex` is consumed by the
            // filter above, so no discard assignment is needed here.
        }
    }

    /// <summary>
    /// Polls the stream on the given JetStream context until it reports at least
    /// <paramref name="minMessages"/> messages, or exactly <paramref name="minMessages"/>
    /// messages when <paramref name="exactMatch"/> is true. Throws
    /// <see cref="OperationCanceledException"/> if <paramref name="ct"/> fires first,
    /// which fails the calling test on timeout.
    /// </summary>
    private static async Task WaitForStreamMessagesAsync(
        NatsJSContext js,
        string streamName,
        long minMessages,
        CancellationToken ct,
        bool exactMatch = false)
    {
        using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(200));
        while (await timer.WaitForNextTickAsync(ct).ConfigureAwait(false))
        {
            try
            {
                var info = await js.GetStreamAsync(streamName, cancellationToken: ct).ConfigureAwait(false);
                var count = info.Info.State.Messages;
                var satisfied = exactMatch ? count == minMessages : count >= minMessages;
                if (satisfied)
                    return;
            }
            catch (NatsJSApiException)
            {
                // Stream not yet visible on this node after failover — keep polling
                // until the cancellation token fires.
            }
        }
    }
}