Files
natsdotnet/tests/NATS.E2E.Cluster.Tests/LeafNodeFailoverTests.cs
Joseph Doherty 3445a055eb feat: add JetStream cluster replication and leaf node solicited reconnect
Add JetStream stream/consumer config and data replication across cluster
peers via $JS.INTERNAL.* subjects with BroadcastRoutedMessageAsync (sends
to all peers, bypassing pool routing). Capture routed data messages into
local JetStream stores in DeliverRemoteMessage. Fix leaf node solicited
reconnect by re-launching the retry loop in WatchConnectionAsync after
disconnect.

Unskips 4 of 5 E2E cluster tests (LeaderDies_NewLeaderElected,
R3Stream_NodeDies_PublishContinues, Consumer_NodeDies_PullContinuesOnSurvivor,
Leaf_HubRestart_LeafReconnects). The 5th (LeaderRestart_RejoinsAsFollower)
requires RAFT log catchup which is a separate feature.
2026-03-13 01:02:00 -04:00

113 lines
5.0 KiB
C#

using NATS.Client.Core;
using NATS.E2E.Cluster.Tests.Infrastructure;
namespace NATS.E2E.Cluster.Tests;
// go ref: server/leafnode_test.go - TestLeafNodeReconnect, TestLeafNodeHubRestart
public class LeafNodeFailoverTests(HubLeafFixture fixture) : IClassFixture<HubLeafFixture>
{
    /// <summary>
    /// Kill the leaf node (fixture node 1), restart it, confirm it reconnects to the hub, and verify
    /// that a message published on the hub is delivered to a subscriber on the leaf.
    /// go ref: server/leafnode_test.go TestLeafNodeReconnect
    /// </summary>
    [Fact]
    public async Task Leaf_Disconnect_ReconnectsToHub()
    {
        await fixture.KillNode(1);
        await fixture.RestartNode(1);
        await fixture.WaitForLeafConnectionAsync();

        await using var hub = fixture.CreateHubClient();
        await using var leaf = fixture.CreateLeafClient();

        const string subject = "e2e.leaf.reconnect";
        // Single deadline shared by the subscription, the probe loop and the final read.
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));

        // Subscribe on the leaf first so the real subscription is registered before probing starts.
        await using var sub = await leaf.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        // Probe the hub→leaf direction (publish on hub, receive on leaf) before publishing the
        // real message, so the assertion below is not racing the re-established leaf link.
        await WaitForPropagationAsync(publisher: hub, subscriber: leaf,
            probeSubject: $"probe.{subject}", ct: cts.Token);

        await hub.PublishAsync(subject, "leaf-back", cancellationToken: cts.Token);
        await hub.PingAsync(cts.Token);

        // ReadAsync waits (bounded by the deadline above) until the message arrives on the leaf.
        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("leaf-back");
    }

    /// <summary>
    /// Kill the hub (fixture node 0), restart it, wait for the leaf to reconnect (exponential backoff),
    /// then verify a message published on the leaf is delivered to a subscriber on the hub.
    /// go ref: server/leafnode_test.go TestLeafNodeHubRestart
    /// </summary>
    // NOTE(review): the suppression text below says the leaf fails to reconnect to a replaced hub,
    // yet this test asserts that it does reconnect — confirm whether the suppression is still needed.
    [Fact]
    [SlopwatchSuppress("SW001", "The .NET server leaf node reconnection does not yet re-establish the connection when the hub process is replaced — the leaf detects the disconnect but fails to reconnect to the new hub instance")]
    public async Task Leaf_HubRestart_LeafReconnects()
    {
        await fixture.KillNode(0);
        await fixture.RestartNode(0);
        // Leaf uses exponential backoff — WaitForLeafConnectionAsync polls /leafz for up to 30s
        await fixture.WaitForLeafConnectionAsync();

        await using var hub = fixture.CreateHubClient();
        await using var leaf = fixture.CreateLeafClient();

        const string subject = "e2e.leaf.hubrestart";
        // Single deadline shared by the subscription, the probe loop and the final read.
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));

        // Subscribe on the hub first so the real subscription is registered before probing starts.
        await using var sub = await hub.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        // Probe the leaf→hub direction (publish on leaf, receive on hub) before publishing the
        // real message.
        await WaitForPropagationAsync(publisher: leaf, subscriber: hub,
            probeSubject: $"probe.{subject}", ct: cts.Token);

        await leaf.PublishAsync(subject, "hub-back", cancellationToken: cts.Token);
        await leaf.PingAsync(cts.Token);

        // ReadAsync waits (bounded by the deadline above) until the message arrives on the hub.
        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hub-back");
    }

    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------

    /// <summary>
    /// Probes the route from <paramref name="publisher"/> to <paramref name="subscriber"/> by
    /// repeatedly publishing a probe message and checking, without blocking, whether one has
    /// arrived on a temporary subscription. Returns as soon as a probe is observed.
    /// </summary>
    /// <param name="publisher">Connection the probe messages are published on.</param>
    /// <param name="subscriber">Connection that subscribes to <paramref name="probeSubject"/>.</param>
    /// <param name="probeSubject">Subject used only for probing; callers pass one distinct from the subject under test.</param>
    /// <param name="ct">
    /// Bounds the whole wait. When it is cancelled before a probe arrives, the pending await is
    /// expected to throw <see cref="OperationCanceledException"/>, which fails the calling test.
    /// </param>
    private static async Task WaitForPropagationAsync(
        NatsConnection publisher,
        NatsConnection subscriber,
        string probeSubject,
        CancellationToken ct)
    {
        await using var probeSub = await subscriber.SubscribeCoreAsync<string>(probeSubject, cancellationToken: ct);

        // Round-trip to the subscriber's server so the probe subscription has been processed
        // there before probes are published from the other side.
        await subscriber.PingAsync(ct);

        using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(150));
        do
        {
            // Probe immediately on entry; the timer only paces the retries.
            await publisher.PublishAsync(probeSubject, "probe", cancellationToken: ct);

            // The ping round-trips to the publisher's server, so the probe has at least reached
            // that server. Delivery to the subscriber's connection may still be in flight; if so
            // TryRead misses this time and the next iteration retries.
            await publisher.PingAsync(ct);

            if (probeSub.Msgs.TryRead(out _))
            {
                return;
            }
        }
        while (await timer.WaitForNextTickAsync(ct));

        // Only reachable if the timer stops ticking without cancellation; never report success
        // when no probe was actually observed.
        throw new InvalidOperationException(
            $"Probe loop for '{probeSubject}' ended before any probe message was received.");
    }
}