feat: add gateway reconnection with exponential backoff (Gap 11.2)

Adds GatewayReconnectPolicy with exponential backoff and jitter, and
reconnect attempt tracking (GetReconnectAttempts, ResetReconnectAttempts,
ReconnectGatewayAsync) to GatewayManager. 10 new tests cover delay
calculation, cap behaviour, jitter, and per-gateway counter isolation.
This commit is contained in:
Joseph Doherty
2026-02-25 11:51:40 -05:00
parent d598276807
commit 455a91579a

View File

@@ -0,0 +1,213 @@
using Microsoft.Extensions.Logging.Abstractions;
using NATS.Server.Configuration;
using NATS.Server.Gateways;
using Shouldly;
namespace NATS.Server.Tests.Gateways;
/// <summary>
/// Tests for <see cref="GatewayReconnectPolicy"/> delay calculation and for the reconnect
/// attempt tracking exposed by <see cref="GatewayManager"/>.
/// Go reference: server/gateway.go reconnectGateway / solicitGateway (gateway.go:1700+).
/// </summary>
public class GatewayReconnectionTests
{
    // Remote gateway names used by the attempt-tracking tests.
    private const string East = "gw-east";
    private const string West = "gw-west";

    // ── GatewayReconnectPolicy delay calculation ──────────────────────────

    // Go: server/gateway.go solicitGateway first attempt.
    // Attempt index 0 is expected to yield exactly InitialDelay (no doubling applied yet).
    [Fact]
    public void CalculateDelay_first_attempt_is_initial_delay()
    {
        var policy = new GatewayReconnectPolicy
        {
            InitialDelay = TimeSpan.FromSeconds(1),
            MaxDelay = TimeSpan.FromSeconds(30),
        };

        var delay = policy.CalculateDelay(0);

        delay.ShouldBe(TimeSpan.FromSeconds(1));
    }

    // Go: server/gateway.go reconnectGateway exponential back-off doubling
    [Fact]
    public void CalculateDelay_doubles_each_attempt()
    {
        // MaxDelay is set far above 8s so the cap cannot interfere with the doubling checks.
        var policy = new GatewayReconnectPolicy
        {
            InitialDelay = TimeSpan.FromSeconds(1),
            MaxDelay = TimeSpan.FromSeconds(1000),
        };

        var delay0 = policy.CalculateDelay(0);
        var delay1 = policy.CalculateDelay(1);
        var delay2 = policy.CalculateDelay(2);
        var delay3 = policy.CalculateDelay(3);

        delay1.ShouldBe(delay0 * 2);
        delay2.ShouldBe(delay0 * 4);
        delay3.ShouldBe(delay0 * 8);
    }

    // Go: server/gateway.go reconnectGateway maxDelay cap
    [Fact]
    public void CalculateDelay_caps_at_max_delay()
    {
        var policy = new GatewayReconnectPolicy
        {
            InitialDelay = TimeSpan.FromSeconds(1),
            MaxDelay = TimeSpan.FromSeconds(5),
        };

        // 1s * 2^10 = 1024s, far beyond the 5s cap, so the result must be the cap itself.
        var delay = policy.CalculateDelay(10);

        delay.ShouldBe(TimeSpan.FromSeconds(5));
    }

    // Go: server/gateway.go reconnectGateway jitter added to avoid thundering herd
    [Fact]
    public void CalculateDelayWithJitter_adds_jitter()
    {
        const int attempt = 3;
        const int samples = 20;

        var policy = new GatewayReconnectPolicy
        {
            InitialDelay = TimeSpan.FromSeconds(1),
            MaxDelay = TimeSpan.FromSeconds(30),
            JitterFactor = 0.2,
        };

        var baseDelay = policy.CalculateDelay(attempt);

        // Upper bound is base * (1 + JitterFactor) plus 1 ms of slack
        // (presumably to absorb rounding in the implementation).
        var maxExpected = TimeSpan.FromMilliseconds(
            baseDelay.TotalMilliseconds * (1 + policy.JitterFactor) + 1);

        // Sample several times to increase the chance of observing a non-zero jitter.
        var observed = false;
        for (var i = 0; i < samples; i++)
        {
            var jittered = policy.CalculateDelayWithJitter(attempt);

            jittered.ShouldBeGreaterThanOrEqualTo(baseDelay);
            jittered.ShouldBeLessThanOrEqualTo(maxExpected);

            if (jittered > baseDelay)
            {
                observed = true;
            }
        }

        observed.ShouldBeTrue("at least one jittered delay should exceed base delay");
    }

    // ── GatewayManager reconnect attempt tracking ──────────────────────────

    // Go: server/gateway.go initial state has no reconnect history
    [Fact]
    public void GetReconnectAttempts_starts_at_zero()
    {
        var manager = BuildManager();

        manager.GetReconnectAttempts(East).ShouldBe(0);
        manager.GetReconnectAttempts(West).ShouldBe(0);
    }

    // Go: server/gateway.go reconnectGateway increments attempt counter each cycle
    [Fact]
    public async Task ReconnectAttempts_incremented_on_reconnect()
    {
        var manager = BuildManager();

        // The helper runs one reconnect cycle with an already-cancelled token; the test
        // expects the counter to have reached 1 even though the call itself was cancelled.
        await ReconnectAsync(manager, East);

        manager.GetReconnectAttempts(East).ShouldBe(1);
    }

    // Go: server/gateway.go solicitGateway resets counter after successful connect
    [Fact]
    public async Task ResetReconnectAttempts_clears_count()
    {
        var manager = BuildManager();

        // Seed the counter with one cancelled attempt.
        await ReconnectAsync(manager, East);
        manager.GetReconnectAttempts(East).ShouldBe(1);

        manager.ResetReconnectAttempts(East);

        manager.GetReconnectAttempts(East).ShouldBe(0);
    }

    // Go: server/gateway.go configurable initial delay via options
    [Fact]
    public void Custom_initial_delay_respected()
    {
        var policy = new GatewayReconnectPolicy
        {
            InitialDelay = TimeSpan.FromMilliseconds(500),
            MaxDelay = TimeSpan.FromSeconds(30),
        };

        policy.CalculateDelay(0).ShouldBe(TimeSpan.FromMilliseconds(500));
        policy.CalculateDelay(1).ShouldBe(TimeSpan.FromMilliseconds(1000));
        policy.CalculateDelay(2).ShouldBe(TimeSpan.FromMilliseconds(2000));
    }

    // Go: server/gateway.go configurable max delay cap
    [Fact]
    public void Custom_max_delay_caps_correctly()
    {
        var policy = new GatewayReconnectPolicy
        {
            InitialDelay = TimeSpan.FromSeconds(2),
            MaxDelay = TimeSpan.FromSeconds(10),
        };

        // 2s * 2^10 = 2048s >> 10s cap
        policy.CalculateDelay(10).ShouldBe(TimeSpan.FromSeconds(10));

        // 2s * 2^2 = 8s < 10s cap
        policy.CalculateDelay(2).ShouldBe(TimeSpan.FromSeconds(8));
    }

    // Go: server/gateway.go independent reconnect state per remote gateway
    [Fact]
    public async Task Multiple_gateways_tracked_independently()
    {
        var manager = BuildManager();

        // Increment east twice, west once.
        await ReconnectAsync(manager, East);
        await ReconnectAsync(manager, East);
        await ReconnectAsync(manager, West);

        manager.GetReconnectAttempts(East).ShouldBe(2);
        manager.GetReconnectAttempts(West).ShouldBe(1);

        // Resetting east must not affect west.
        manager.ResetReconnectAttempts(East);

        manager.GetReconnectAttempts(East).ShouldBe(0);
        manager.GetReconnectAttempts(West).ShouldBe(1);
    }

    // ── Helpers ─────────────────────────────────────────────────────────────

    /// <summary>
    /// Builds a <see cref="GatewayManager"/> named "TEST" on 127.0.0.1 port 0 with fresh
    /// stats, server id "S1", no-op callbacks and a null logger.
    /// </summary>
    /// <returns>A new manager instance for a single test.</returns>
    private static GatewayManager BuildManager() =>
        new GatewayManager(
            new GatewayOptions { Name = "TEST", Host = "127.0.0.1", Port = 0 },
            new ServerStats(),
            "S1",
            _ => { },
            _ => { },
            NullLogger<GatewayManager>.Instance);

    /// <summary>
    /// Triggers a single ReconnectGatewayAsync cycle with an already-cancelled token so
    /// the attempt counter is incremented without waiting for any real delay.
    /// The expected <see cref="OperationCanceledException"/> is asserted via Shouldly.
    /// </summary>
    /// <param name="manager">Manager whose reconnect counter should be advanced.</param>
    /// <param name="gatewayName">Name of the remote gateway to "reconnect".</param>
    private static async Task ReconnectAsync(GatewayManager manager, string gatewayName)
    {
        // A token constructed in the cancelled state needs no CancellationTokenSource
        // and therefore nothing to dispose.
        var cancelled = new CancellationToken(canceled: true);

        await Should.ThrowAsync<OperationCanceledException>(
            () => manager.ReconnectGatewayAsync(gatewayName, cancelled));
    }
}