feat(dcl): failover on repeated unstable connections (connect-then-stale pattern)

Previously, failover only triggered when ConnectAsync failed consecutively.
If a connection succeeded but went stale quickly (e.g., heartbeat timeout),
the failure counter reset on each successful connect and failover never
triggered.

Added a separate _consecutiveUnstableDisconnects counter that increments
when a connection lasts less than StableConnectionThreshold (60s) before
disconnecting. When this counter reaches failoverRetryCount, the actor
fails over to the backup endpoint. Stable connections (lasting >60s)
reset this counter.

The original connection-failure failover path is unchanged.
This commit is contained in:
Joseph Doherty
2026-03-24 15:23:54 -04:00
parent ff2784b862
commit 5fdeaf613f
2 changed files with 64 additions and 3 deletions

View File

@@ -77,6 +77,8 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
private readonly int _failoverRetryCount;
private ActiveEndpoint _activeEndpoint = ActiveEndpoint.Primary;
private int _consecutiveFailures;
private int _consecutiveUnstableDisconnects;
private DateTimeOffset _lastConnectedAt;
/// <summary>
/// Captured Self reference for use from non-actor threads (event handlers, callbacks).
@@ -180,9 +182,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
// ── Connected State ──
/// <summary>
/// Minimum time connected before we consider the connection stable.
/// If we disconnect before this, it counts as an unstable connection toward failover.
/// </summary>
private static readonly TimeSpan StableConnectionThreshold = TimeSpan.FromSeconds(60);
private void BecomeConnected()
{
_log.Info("[{0}] Entering Connected state", _connectionName);
_lastConnectedAt = DateTimeOffset.UtcNow;
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Connected);
_healthCollector.UpdateTagResolution(_connectionName, _totalSubscribed, _resolvedTags);
var endpointLabel = _backupConfig == null ? "Connected" : $"Connected to {_activeEndpoint.ToString().ToLower()}";
@@ -236,6 +245,57 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Disconnected);
_healthCollector.UpdateConnectionEndpoint(_connectionName, "Disconnected");
// Track unstable connections toward failover.
// If we were connected for less than the stability threshold, this counts
// as an unstable cycle (e.g., connect succeeded but heartbeat went stale).
var connectionDuration = DateTimeOffset.UtcNow - _lastConnectedAt;
if (_lastConnectedAt != default && connectionDuration < StableConnectionThreshold)
{
_consecutiveUnstableDisconnects++;
_log.Warning("[{0}] Unstable connection (lasted {1:F0}s) — consecutive unstable disconnects: {2}/{3}",
_connectionName, connectionDuration.TotalSeconds, _consecutiveUnstableDisconnects,
_backupConfig != null ? _failoverRetryCount : 0);
}
else
{
_consecutiveUnstableDisconnects = 0;
}
// Failover if we keep connecting and going stale repeatedly
if (_backupConfig != null && _consecutiveUnstableDisconnects >= _failoverRetryCount)
{
var previousEndpoint = _activeEndpoint;
_activeEndpoint = _activeEndpoint == ActiveEndpoint.Primary
? ActiveEndpoint.Backup
: ActiveEndpoint.Primary;
_consecutiveUnstableDisconnects = 0;
_consecutiveFailures = 0;
var newConfig = _activeEndpoint == ActiveEndpoint.Primary
? _primaryConfig
: _backupConfig;
// Dispose old adapter
_adapter.Disconnected -= OnAdapterDisconnected;
_ = _adapter.DisposeAsync().AsTask();
// Create new adapter for the target endpoint
_adapter = _factory.Create(_protocolType, newConfig);
_connectionDetails = newConfig;
_adapter.Disconnected += OnAdapterDisconnected;
_log.Warning("[{0}] Failing over from {1} to {2} (unstable connection pattern)",
_connectionName, previousEndpoint, _activeEndpoint);
if (_siteEventLogger != null)
{
_ = _siteEventLogger.LogEventAsync(
"connection", "Warning", null, _connectionName,
$"Failover from {previousEndpoint} to {_activeEndpoint} (unstable connection)",
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {StableConnectionThreshold.TotalSeconds:F0}s");
}
}
// Log disconnect to site event log
if (_siteEventLogger != null)
{

View File

@@ -347,7 +347,7 @@ public class DataConnectionActorTests : TestKit
var count = Interlocked.Increment(ref connectCount);
// count 1: initial connect → success
// count 2,3: reconnect failures
// count 4: reconnect success (resets counter)
// count 4: reconnect success
// count 5,6: reconnect failures again
// count 7: reconnect success again
return count switch
@@ -366,7 +366,7 @@ public class DataConnectionActorTests : TestKit
AwaitCondition(() => connectCount >= 1, TimeSpan.FromSeconds(2));
await Task.Delay(200);
// Disconnect: triggers 2 failures then success (count 2,3,4)
// Disconnect: triggers 1 unstable disconnect + 2 failures then success (count 2,3,4)
RaiseDisconnected(primaryAdapter);
// Wait for successful reconnect (count 4)
@@ -380,7 +380,8 @@ public class DataConnectionActorTests : TestKit
AwaitCondition(() => connectCount >= 7, TimeSpan.FromSeconds(5));
await Task.Delay(200);
// Factory should never be called — counter reset each time before reaching 3
// Factory should never be called — connection failures counter resets on each
// successful reconnect, and unstable disconnect counter is separate
_mockFactory.DidNotReceive().Create(Arg.Any<string>(), Arg.Any<IDictionary<string, string>>());
}