feat(dcl): failover on repeated unstable connections (connect-then-stale pattern)
Previously, failover only triggered when ConnectAsync failed consecutively. If a connection succeeded but went stale quickly (e.g., heartbeat timeout), the failure counter reset on each successful connect and failover never triggered. Added a separate _consecutiveUnstableDisconnects counter that increments when a connection lasts less than StableConnectionThreshold (60s) before disconnecting. When this counter reaches failoverRetryCount, the actor fails over to the backup endpoint. Stable connections (lasting >60s) reset this counter. The original connection-failure failover path is unchanged.
This commit is contained in:
@@ -77,6 +77,8 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
private readonly int _failoverRetryCount;
|
||||
private ActiveEndpoint _activeEndpoint = ActiveEndpoint.Primary;
|
||||
private int _consecutiveFailures;
|
||||
private int _consecutiveUnstableDisconnects;
|
||||
private DateTimeOffset _lastConnectedAt;
|
||||
|
||||
/// <summary>
|
||||
/// Captured Self reference for use from non-actor threads (event handlers, callbacks).
|
||||
@@ -180,9 +182,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
// ── Connected State ──
|
||||
|
||||
/// <summary>
|
||||
/// Minimum time connected before we consider the connection stable.
|
||||
/// If we disconnect before this, it counts as an unstable connection toward failover.
|
||||
/// </summary>
|
||||
private static readonly TimeSpan StableConnectionThreshold = TimeSpan.FromSeconds(60);
|
||||
|
||||
private void BecomeConnected()
|
||||
{
|
||||
_log.Info("[{0}] Entering Connected state", _connectionName);
|
||||
_lastConnectedAt = DateTimeOffset.UtcNow;
|
||||
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Connected);
|
||||
_healthCollector.UpdateTagResolution(_connectionName, _totalSubscribed, _resolvedTags);
|
||||
var endpointLabel = _backupConfig == null ? "Connected" : $"Connected to {_activeEndpoint.ToString().ToLower()}";
|
||||
@@ -236,6 +245,57 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Disconnected);
|
||||
_healthCollector.UpdateConnectionEndpoint(_connectionName, "Disconnected");
|
||||
|
||||
// Track unstable connections toward failover.
|
||||
// If we were connected for less than the stability threshold, this counts
|
||||
// as an unstable cycle (e.g., connect succeeded but heartbeat went stale).
|
||||
var connectionDuration = DateTimeOffset.UtcNow - _lastConnectedAt;
|
||||
if (_lastConnectedAt != default && connectionDuration < StableConnectionThreshold)
|
||||
{
|
||||
_consecutiveUnstableDisconnects++;
|
||||
_log.Warning("[{0}] Unstable connection (lasted {1:F0}s) — consecutive unstable disconnects: {2}/{3}",
|
||||
_connectionName, connectionDuration.TotalSeconds, _consecutiveUnstableDisconnects,
|
||||
_backupConfig != null ? _failoverRetryCount : 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
_consecutiveUnstableDisconnects = 0;
|
||||
}
|
||||
|
||||
// Failover if we keep connecting and going stale repeatedly
|
||||
if (_backupConfig != null && _consecutiveUnstableDisconnects >= _failoverRetryCount)
|
||||
{
|
||||
var previousEndpoint = _activeEndpoint;
|
||||
_activeEndpoint = _activeEndpoint == ActiveEndpoint.Primary
|
||||
? ActiveEndpoint.Backup
|
||||
: ActiveEndpoint.Primary;
|
||||
_consecutiveUnstableDisconnects = 0;
|
||||
_consecutiveFailures = 0;
|
||||
|
||||
var newConfig = _activeEndpoint == ActiveEndpoint.Primary
|
||||
? _primaryConfig
|
||||
: _backupConfig;
|
||||
|
||||
// Dispose old adapter
|
||||
_adapter.Disconnected -= OnAdapterDisconnected;
|
||||
_ = _adapter.DisposeAsync().AsTask();
|
||||
|
||||
// Create new adapter for the target endpoint
|
||||
_adapter = _factory.Create(_protocolType, newConfig);
|
||||
_connectionDetails = newConfig;
|
||||
_adapter.Disconnected += OnAdapterDisconnected;
|
||||
|
||||
_log.Warning("[{0}] Failing over from {1} to {2} (unstable connection pattern)",
|
||||
_connectionName, previousEndpoint, _activeEndpoint);
|
||||
|
||||
if (_siteEventLogger != null)
|
||||
{
|
||||
_ = _siteEventLogger.LogEventAsync(
|
||||
"connection", "Warning", null, _connectionName,
|
||||
$"Failover from {previousEndpoint} to {_activeEndpoint} (unstable connection)",
|
||||
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {StableConnectionThreshold.TotalSeconds:F0}s");
|
||||
}
|
||||
}
|
||||
|
||||
// Log disconnect to site event log
|
||||
if (_siteEventLogger != null)
|
||||
{
|
||||
|
||||
@@ -347,7 +347,7 @@ public class DataConnectionActorTests : TestKit
|
||||
var count = Interlocked.Increment(ref connectCount);
|
||||
// count 1: initial connect → success
|
||||
// count 2,3: reconnect failures
|
||||
// count 4: reconnect success (resets counter)
|
||||
// count 4: reconnect success
|
||||
// count 5,6: reconnect failures again
|
||||
// count 7: reconnect success again
|
||||
return count switch
|
||||
@@ -366,7 +366,7 @@ public class DataConnectionActorTests : TestKit
|
||||
AwaitCondition(() => connectCount >= 1, TimeSpan.FromSeconds(2));
|
||||
await Task.Delay(200);
|
||||
|
||||
// Disconnect: triggers 2 failures then success (count 2,3,4)
|
||||
// Disconnect: triggers 1 unstable disconnect + 2 failures then success (count 2,3,4)
|
||||
RaiseDisconnected(primaryAdapter);
|
||||
|
||||
// Wait for successful reconnect (count 4)
|
||||
@@ -380,7 +380,8 @@ public class DataConnectionActorTests : TestKit
|
||||
AwaitCondition(() => connectCount >= 7, TimeSpan.FromSeconds(5));
|
||||
await Task.Delay(200);
|
||||
|
||||
// Factory should never be called — counter reset each time before reaching 3
|
||||
// Factory should never be called — connection failures counter resets on each
|
||||
// successful reconnect, and unstable disconnect counter is separate
|
||||
_mockFactory.DidNotReceive().Create(Arg.Any<string>(), Arg.Any<IDictionary<string, string>>());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user