feat(dcl): failover on repeated unstable connections (connect-then-stale pattern)
Previously, failover only triggered when ConnectAsync failed consecutively. If a connection succeeded but went stale quickly (e.g., heartbeat timeout), the failure counter reset on each successful connect and failover never triggered. Added a separate _consecutiveUnstableDisconnects counter that increments when a connection lasts less than StableConnectionThreshold (60s) before disconnecting. When this counter reaches failoverRetryCount, the actor fails over to the backup endpoint. Stable connections (lasting >60s) reset this counter. The original connection-failure failover path is unchanged.
This commit is contained in:
@@ -77,6 +77,8 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
private readonly int _failoverRetryCount;
|
||||
private ActiveEndpoint _activeEndpoint = ActiveEndpoint.Primary;
|
||||
private int _consecutiveFailures;
|
||||
private int _consecutiveUnstableDisconnects;
|
||||
private DateTimeOffset _lastConnectedAt;
|
||||
|
||||
/// <summary>
|
||||
/// Captured Self reference for use from non-actor threads (event handlers, callbacks).
|
||||
@@ -180,9 +182,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
// ── Connected State ──
|
||||
|
||||
/// <summary>
|
||||
/// Minimum time connected before we consider the connection stable.
|
||||
/// If we disconnect before this, it counts as an unstable connection toward failover.
|
||||
/// </summary>
|
||||
private static readonly TimeSpan StableConnectionThreshold = TimeSpan.FromSeconds(60);
|
||||
|
||||
private void BecomeConnected()
|
||||
{
|
||||
_log.Info("[{0}] Entering Connected state", _connectionName);
|
||||
_lastConnectedAt = DateTimeOffset.UtcNow;
|
||||
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Connected);
|
||||
_healthCollector.UpdateTagResolution(_connectionName, _totalSubscribed, _resolvedTags);
|
||||
var endpointLabel = _backupConfig == null ? "Connected" : $"Connected to {_activeEndpoint.ToString().ToLower()}";
|
||||
@@ -236,6 +245,57 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Disconnected);
|
||||
_healthCollector.UpdateConnectionEndpoint(_connectionName, "Disconnected");
|
||||
|
||||
// Track unstable connections toward failover.
|
||||
// If we were connected for less than the stability threshold, this counts
|
||||
// as an unstable cycle (e.g., connect succeeded but heartbeat went stale).
|
||||
var connectionDuration = DateTimeOffset.UtcNow - _lastConnectedAt;
|
||||
if (_lastConnectedAt != default && connectionDuration < StableConnectionThreshold)
|
||||
{
|
||||
_consecutiveUnstableDisconnects++;
|
||||
_log.Warning("[{0}] Unstable connection (lasted {1:F0}s) — consecutive unstable disconnects: {2}/{3}",
|
||||
_connectionName, connectionDuration.TotalSeconds, _consecutiveUnstableDisconnects,
|
||||
_backupConfig != null ? _failoverRetryCount : 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
_consecutiveUnstableDisconnects = 0;
|
||||
}
|
||||
|
||||
// Failover if we keep connecting and going stale repeatedly
|
||||
if (_backupConfig != null && _consecutiveUnstableDisconnects >= _failoverRetryCount)
|
||||
{
|
||||
var previousEndpoint = _activeEndpoint;
|
||||
_activeEndpoint = _activeEndpoint == ActiveEndpoint.Primary
|
||||
? ActiveEndpoint.Backup
|
||||
: ActiveEndpoint.Primary;
|
||||
_consecutiveUnstableDisconnects = 0;
|
||||
_consecutiveFailures = 0;
|
||||
|
||||
var newConfig = _activeEndpoint == ActiveEndpoint.Primary
|
||||
? _primaryConfig
|
||||
: _backupConfig;
|
||||
|
||||
// Dispose old adapter
|
||||
_adapter.Disconnected -= OnAdapterDisconnected;
|
||||
_ = _adapter.DisposeAsync().AsTask();
|
||||
|
||||
// Create new adapter for the target endpoint
|
||||
_adapter = _factory.Create(_protocolType, newConfig);
|
||||
_connectionDetails = newConfig;
|
||||
_adapter.Disconnected += OnAdapterDisconnected;
|
||||
|
||||
_log.Warning("[{0}] Failing over from {1} to {2} (unstable connection pattern)",
|
||||
_connectionName, previousEndpoint, _activeEndpoint);
|
||||
|
||||
if (_siteEventLogger != null)
|
||||
{
|
||||
_ = _siteEventLogger.LogEventAsync(
|
||||
"connection", "Warning", null, _connectionName,
|
||||
$"Failover from {previousEndpoint} to {_activeEndpoint} (unstable connection)",
|
||||
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {StableConnectionThreshold.TotalSeconds:F0}s");
|
||||
}
|
||||
}
|
||||
|
||||
// Log disconnect to site event log
|
||||
if (_siteEventLogger != null)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user