From 5fdeaf613f0dda58788ddc3ad2020ddc09e499e8 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Tue, 24 Mar 2026 15:23:54 -0400 Subject: [PATCH] feat(dcl): failover on repeated unstable connections (connect-then-stale pattern) Previously, failover only triggered when ConnectAsync failed consecutively. If a connection succeeded but went stale quickly (e.g., heartbeat timeout), the failure counter reset on each successful connect and failover never triggered. Added a separate _consecutiveUnstableDisconnects counter that increments when a connection lasts less than StableConnectionThreshold (60s) before disconnecting. When this counter reaches failoverRetryCount, the actor fails over to the backup endpoint. Stable connections (lasting >60s) reset this counter. The original connection-failure failover path is unchanged. --- .../Actors/DataConnectionActor.cs | 60 +++++++++++++++++++ .../DataConnectionActorTests.cs | 7 ++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs b/src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs index 16658cb..50a4904 100644 --- a/src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs +++ b/src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs @@ -77,6 +77,8 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers private readonly int _failoverRetryCount; private ActiveEndpoint _activeEndpoint = ActiveEndpoint.Primary; private int _consecutiveFailures; + private int _consecutiveUnstableDisconnects; + private DateTimeOffset _lastConnectedAt; /// /// Captured Self reference for use from non-actor threads (event handlers, callbacks). @@ -180,9 +182,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers // ── Connected State ── + /// + /// Minimum time connected before we consider the connection stable. + /// If we disconnect before this, it counts as an unstable connection toward failover. + /// + private static readonly TimeSpan StableConnectionThreshold = TimeSpan.FromSeconds(60); + private void BecomeConnected() { _log.Info("[{0}] Entering Connected state", _connectionName); + _lastConnectedAt = DateTimeOffset.UtcNow; _healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Connected); _healthCollector.UpdateTagResolution(_connectionName, _totalSubscribed, _resolvedTags); var endpointLabel = _backupConfig == null ? "Connected" : $"Connected to {_activeEndpoint.ToString().ToLower()}"; @@ -236,6 +245,57 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers _healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Disconnected); _healthCollector.UpdateConnectionEndpoint(_connectionName, "Disconnected"); + // Track unstable connections toward failover. + // If we were connected for less than the stability threshold, this counts + // as an unstable cycle (e.g., connect succeeded but heartbeat went stale). + var connectionDuration = DateTimeOffset.UtcNow - _lastConnectedAt; + if (_lastConnectedAt != default && connectionDuration < StableConnectionThreshold) + { + _consecutiveUnstableDisconnects++; + _log.Warning("[{0}] Unstable connection (lasted {1:F0}s) — consecutive unstable disconnects: {2}/{3}", + _connectionName, connectionDuration.TotalSeconds, _consecutiveUnstableDisconnects, + _backupConfig != null ? _failoverRetryCount : 0); + } + else + { + _consecutiveUnstableDisconnects = 0; + } + + // Failover if we keep connecting and going stale repeatedly + if (_backupConfig != null && _consecutiveUnstableDisconnects >= _failoverRetryCount) + { + var previousEndpoint = _activeEndpoint; + _activeEndpoint = _activeEndpoint == ActiveEndpoint.Primary + ? ActiveEndpoint.Backup + : ActiveEndpoint.Primary; + _consecutiveUnstableDisconnects = 0; + _consecutiveFailures = 0; + + var newConfig = _activeEndpoint == ActiveEndpoint.Primary + ? _primaryConfig + : _backupConfig; + + // Dispose old adapter + _adapter.Disconnected -= OnAdapterDisconnected; + _ = _adapter.DisposeAsync().AsTask(); + + // Create new adapter for the target endpoint + _adapter = _factory.Create(_protocolType, newConfig); + _connectionDetails = newConfig; + _adapter.Disconnected += OnAdapterDisconnected; + + _log.Warning("[{0}] Failing over from {1} to {2} (unstable connection pattern)", + _connectionName, previousEndpoint, _activeEndpoint); + + if (_siteEventLogger != null) + { + _ = _siteEventLogger.LogEventAsync( + "connection", "Warning", null, _connectionName, + $"Failover from {previousEndpoint} to {_activeEndpoint} (unstable connection)", + $"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {StableConnectionThreshold.TotalSeconds:F0}s"); + } + } + // Log disconnect to site event log if (_siteEventLogger != null) { diff --git a/tests/ScadaLink.DataConnectionLayer.Tests/DataConnectionActorTests.cs b/tests/ScadaLink.DataConnectionLayer.Tests/DataConnectionActorTests.cs index 6bb0396..8fce188 100644 --- a/tests/ScadaLink.DataConnectionLayer.Tests/DataConnectionActorTests.cs +++ b/tests/ScadaLink.DataConnectionLayer.Tests/DataConnectionActorTests.cs @@ -347,7 +347,7 @@ public class DataConnectionActorTests : TestKit var count = Interlocked.Increment(ref connectCount); // count 1: initial connect → success // count 2,3: reconnect failures - // count 4: reconnect success (resets counter) + // count 4: reconnect success // count 5,6: reconnect failures again // count 7: reconnect success again return count switch @@ -366,7 +366,7 @@ public class DataConnectionActorTests : TestKit AwaitCondition(() => connectCount >= 1, TimeSpan.FromSeconds(2)); await Task.Delay(200); - // Disconnect: triggers 2 failures then success (count 2,3,4) + // Disconnect: triggers 1 unstable disconnect + 2 failures then success (count 2,3,4) RaiseDisconnected(primaryAdapter); // Wait for successful reconnect (count 4) @@ -380,7 +380,8 @@ public class DataConnectionActorTests : TestKit AwaitCondition(() => connectCount >= 7, TimeSpan.FromSeconds(5)); await Task.Delay(200); - // Factory should never be called — counter reset each time before reaching 3 + // Factory should never be called — connection failures counter resets on each + // successful reconnect, and unstable disconnect counter is separate _mockFactory.DidNotReceive().Create(Arg.Any(), Arg.Any>()); }