fix(data-connection-layer): resolve DataConnectionLayer-008,013 — O(1) unsubscribe via reverse index, atomic disconnect guard

This commit is contained in:
Joseph Doherty
2026-05-16 22:14:23 -04:00
parent 7d1cc5cbb4
commit ff4a4bdeb7
6 changed files with 196 additions and 24 deletions

View File

@@ -50,6 +50,13 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
/// </summary>
private readonly Dictionary<string, string> _subscriptionIds = new();
/// <summary>
/// DataConnectionLayer-008: reverse index of how many instances subscribe to each
/// tag path. Lets <see cref="HandleUnsubscribe"/> decide whether any other instance
/// still needs a tag in O(1) instead of scanning every instance's tag set.
/// </summary>
private readonly Dictionary<string, int> _tagSubscriberCount = new();
/// <summary>
/// Tags whose path resolution failed and are awaiting retry.
/// </summary>
@@ -600,7 +607,12 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
foreach (var result in msg.Results)
{
instanceTags.Add(result.TagPath);
// DataConnectionLayer-008: only a tag newly added to THIS instance's set
// increments the reference count, so the count stays an accurate "number
// of distinct instances subscribed to this tag".
if (instanceTags.Add(result.TagPath))
_tagSubscriberCount[result.TagPath] =
_tagSubscriberCount.GetValueOrDefault(result.TagPath) + 1;
// Re-check against current state: another subscribe may have resolved the
// same tag while this request's I/O was in flight.
@@ -687,20 +699,29 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
// WP-14: Cleanup on Instance Actor stop
foreach (var tagPath in tags)
{
// Check if any other instance is still subscribed to this tag
var otherSubscribers = _subscriptionsByInstance
.Where(kvp => kvp.Key != request.InstanceUniqueName && kvp.Value.Contains(tagPath))
.Any();
// DataConnectionLayer-008: drop this instance's reference; the tag is only
// released at the adapter when no other instance still subscribes to it.
// The reference count makes this O(1) instead of an O(instances) scan.
var remaining = _tagSubscriberCount.GetValueOrDefault(tagPath) - 1;
if (remaining > 0)
{
_tagSubscriberCount[tagPath] = remaining;
continue;
}
_tagSubscriberCount.Remove(tagPath);
if (!otherSubscribers && _subscriptionIds.TryGetValue(tagPath, out var subId))
// Last subscriber gone. A tag with a subscription id is a resolved tag;
// an unresolved tag never has a subscription id, so reaching this branch
// via TryGetValue means the tag was resolved — decrement _resolvedTags
// unconditionally (the previous `!_unresolvedTags.Contains` re-check after
// an unconditional Remove was always-true dead logic).
if (_subscriptionIds.TryGetValue(tagPath, out var subId))
{
_ = _adapter.UnsubscribeAsync(subId);
_subscriptionIds.Remove(tagPath);
_unresolvedTags.Remove(tagPath);
_resolutionInFlight.Remove(tagPath);
_totalSubscribed--;
if (!_unresolvedTags.Contains(tagPath))
_resolvedTags--;
_resolvedTags--;
// DataConnectionLayer-006: drop the tag's tracked quality so it is no
// longer counted by PushBadQualityForAllTags (which sets _tagsBadQuality
@@ -716,6 +737,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
}
}
}
else if (_unresolvedTags.Remove(tagPath))
{
// Last subscriber gone for a tag that had never resolved: stop
// retrying it and drop it from the subscribed total. The previous
// implementation never reached this case (its guard required a
// subscription id), so an unresolved tag leaked into the retry timer
// and TotalSubscribedTags forever after its instance unsubscribed.
_resolutionInFlight.Remove(tagPath);
_totalSubscribed--;
}
}
_subscriptionsByInstance.Remove(request.InstanceUniqueName);

View File

@@ -38,7 +38,12 @@ public class OpcUaDataConnection : IDataConnection
_logger = logger;
}
private volatile bool _disconnectFired;
// DataConnectionLayer-013: an int flag toggled with Interlocked.Exchange so the
// "only the first caller fires Disconnected" guard in RaiseDisconnected is genuinely
// atomic. A plain volatile bool gives visibility but no atomic check-then-set — two threads
// (e.g. the keep-alive thread and a ReadAsync failure path) could both observe it
// false and both raise the event. 0 = not fired, 1 = fired.
private int _disconnectFired;
public ConnectionHealth Status => _status;
public event Action? Disconnected;
@@ -82,7 +87,7 @@ public class OpcUaDataConnection : IDataConnection
await _client.ConnectAsync(_endpointUrl, options, cancellationToken);
_status = ConnectionHealth.Connected;
_disconnectFired = false;
Interlocked.Exchange(ref _disconnectFired, 0);
_logger.LogInformation("OPC UA connected to {Endpoint}", _endpointUrl);
await StartHeartbeatMonitorAsync(config.Heartbeat, cancellationToken);
@@ -285,12 +290,15 @@ public class OpcUaDataConnection : IDataConnection
/// <summary>
/// Marks the connection as disconnected and fires the Disconnected event once.
/// Thread-safe: only the first caller triggers the event.
/// Thread-safe: the firing guard is an atomic exchange
/// (<see cref="Interlocked.Exchange(ref int, int)"/>), so when several threads race
/// here — e.g. the keep-alive thread via <see cref="OnClientConnectionLost"/> and a
/// <c>ReadAsync</c> failure path — exactly one of them observes the 0→1 transition
/// and invokes <see cref="Disconnected"/>.
/// </summary>
private void RaiseDisconnected()
{
if (_disconnectFired) return;
_disconnectFired = true;
if (Interlocked.Exchange(ref _disconnectFired, 1) != 0) return;
_status = ConnectionHealth.Disconnected;
_logger.LogWarning("OPC UA connection to {Endpoint} lost", _endpointUrl);
Disconnected?.Invoke();

View File

@@ -24,7 +24,10 @@ public class RealOpcUaClient : IOpcUaClient
// Clear() is undefined behaviour, so they must be ConcurrentDictionary.
private readonly ConcurrentDictionary<string, MonitoredItem> _monitoredItems = new();
private readonly ConcurrentDictionary<string, Action<string, object?, DateTime, uint>> _callbacks = new();
private volatile bool _connectionLostFired;
// DataConnectionLayer-013: int flag toggled with Interlocked.Exchange so the
// once-only ConnectionLost guard in OnSessionKeepAlive is atomic, not just visible.
// 0 = not fired, 1 = fired.
private int _connectionLostFired;
private OpcUaConnectionOptions _options = new();
private readonly OpcUaGlobalOptions _globalOptions;
private readonly ILogger<RealOpcUaClient> _logger;
@@ -112,7 +115,7 @@ public class RealOpcUaClient : IOpcUaClient
"ScadaLink-DCL-Session", (uint)opts.SessionTimeoutMs, userIdentity, null, cancellationToken);
// Detect server going offline via keep-alive failures
_connectionLostFired = false;
Interlocked.Exchange(ref _connectionLostFired, 0);
_session.KeepAlive += OnSessionKeepAlive;
// Store options for monitored item creation
@@ -243,14 +246,15 @@ public class RealOpcUaClient : IOpcUaClient
/// <summary>
/// Called by the OPC UA SDK when a keep-alive response arrives (or fails).
/// When CurrentState is bad, the server is unreachable.
/// When CurrentState is bad, the server is unreachable. The once-only guard is an
/// atomic exchange (Interlocked.Exchange), so a burst of failed keep-alives raises
/// <see cref="ConnectionLost"/> exactly once.
/// </summary>
private void OnSessionKeepAlive(ISession session, KeepAliveEventArgs e)
{
if (ServiceResult.IsBad(e.Status))
{
if (_connectionLostFired) return;
_connectionLostFired = true;
if (Interlocked.Exchange(ref _connectionLostFired, 1) != 0) return;
ConnectionLost?.Invoke();
}
}