using System.Collections.Concurrent; using ScadaLink.Commons.Messages.Health; using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.HealthMonitoring; /// /// Collects health metrics from all site subsystems. /// Thread-safe: counters use Interlocked operations, connection/tag data uses ConcurrentDictionary. /// public class SiteHealthCollector : ISiteHealthCollector { private int _scriptErrorCount; private int _alarmErrorCount; private int _deadLetterCount; private int _siteAuditWriteFailures; private int _auditRedactionFailures; private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog; private readonly ConcurrentDictionary _connectionStatuses = new(); private readonly ConcurrentDictionary _tagResolutionCounts = new(); private readonly ConcurrentDictionary _connectionEndpoints = new(); private readonly ConcurrentDictionary _tagQualityCounts = new(); private IReadOnlyDictionary _sfBufferDepths = new Dictionary(); private int _deployedInstanceCount, _enabledInstanceCount, _disabledInstanceCount; private int _parkedMessageCount; private volatile string _nodeHostname = ""; private volatile IReadOnlyList? _clusterNodes; private volatile bool _isActiveNode; private readonly TimeProvider _timeProvider; /// /// Creates a collector. The stamps each /// report's timestamp; it defaults to and /// is injectable so the report timestamp is deterministically testable — /// consistent with the rest of the module's time-dependent classes. /// /// Optional custom time provider; defaults to system time. public SiteHealthCollector(TimeProvider? timeProvider = null) { _timeProvider = timeProvider ?? TimeProvider.System; } /// public void IncrementScriptError() { Interlocked.Increment(ref _scriptErrorCount); } /// public void IncrementAlarmError() { Interlocked.Increment(ref _alarmErrorCount); } /// public void IncrementDeadLetter() { Interlocked.Increment(ref _deadLetterCount); } /// public void IncrementSiteAuditWriteFailures() { Interlocked.Increment(ref _siteAuditWriteFailures); } /// public void IncrementAuditRedactionFailure() { Interlocked.Increment(ref _auditRedactionFailures); } /// public void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot) { _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot)); } /// public void UpdateConnectionHealth(string connectionName, ConnectionHealth health) { _connectionStatuses[connectionName] = health; } /// public void RemoveConnection(string connectionName) { _connectionStatuses.TryRemove(connectionName, out _); _tagResolutionCounts.TryRemove(connectionName, out _); _connectionEndpoints.TryRemove(connectionName, out _); _tagQualityCounts.TryRemove(connectionName, out _); } /// public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved) { _tagResolutionCounts[connectionName] = new TagResolutionStatus(totalSubscribed, successfullyResolved); } /// public void UpdateConnectionEndpoint(string connectionName, string endpoint) { _connectionEndpoints[connectionName] = endpoint; } /// public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain) { _tagQualityCounts[connectionName] = new TagQualityCounts(good, bad, uncertain); } /// public void SetParkedMessageCount(int count) { Interlocked.Exchange(ref _parkedMessageCount, count); } /// public void SetNodeHostname(string hostname) => _nodeHostname = hostname; /// public void SetClusterNodes(IReadOnlyList nodes) => _clusterNodes = nodes; /// public void SetStoreAndForwardDepths(IReadOnlyDictionary depths) { _sfBufferDepths = depths; } /// public void SetInstanceCounts(int deployed, int enabled, int disabled) { Interlocked.Exchange(ref _deployedInstanceCount, deployed); Interlocked.Exchange(ref _enabledInstanceCount, enabled); Interlocked.Exchange(ref _disabledInstanceCount, disabled); } /// public void SetActiveNode(bool isActive) => _isActiveNode = isActive; /// public bool IsActiveNode => _isActiveNode; /// public void AddIntervalCounters( int scriptErrors, int alarmErrors, int deadLetters, int siteAuditWriteFailures, int auditRedactionFailures) { // HealthMonitoring-017: each counter is restored atomically via // Interlocked.Add so an increment that arrived during the failed Send // (and therefore accumulated against the zero left by CollectReport's // Exchange) is correctly summed with the values being put back. No // ordering between the five Adds is required — they target independent // fields. if (scriptErrors != 0) Interlocked.Add(ref _scriptErrorCount, scriptErrors); if (alarmErrors != 0) Interlocked.Add(ref _alarmErrorCount, alarmErrors); if (deadLetters != 0) Interlocked.Add(ref _deadLetterCount, deadLetters); if (siteAuditWriteFailures != 0) Interlocked.Add(ref _siteAuditWriteFailures, siteAuditWriteFailures); if (auditRedactionFailures != 0) Interlocked.Add(ref _auditRedactionFailures, auditRedactionFailures); } /// public SiteHealthReport CollectReport(string siteId) { // Atomically read and reset the counters var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0); var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0); var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0); var siteAuditWriteFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0); var auditRedactionFailures = Interlocked.Exchange(ref _auditRedactionFailures, 0); // Snapshot current connection and tag resolution state var connectionStatuses = new Dictionary(_connectionStatuses); var tagResolution = new Dictionary(_tagResolutionCounts); var connectionEndpoints = new Dictionary(_connectionEndpoints); var tagQuality = new Dictionary(_tagQualityCounts); // Snapshot current S&F buffer depths var sfBufferDepths = new Dictionary(_sfBufferDepths); // Determine node role from active/standby state var nodeRole = _isActiveNode ? "Active" : "Standby"; return new SiteHealthReport( SiteId: siteId, SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number ReportTimestamp: _timeProvider.GetUtcNow(), DataConnectionStatuses: connectionStatuses, TagResolutionCounts: tagResolution, ScriptErrorCount: scriptErrors, AlarmEvaluationErrorCount: alarmErrors, StoreAndForwardBufferDepths: sfBufferDepths, DeadLetterCount: deadLetters, DeployedInstanceCount: _deployedInstanceCount, EnabledInstanceCount: _enabledInstanceCount, DisabledInstanceCount: _disabledInstanceCount, NodeRole: nodeRole, NodeHostname: _nodeHostname, DataConnectionEndpoints: connectionEndpoints, DataConnectionTagQuality: tagQuality, ParkedMessageCount: Interlocked.CompareExchange(ref _parkedMessageCount, 0, 0), ClusterNodes: _clusterNodes?.ToList(), SiteAuditWriteFailures: siteAuditWriteFailures, AuditRedactionFailure: auditRedactionFailures, SiteAuditBacklog: _siteAuditBacklog); } }