Files
ScadaBridge/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs
T
Joseph Doherty dd3351da93 feat(health): SiteAuditWriteFailures counter + AuditLog bridge (#23)
Bundle G of Audit Log #23 M2. Bridges the FallbackAuditWriter primary-
failure counter into the Site Health Monitoring report payload so a
sustained audit-write outage surfaces on /monitoring/health instead of
disappearing into a NoOp sink.

- SiteHealthReport: add SiteAuditWriteFailures (defaulted, additive).
- ISiteHealthCollector + SiteHealthCollector: new
  IncrementSiteAuditWriteFailures() counter, per-interval reset
  semantics matching ScriptErrorCount / DeadLetterCount.
- HealthMetricsAuditWriteFailureCounter: adapter forwarding
  IAuditWriteFailureCounter.Increment() to the collector.
- AddAuditLogHealthMetricsBridge(): swaps the NoOp default
  registration for the real bridge; called from
  SiteServiceRegistration after AddSiteHealthMonitoring + AddAuditLog.
- Existing host-wiring test updated: site composition now resolves
  HealthMetricsAuditWriteFailureCounter (not NoOp).

Tests: HealthMonitoring 60 -> 63 (3 new), AuditLog 56 -> 59 (3 new),
full solution green.
2026-05-20 13:22:25 -04:00

196 lines
8.0 KiB
C#

using System.Collections.Concurrent;
using ScadaLink.Commons.Messages.Health;
using ScadaLink.Commons.Types.Enums;
namespace ScadaLink.HealthMonitoring;
/// <summary>
/// Collects health metrics from all site subsystems.
/// Thread-safe: per-interval counters use <see cref="Interlocked"/> operations,
/// per-connection data lives in <see cref="ConcurrentDictionary{TKey,TValue}"/>
/// instances, and all remaining state is published either through a volatile
/// reference or through an atomic integer write.
/// </summary>
public class SiteHealthCollector : ISiteHealthCollector
{
    /// <summary>
    /// Immutable triple of instance counts. It is replaced with a single reference
    /// write so that a report never mixes values coming from two different
    /// <see cref="SetInstanceCounts"/> calls.
    /// </summary>
    private sealed record InstanceCounts(int Deployed, int Enabled, int Disabled);

    // Per-interval counters: read and reset to zero by CollectReport.
    private int _scriptErrorCount;
    private int _alarmErrorCount;
    private int _deadLetterCount;
    private int _siteAuditWriteFailures;

    // Current-state data keyed by connection name: never reset by CollectReport.
    private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
    private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
    private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
    private readonly ConcurrentDictionary<string, TagQualityCounts> _tagQualityCounts = new();

    // Volatile so a snapshot handed over on one thread is visible to the thread
    // that collects the report, like the other reference-typed state below.
    private volatile IReadOnlyDictionary<string, int> _sfBufferDepths = new Dictionary<string, int>();
    private volatile InstanceCounts _instanceCounts = new(0, 0, 0);

    private int _parkedMessageCount;
    private volatile string _nodeHostname = "";
    private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;
    private volatile bool _isActiveNode;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a collector. The <paramref name="timeProvider"/> stamps each
    /// report's timestamp; it defaults to <see cref="TimeProvider.System"/> and
    /// is injectable so the report timestamp is deterministically testable —
    /// consistent with the rest of the module's time-dependent classes.
    /// </summary>
    /// <param name="timeProvider">Clock used for report timestamps, or <c>null</c> for the system clock.</param>
    public SiteHealthCollector(TimeProvider? timeProvider = null)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Increment the script error counter. Covers unhandled exceptions,
    /// timeouts, and recursion limit violations.
    /// </summary>
    public void IncrementScriptError()
    {
        Interlocked.Increment(ref _scriptErrorCount);
    }

    /// <summary>
    /// Increment the alarm evaluation error counter.
    /// </summary>
    public void IncrementAlarmError()
    {
        Interlocked.Increment(ref _alarmErrorCount);
    }

    /// <summary>
    /// Increment the dead letter counter for this reporting interval.
    /// </summary>
    public void IncrementDeadLetter()
    {
        Interlocked.Increment(ref _deadLetterCount);
    }

    /// <summary>
    /// Audit Log (#23) Bundle G — increment the per-interval count of
    /// <c>FallbackAuditWriter</c> primary failures. Bridged from the
    /// <c>IAuditWriteFailureCounter</c> binding registered via
    /// <c>AddAuditLogHealthMetricsBridge()</c>; reset every interval together
    /// with the other per-interval counters.
    /// </summary>
    public void IncrementSiteAuditWriteFailures()
    {
        Interlocked.Increment(ref _siteAuditWriteFailures);
    }

    /// <summary>
    /// Update the health status for a named data connection.
    /// Called by DCL when connection state changes.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="health">New health value; replaces any previous value for the connection.</param>
    public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
    {
        _connectionStatuses[connectionName] = health;
    }

    /// <summary>
    /// Remove a connection from tracking (e.g., on connection disposal).
    /// Drops its health status, tag resolution counts, endpoint and tag quality counts.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection to forget.</param>
    public void RemoveConnection(string connectionName)
    {
        _connectionStatuses.TryRemove(connectionName, out _);
        _tagResolutionCounts.TryRemove(connectionName, out _);
        _connectionEndpoints.TryRemove(connectionName, out _);
        _tagQualityCounts.TryRemove(connectionName, out _);
    }

    /// <summary>
    /// Update tag resolution counts for a named data connection.
    /// Called by DCL after tag resolution attempts.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="totalSubscribed">Total number of subscribed tags.</param>
    /// <param name="successfullyResolved">Number of those tags that resolved successfully.</param>
    public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved)
    {
        _tagResolutionCounts[connectionName] = new TagResolutionStatus(totalSubscribed, successfullyResolved);
    }

    /// <summary>
    /// Record the endpoint string for a named data connection, replacing any previous value.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="endpoint">Endpoint text to include in subsequent reports.</param>
    public void UpdateConnectionEndpoint(string connectionName, string endpoint)
    {
        _connectionEndpoints[connectionName] = endpoint;
    }

    /// <summary>
    /// Record the tag quality counts for a named data connection, replacing any previous value.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="good">Number of tags with good quality.</param>
    /// <param name="bad">Number of tags with bad quality.</param>
    /// <param name="uncertain">Number of tags with uncertain quality.</param>
    public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain)
    {
        _tagQualityCounts[connectionName] = new TagQualityCounts(good, bad, uncertain);
    }

    /// <summary>
    /// Set the current parked message count. This is a current-state value:
    /// it is reported as-is and is not reset when a report is collected.
    /// </summary>
    /// <param name="count">Current number of parked messages.</param>
    public void SetParkedMessageCount(int count)
    {
        Interlocked.Exchange(ref _parkedMessageCount, count);
    }

    /// <summary>
    /// Set the hostname placed in the <c>NodeHostname</c> field of subsequent reports.
    /// </summary>
    /// <param name="hostname">Hostname of this node.</param>
    public void SetNodeHostname(string hostname) => _nodeHostname = hostname;

    /// <summary>
    /// Set the cluster node list placed in the <c>ClusterNodes</c> field of subsequent
    /// reports. Until this is called the field is reported as <c>null</c>.
    /// </summary>
    /// <param name="nodes">Cluster node statuses; copied into a new list each time a report is collected.</param>
    public void SetClusterNodes(IReadOnlyList<Commons.Messages.Health.NodeStatus> nodes) => _clusterNodes = nodes;

    /// <summary>
    /// Set the current store-and-forward buffer depths snapshot.
    /// Called before report collection with data from the S&amp;F service.
    /// The reference is stored as-is and copied when a report is collected, so the
    /// caller should not mutate the dictionary after handing it over.
    /// </summary>
    /// <param name="depths">Buffer depths to report.</param>
    /// <exception cref="ArgumentNullException"><paramref name="depths"/> is <c>null</c>.</exception>
    public void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths)
    {
        // Reject null here: storing it would make every later CollectReport call
        // throw from the Dictionary copy constructor. Failing at the setter keeps
        // the last good snapshot in place and points at the offending caller.
        ArgumentNullException.ThrowIfNull(depths);

        _sfBufferDepths = depths;
    }

    /// <summary>
    /// Set the current instance counts.
    /// Called by the Deployment Manager after instance state changes.
    /// The three values are published together, so a report always carries
    /// counts that came from the same call.
    /// </summary>
    /// <param name="deployed">Number of deployed instances.</param>
    /// <param name="enabled">Number of enabled instances.</param>
    /// <param name="disabled">Number of disabled instances.</param>
    public void SetInstanceCounts(int deployed, int enabled, int disabled)
    {
        _instanceCounts = new InstanceCounts(deployed, enabled, disabled);
    }

    /// <summary>
    /// Mark this node as active or standby; drives the <c>NodeRole</c> field of subsequent reports.
    /// </summary>
    /// <param name="isActive"><c>true</c> when this node is the active node.</param>
    public void SetActiveNode(bool isActive) => _isActiveNode = isActive;

    /// <summary>
    /// Whether this node was last marked active via <see cref="SetActiveNode"/>.
    /// </summary>
    public bool IsActiveNode => _isActiveNode;

    /// <summary>
    /// Collect the current health report for the site and reset interval counters.
    /// Connection statuses, tag resolution counts, endpoints, tag quality, buffer depths,
    /// instance counts and the parked message count are NOT reset (they reflect current state).
    /// Script errors, alarm errors, dead letters and site audit write failures ARE reset
    /// (they are per-interval counts).
    /// </summary>
    /// <param name="siteId">Identifier of the site, copied into the report.</param>
    /// <returns>
    /// A report holding private copies of all collections, stamped with the injected
    /// clock's current UTC time and with <c>SequenceNumber</c> left at 0 for the caller to assign.
    /// </returns>
    public SiteHealthReport CollectReport(string siteId)
    {
        // Atomically read and reset the per-interval counters.
        var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0);
        var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0);
        var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0);
        var siteAuditWriteFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0);

        // Snapshot current per-connection state so the report owns its collections.
        var connectionStatuses = new Dictionary<string, ConnectionHealth>(_connectionStatuses);
        var tagResolution = new Dictionary<string, TagResolutionStatus>(_tagResolutionCounts);
        var connectionEndpoints = new Dictionary<string, string>(_connectionEndpoints);
        var tagQuality = new Dictionary<string, TagQualityCounts>(_tagQualityCounts);

        // Snapshot current S&F buffer depths.
        var sfBufferDepths = new Dictionary<string, int>(_sfBufferDepths);

        // Single volatile read: all three counts come from the same SetInstanceCounts call.
        var instanceCounts = _instanceCounts;

        // Determine node role from active/standby state.
        var nodeRole = _isActiveNode ? "Active" : "Standby";

        return new SiteHealthReport(
            SiteId: siteId,
            SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
            ReportTimestamp: _timeProvider.GetUtcNow(),
            DataConnectionStatuses: connectionStatuses,
            TagResolutionCounts: tagResolution,
            ScriptErrorCount: scriptErrors,
            AlarmEvaluationErrorCount: alarmErrors,
            StoreAndForwardBufferDepths: sfBufferDepths,
            DeadLetterCount: deadLetters,
            DeployedInstanceCount: instanceCounts.Deployed,
            EnabledInstanceCount: instanceCounts.Enabled,
            DisabledInstanceCount: instanceCounts.Disabled,
            NodeRole: nodeRole,
            NodeHostname: _nodeHostname,
            DataConnectionEndpoints: connectionEndpoints,
            DataConnectionTagQuality: tagQuality,
            ParkedMessageCount: Volatile.Read(ref _parkedMessageCount),
            ClusterNodes: _clusterNodes?.ToList(),
            SiteAuditWriteFailures: siteAuditWriteFailures);
    }
}