Files
ScadaBridge/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs
T
Joseph Doherty 6ae0fea558 fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown
semantics, and audit-row coverage across 9 modules. Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the
  captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS,
  threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM
  during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since`
  when any row's idempotent insert is still being retried (per-EventId
  retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical
  abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's
  SPLIT loop — by class-doc construction the catch could only mask real
  failures and let the next iteration create permanent partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot
  per-interval counters before sending, restore via new
  ISiteHealthCollector.AddIntervalCounters on transport failure so counts
  aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks
  via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep
  with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its
  own try/catch so a throw doesn't leak the relay actor or _activeStreams
  entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync
  (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config
  prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised
  MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the
  per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects
  empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/
  DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted
from prior sessions.

20+ new regression tests across 11 test projects; build clean; affected
suites all green. README regenerated: 75 open (was 93).
2026-05-28 07:13:28 -04:00

212 lines
8.4 KiB
C#

using System.Collections.Concurrent;
using ScadaLink.Commons.Messages.Health;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
namespace ScadaLink.HealthMonitoring;
/// <summary>
/// Collects health metrics from all site subsystems and assembles them into a
/// <see cref="SiteHealthReport"/> on demand.
/// Thread-safe: per-interval counters use Interlocked operations, connection/tag
/// data uses ConcurrentDictionary, and the remaining state is published through
/// volatile fields (or Interlocked/Volatile accessors) so a report built on one
/// thread observes the latest values written on another.
/// </summary>
public class SiteHealthCollector : ISiteHealthCollector
{
    /// <summary>
    /// Immutable triple of instance counts. The three values are swapped in as a
    /// single reference so a report never mixes counts that came from two
    /// different <c>SetInstanceCounts</c> calls.
    /// </summary>
    private sealed record InstanceCounts(int Deployed, int Enabled, int Disabled);

    // Per-interval counters: bumped by the Increment* methods, read-and-zeroed
    // by CollectReport, and topped back up by AddIntervalCounters.
    private int _scriptErrorCount;
    private int _alarmErrorCount;
    private int _deadLetterCount;
    private int _siteAuditWriteFailures;
    private int _auditRedactionFailures;

    // Latest-value state keyed by connection name.
    private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
    private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
    private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
    private readonly ConcurrentDictionary<string, TagQualityCounts> _tagQualityCounts = new();

    // Latest-value state that is replaced wholesale by its setter. Every
    // reference here is volatile so the reporting thread sees the newest one.
    private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog;
    private volatile IReadOnlyDictionary<string, int> _sfBufferDepths = new Dictionary<string, int>();
    private volatile InstanceCounts _instanceCounts = new(0, 0, 0);
    private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;
    private volatile string _nodeHostname = "";
    private volatile bool _isActiveNode;
    private int _parkedMessageCount;

    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a collector. The <paramref name="timeProvider"/> stamps each
    /// report's timestamp; it defaults to <see cref="TimeProvider.System"/> and
    /// is injectable so the report timestamp is deterministically testable.
    /// </summary>
    /// <param name="timeProvider">Optional custom time provider; defaults to system time.</param>
    public SiteHealthCollector(TimeProvider? timeProvider = null)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <inheritdoc />
    public void IncrementScriptError()
    {
        Interlocked.Increment(ref _scriptErrorCount);
    }

    /// <inheritdoc />
    public void IncrementAlarmError()
    {
        Interlocked.Increment(ref _alarmErrorCount);
    }

    /// <inheritdoc />
    public void IncrementDeadLetter()
    {
        Interlocked.Increment(ref _deadLetterCount);
    }

    /// <inheritdoc />
    public void IncrementSiteAuditWriteFailures()
    {
        Interlocked.Increment(ref _siteAuditWriteFailures);
    }

    /// <inheritdoc />
    public void IncrementAuditRedactionFailure()
    {
        Interlocked.Increment(ref _auditRedactionFailures);
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is null.</exception>
    public void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot)
    {
        _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
    }

    /// <inheritdoc />
    public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
    {
        _connectionStatuses[connectionName] = health;
    }

    /// <inheritdoc />
    public void RemoveConnection(string connectionName)
    {
        // Drop the connection from every per-connection map so none of them
        // keeps reporting an entry for it.
        _connectionStatuses.TryRemove(connectionName, out _);
        _tagResolutionCounts.TryRemove(connectionName, out _);
        _connectionEndpoints.TryRemove(connectionName, out _);
        _tagQualityCounts.TryRemove(connectionName, out _);
    }

    /// <inheritdoc />
    public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved)
    {
        _tagResolutionCounts[connectionName] = new TagResolutionStatus(totalSubscribed, successfullyResolved);
    }

    /// <inheritdoc />
    public void UpdateConnectionEndpoint(string connectionName, string endpoint)
    {
        _connectionEndpoints[connectionName] = endpoint;
    }

    /// <inheritdoc />
    public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain)
    {
        _tagQualityCounts[connectionName] = new TagQualityCounts(good, bad, uncertain);
    }

    /// <inheritdoc />
    public void SetParkedMessageCount(int count)
    {
        Interlocked.Exchange(ref _parkedMessageCount, count);
    }

    /// <inheritdoc />
    public void SetNodeHostname(string hostname) => _nodeHostname = hostname;

    /// <inheritdoc />
    public void SetClusterNodes(IReadOnlyList<Commons.Messages.Health.NodeStatus> nodes) => _clusterNodes = nodes;

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="depths"/> is null.</exception>
    public void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths)
    {
        // Reject null here: CollectReport copies this dictionary, so a stored
        // null would make every subsequent report throw instead of just this call.
        _sfBufferDepths = depths ?? throw new ArgumentNullException(nameof(depths));
    }

    /// <inheritdoc />
    public void SetInstanceCounts(int deployed, int enabled, int disabled)
    {
        // Publish all three values with a single reference write so a
        // concurrent CollectReport sees either the old triple or the new one.
        _instanceCounts = new InstanceCounts(deployed, enabled, disabled);
    }

    /// <inheritdoc />
    public void SetActiveNode(bool isActive) => _isActiveNode = isActive;

    /// <inheritdoc />
    public bool IsActiveNode => _isActiveNode;

    /// <inheritdoc />
    public void AddIntervalCounters(
        int scriptErrors,
        int alarmErrors,
        int deadLetters,
        int siteAuditWriteFailures,
        int auditRedactionFailures)
    {
        // HealthMonitoring-017: each counter is restored atomically via
        // Interlocked.Add so an increment that arrived during the failed Send
        // (and therefore accumulated against the zero left by CollectReport's
        // Exchange) is correctly summed with the values being put back. No
        // ordering between the five Adds is required — they target independent
        // fields.
        RestoreCounter(ref _scriptErrorCount, scriptErrors);
        RestoreCounter(ref _alarmErrorCount, alarmErrors);
        RestoreCounter(ref _deadLetterCount, deadLetters);
        RestoreCounter(ref _siteAuditWriteFailures, siteAuditWriteFailures);
        RestoreCounter(ref _auditRedactionFailures, auditRedactionFailures);
    }

    /// <inheritdoc />
    public SiteHealthReport CollectReport(string siteId)
    {
        // Atomically read and reset the per-interval counters.
        var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0);
        var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0);
        var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0);
        var siteAuditWriteFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0);
        var auditRedactionFailures = Interlocked.Exchange(ref _auditRedactionFailures, 0);

        // Copy the per-connection state so the report holds its own
        // dictionaries rather than the live concurrent ones.
        var connectionStatuses = new Dictionary<string, ConnectionHealth>(_connectionStatuses);
        var tagResolution = new Dictionary<string, TagResolutionStatus>(_tagResolutionCounts);
        var connectionEndpoints = new Dictionary<string, string>(_connectionEndpoints);
        var tagQuality = new Dictionary<string, TagQualityCounts>(_tagQualityCounts);

        // Copy the current S&F buffer depths.
        var sfBufferDepths = new Dictionary<string, int>(_sfBufferDepths);

        // One volatile read yields a mutually consistent
        // deployed/enabled/disabled triple.
        var instanceCounts = _instanceCounts;

        // Determine node role from active/standby state.
        var nodeRole = _isActiveNode ? "Active" : "Standby";

        return new SiteHealthReport(
            SiteId: siteId,
            SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
            ReportTimestamp: _timeProvider.GetUtcNow(),
            DataConnectionStatuses: connectionStatuses,
            TagResolutionCounts: tagResolution,
            ScriptErrorCount: scriptErrors,
            AlarmEvaluationErrorCount: alarmErrors,
            StoreAndForwardBufferDepths: sfBufferDepths,
            DeadLetterCount: deadLetters,
            DeployedInstanceCount: instanceCounts.Deployed,
            EnabledInstanceCount: instanceCounts.Enabled,
            DisabledInstanceCount: instanceCounts.Disabled,
            NodeRole: nodeRole,
            NodeHostname: _nodeHostname,
            DataConnectionEndpoints: connectionEndpoints,
            DataConnectionTagQuality: tagQuality,
            ParkedMessageCount: Volatile.Read(ref _parkedMessageCount),
            ClusterNodes: _clusterNodes?.ToList(),
            SiteAuditWriteFailures: siteAuditWriteFailures,
            AuditRedactionFailure: auditRedactionFailures,
            SiteAuditBacklog: _siteAuditBacklog);
    }

    /// <summary>
    /// Atomically adds <paramref name="amount"/> to <paramref name="counter"/>,
    /// skipping the interlocked operation when there is nothing to add.
    /// </summary>
    private static void RestoreCounter(ref int counter, int amount)
    {
        if (amount != 0)
        {
            Interlocked.Add(ref counter, amount);
        }
    }
}