New fields in SiteHealthReport: NodeHostname, DataConnectionEndpoints (primary/secondary), DataConnectionTagQuality (good/bad/uncertain), ParkedMessageCount. New collector methods to populate them. Health dashboard redesigned to match mockup: Nodes | Data Connections (with per-connection tag quality) | Instances + S&F Buffers | Error Counts + Parked Messages. Site names resolved from repository.
165 lines
6.5 KiB
C#
165 lines
6.5 KiB
C#
using System.Collections.Concurrent;
|
|
using ScadaLink.Commons.Messages.Health;
|
|
using ScadaLink.Commons.Types.Enums;
|
|
|
|
namespace ScadaLink.HealthMonitoring;
|
|
|
|
/// <summary>
/// Collects health metrics from all site subsystems and assembles them into a
/// <see cref="SiteHealthReport"/>.
/// </summary>
/// <remarks>
/// Thread-safety: interval counters use <c>Interlocked</c> operations, per-connection data is
/// held in <c>ConcurrentDictionary</c> instances, and the remaining state (instance counts,
/// store-and-forward depths, hostname, active flag) is published through volatile fields that
/// hold immutable or privately owned values.
/// </remarks>
public class SiteHealthCollector : ISiteHealthCollector
{
    // Per-interval counters: read and reset to zero by CollectReport.
    private int _scriptErrorCount;
    private int _alarmErrorCount;
    private int _deadLetterCount;

    // Current-state value: overwritten by SetParkedMessageCount, never reset by CollectReport.
    private int _parkedMessageCount;

    // Per-connection state, keyed by connection name.
    private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
    private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
    private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
    private readonly ConcurrentDictionary<string, TagQualityCounts> _tagQualityCounts = new();

    // Privately owned copy of the depths most recently passed to SetStoreAndForwardDepths.
    // Volatile so the reference written by one thread is visible to the collecting thread.
    private volatile IReadOnlyDictionary<string, int> _sfBufferDepths = new Dictionary<string, int>();

    // The three instance counts travel together in one immutable object so that a report
    // can never combine values from two different SetInstanceCounts calls.
    private volatile InstanceCounts _instanceCounts = new(0, 0, 0);

    private volatile string _nodeHostname = "";
    private volatile bool _isActiveNode;

    /// <summary>
    /// Increment the script error counter. Covers unhandled exceptions,
    /// timeouts, and recursion limit violations.
    /// </summary>
    public void IncrementScriptError()
    {
        Interlocked.Increment(ref _scriptErrorCount);
    }

    /// <summary>
    /// Increment the alarm evaluation error counter.
    /// </summary>
    public void IncrementAlarmError()
    {
        Interlocked.Increment(ref _alarmErrorCount);
    }

    /// <summary>
    /// Increment the dead letter counter for this reporting interval.
    /// </summary>
    public void IncrementDeadLetter()
    {
        Interlocked.Increment(ref _deadLetterCount);
    }

    /// <summary>
    /// Update the health status for a named data connection.
    /// Called by DCL when connection state changes.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="health">The status to record, replacing any previous value.</param>
    public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
    {
        _connectionStatuses[connectionName] = health;
    }

    /// <summary>
    /// Remove a connection from tracking (e.g., on connection disposal).
    /// Drops its health status, tag resolution counts, endpoint, and tag quality counts.
    /// Removing a name that is not tracked is a no-op.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    public void RemoveConnection(string connectionName)
    {
        _connectionStatuses.TryRemove(connectionName, out _);
        _tagResolutionCounts.TryRemove(connectionName, out _);
        _connectionEndpoints.TryRemove(connectionName, out _);
        _tagQualityCounts.TryRemove(connectionName, out _);
    }

    /// <summary>
    /// Update tag resolution counts for a named data connection.
    /// Called by DCL after tag resolution attempts.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="totalSubscribed">Number of tags subscribed on the connection.</param>
    /// <param name="successfullyResolved">Number of those tags that resolved successfully.</param>
    public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved)
    {
        _tagResolutionCounts[connectionName] = new TagResolutionStatus(totalSubscribed, successfullyResolved);
    }

    /// <summary>
    /// Record the endpoint string reported for a named data connection,
    /// replacing any previous value.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="endpoint">Endpoint description to include in the next report.</param>
    public void UpdateConnectionEndpoint(string connectionName, string endpoint)
    {
        _connectionEndpoints[connectionName] = endpoint;
    }

    /// <summary>
    /// Record the good/bad/uncertain tag quality counts for a named data connection,
    /// replacing any previous value.
    /// </summary>
    /// <param name="connectionName">Key identifying the data connection.</param>
    /// <param name="good">Number of tags with good quality.</param>
    /// <param name="bad">Number of tags with bad quality.</param>
    /// <param name="uncertain">Number of tags with uncertain quality.</param>
    public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain)
    {
        _tagQualityCounts[connectionName] = new TagQualityCounts(good, bad, uncertain);
    }

    /// <summary>
    /// Set the current parked message count. The value is reported as-is and is
    /// not reset when a report is collected.
    /// </summary>
    /// <param name="count">The parked message count to report.</param>
    public void SetParkedMessageCount(int count)
    {
        Interlocked.Exchange(ref _parkedMessageCount, count);
    }

    /// <summary>
    /// Set the hostname reported for this node. A null value is stored as an empty
    /// string so the report never carries a null hostname.
    /// </summary>
    /// <param name="hostname">Hostname to include in subsequent reports.</param>
    public void SetNodeHostname(string hostname) => _nodeHostname = hostname ?? "";

    /// <summary>
    /// Set the current store-and-forward buffer depths snapshot.
    /// Called before report collection with data from the store-and-forward service.
    /// </summary>
    /// <remarks>
    /// The supplied dictionary is copied, so the caller may keep mutating its own instance
    /// without affecting a concurrent <see cref="CollectReport"/>. A null argument is treated
    /// as an empty snapshot rather than being stored and failing later during collection.
    /// </remarks>
    /// <param name="depths">Buffer depth per key at the time of the call.</param>
    public void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths)
    {
        _sfBufferDepths = depths is null
            ? new Dictionary<string, int>()
            : new Dictionary<string, int>(depths);
    }

    /// <summary>
    /// Set the current instance counts.
    /// Called by the Deployment Manager after instance state changes.
    /// </summary>
    /// <remarks>
    /// The three values are published together, so a report always contains counts
    /// that came from the same call.
    /// </remarks>
    /// <param name="deployed">Number of deployed instances.</param>
    /// <param name="enabled">Number of enabled instances.</param>
    /// <param name="disabled">Number of disabled instances.</param>
    public void SetInstanceCounts(int deployed, int enabled, int disabled)
    {
        _instanceCounts = new InstanceCounts(deployed, enabled, disabled);
    }

    /// <summary>
    /// Set whether this node is currently the active node. Determines the
    /// <c>NodeRole</c> value ("Active" or "Standby") in subsequent reports.
    /// </summary>
    /// <param name="isActive"><c>true</c> if this node is active; otherwise <c>false</c>.</param>
    public void SetActiveNode(bool isActive) => _isActiveNode = isActive;

    /// <summary>
    /// Gets the value most recently passed to <see cref="SetActiveNode"/>
    /// (<c>false</c> until it has been called).
    /// </summary>
    public bool IsActiveNode => _isActiveNode;

    /// <summary>
    /// Collect the current health report for the site and reset interval counters.
    /// Connection statuses, tag resolution counts, endpoints, tag quality, buffer depths,
    /// instance counts, and the parked message count are NOT reset (they reflect current state).
    /// Script errors, alarm errors, and dead letters ARE reset (they are per-interval counts).
    /// </summary>
    /// <param name="siteId">Site identifier to stamp on the report.</param>
    /// <returns>
    /// A report built from copies of the collector's state. Its sequence number is 0;
    /// the caller is expected to assign the real one.
    /// </returns>
    public SiteHealthReport CollectReport(string siteId)
    {
        // Atomically read and reset the per-interval counters.
        var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0);
        var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0);
        var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0);

        // Copy per-connection state so the report is detached from later updates.
        var connectionStatuses = new Dictionary<string, ConnectionHealth>(_connectionStatuses);
        var tagResolution = new Dictionary<string, TagResolutionStatus>(_tagResolutionCounts);
        var connectionEndpoints = new Dictionary<string, string>(_connectionEndpoints);
        var tagQuality = new Dictionary<string, TagQualityCounts>(_tagQualityCounts);

        // The stored depths are already a private copy; copy again so each report
        // owns its own dictionary instance.
        var sfBufferDepths = new Dictionary<string, int>(_sfBufferDepths);

        // Single volatile read: all three counts come from the same SetInstanceCounts call.
        var instanceCounts = _instanceCounts;

        // Determine node role from active/standby state.
        var nodeRole = _isActiveNode ? "Active" : "Standby";

        return new SiteHealthReport(
            SiteId: siteId,
            SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
            ReportTimestamp: DateTimeOffset.UtcNow,
            DataConnectionStatuses: connectionStatuses,
            TagResolutionCounts: tagResolution,
            ScriptErrorCount: scriptErrors,
            AlarmEvaluationErrorCount: alarmErrors,
            StoreAndForwardBufferDepths: sfBufferDepths,
            DeadLetterCount: deadLetters,
            DeployedInstanceCount: instanceCounts.Deployed,
            EnabledInstanceCount: instanceCounts.Enabled,
            DisabledInstanceCount: instanceCounts.Disabled,
            NodeRole: nodeRole,
            NodeHostname: _nodeHostname,
            DataConnectionEndpoints: connectionEndpoints,
            DataConnectionTagQuality: tagQuality,
            ParkedMessageCount: Volatile.Read(ref _parkedMessageCount));
    }

    /// <summary>
    /// Immutable holder for the deployed/enabled/disabled instance counts, swapped as a
    /// single reference so readers always observe a consistent triple.
    /// </summary>
    private sealed class InstanceCounts
    {
        public InstanceCounts(int deployed, int enabled, int disabled)
        {
            Deployed = deployed;
            Enabled = enabled;
            Disabled = disabled;
        }

        public int Deployed { get; }

        public int Enabled { get; }

        public int Disabled { get; }
    }
}
|