refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,263 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
|
||||
/// Central-side aggregator that receives health reports from all sites,
|
||||
/// tracks latest metrics in memory, and detects offline sites.
|
||||
/// No persistence — display-only for Central UI consumption.
|
||||
/// </summary>
|
||||
public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregator
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, SiteHealthState> _siteStates = new();
|
||||
private readonly HealthMonitoringOptions _options;
|
||||
private readonly ILogger<CentralHealthAggregator> _logger;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
/// Creates the aggregator from its configuration, logger, and clock.
/// </summary>
/// <param name="options">Options wrapper; its <c>Value</c> is read once here and kept for the aggregator's lifetime.</param>
/// <param name="logger">Receives the aggregator's diagnostic messages.</param>
/// <param name="timeProvider">Clock used for every timestamp the aggregator takes; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
public CentralHealthAggregator(
    IOptions<HealthMonitoringOptions> options,
    ILogger<CentralHealthAggregator> logger,
    TimeProvider? timeProvider = null)
{
    // Resolve the optional clock first so the field assignments below read uniformly.
    var clock = timeProvider is null ? TimeProvider.System : timeProvider;

    _options = options.Value;
    _logger = logger;
    _timeProvider = clock;
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Lock-free: the per-site state is replaced as a whole via
/// <c>TryAdd</c> / <c>TryUpdate</c>, and the loop retries whenever another
/// thread changed the entry in between. A report whose sequence number is not
/// strictly greater than the stored one is dropped without touching the state.
/// </remarks>
public void ProcessReport(SiteHealthReport report)
{
    // One timestamp for the whole call, taken before any retry.
    var receivedAt = _timeProvider.GetUtcNow();

    for (;;)
    {
        if (_siteStates.TryGetValue(report.SiteId, out var current))
        {
            // Known site: only accept strictly newer sequence numbers.
            if (report.SequenceNumber <= current.LastSequenceNumber)
            {
                _logger.LogDebug(
                    "Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
                    report.SiteId, report.SequenceNumber, current.LastSequenceNumber);
                return;
            }

            var refreshed = current with
            {
                LatestReport = report,
                LastReportReceivedAt = receivedAt,
                LastHeartbeatAt = receivedAt,
                LastSequenceNumber = report.SequenceNumber,
                IsOnline = true
            };

            if (!_siteStates.TryUpdate(report.SiteId, refreshed, current))
            {
                // The entry changed underneath us; re-read and try again.
                continue;
            }

            if (!current.IsOnline)
            {
                _logger.LogInformation(
                    "Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
            }

            return;
        }

        // First signal ever seen for this site: register it as online.
        var firstSeen = new SiteHealthState
        {
            SiteId = report.SiteId,
            LatestReport = report,
            LastReportReceivedAt = receivedAt,
            LastHeartbeatAt = receivedAt,
            LastSequenceNumber = report.SequenceNumber,
            IsOnline = true
        };

        if (_siteStates.TryAdd(report.SiteId, firstSeen))
        {
            _logger.LogInformation(
                "Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
            return;
        }

        // Another thread registered the site first; loop and handle it as an update.
    }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Lock-free, like <see cref="ProcessReport"/>: the state is swapped as a whole
/// and the loop retries when the swap loses to a concurrent writer.
/// The stored heartbeat never moves backwards for an online site. For an
/// offline site the new heartbeat is at least the current time, so the
/// offline-to-online transition is not undone by the next offline check
/// (HealthMonitoring-020).
/// </remarks>
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
{
    for (;;)
    {
        if (_siteStates.TryGetValue(siteId, out var current))
        {
            var wasOffline = !current.IsOnline;

            // Lower bound for the stored heartbeat:
            //  - offline site: the current time, so a late or skewed receivedAt
            //    cannot bring the site back with an already-expired heartbeat;
            //  - online site: the value already stored, so it never regresses.
            var floor = wasOffline ? _timeProvider.GetUtcNow() : current.LastHeartbeatAt;
            var heartbeat = receivedAt > floor ? receivedAt : floor;

            // Online and no newer heartbeat: nothing to write.
            if (!wasOffline && heartbeat == current.LastHeartbeatAt)
            {
                return;
            }

            var refreshed = current with
            {
                LastHeartbeatAt = heartbeat,
                IsOnline = true
            };

            if (!_siteStates.TryUpdate(siteId, refreshed, current))
            {
                // Lost the swap; re-read the entry and start over.
                continue;
            }

            if (wasOffline)
            {
                _logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
            }

            return;
        }

        // Unknown site: register it as online with no report yet. LatestReport
        // and LastReportReceivedAt stay null until ProcessReport runs, so
        // "no report yet" is an explicit nullable state rather than a sentinel date.
        var placeholder = new SiteHealthState
        {
            SiteId = siteId,
            LatestReport = null,
            LastReportReceivedAt = null,
            LastHeartbeatAt = receivedAt,
            LastSequenceNumber = 0,
            IsOnline = true
        };

        if (_siteStates.TryAdd(siteId, placeholder))
        {
            _logger.LogInformation(
                "Site {SiteId} registered online via heartbeat (awaiting first report)", siteId);
            return;
        }

        // Someone else registered the site first; loop and treat this as an update.
    }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>Returns a fresh copy; later changes to the aggregator are not reflected in it.</remarks>
public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
    => new Dictionary<string, SiteHealthState>(_siteStates);
|
||||
|
||||
/// <inheritdoc />
public SiteHealthState? GetSiteState(string siteId)
    => _siteStates.TryGetValue(siteId, out var found) ? found : null;
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Runs <see cref="CheckForOfflineSites"/> on a periodic timer whose period comes
/// from <see cref="ComputeCheckInterval"/>, until <paramref name="stoppingToken"/>
/// is cancelled.
/// </remarks>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    _logger.LogInformation(
        "Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
        _options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);

    // Poll at half the shorter offline timeout so the tighter window is still
    // checked in good time (see ComputeCheckInterval).
    var period = ComputeCheckInterval(_options);
    using var ticker = new PeriodicTimer(period);

    while (true)
    {
        var ticked = await ticker.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false);
        if (!ticked)
        {
            break;
        }

        CheckForOfflineSites();
    }
}
|
||||
|
||||
/// <summary>
/// Returns the period of the offline-check timer: half of whichever is shorter,
/// <see cref="HealthMonitoringOptions.OfflineTimeout"/> or
/// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/>. Using the shorter
/// one means the site class with the tighter window is still polled at least
/// twice inside that window, even when <c>CentralOfflineTimeout</c> is configured
/// below <c>OfflineTimeout</c>.
/// </summary>
/// <param name="options">Options supplying the two timeouts.</param>
/// <returns>Half of the shorter timeout.</returns>
internal static TimeSpan ComputeCheckInterval(HealthMonitoringOptions options)
{
    // Start from the central timeout and switch only when the site timeout is strictly shorter.
    var tighter = options.CentralOfflineTimeout;
    if (options.OfflineTimeout < options.CentralOfflineTimeout)
    {
        tighter = options.OfflineTimeout;
    }

    var halfMilliseconds = tighter.TotalMilliseconds / 2;
    return TimeSpan.FromMilliseconds(halfMilliseconds);
}
|
||||
|
||||
/// <summary>
/// Walks every tracked site and flips those whose last heartbeat is older than
/// their offline timeout from online to offline.
/// </summary>
internal void CheckForOfflineSites()
{
    var checkedAt = _timeProvider.GetUtcNow();

    foreach (var (siteId, snapshot) in _siteStates)
    {
        // Already offline: nothing to do.
        if (!snapshot.IsOnline)
        {
            continue;
        }

        // Real sites are judged on LastHeartbeatAt against OfflineTimeout. Per the
        // original notes, heartbeats arrive every ~5s from any healthy site node
        // (CommunicationOptions.TransportHeartbeatInterval), so the 60s timeout
        // tolerates several missed heartbeats and single-node failovers.
        //
        // The synthetic central entry has no heartbeat source and is fed only by
        // CentralHealthReportLoop's self-report, so it gets its own timeout
        // (CentralOfflineTimeout) to survive a skipped or late self-report.
        TimeSpan allowedSilence;
        if (siteId == CentralHealthReportLoop.CentralSiteId)
        {
            allowedSilence = _options.CentralOfflineTimeout;
        }
        else
        {
            allowedSilence = _options.OfflineTimeout;
        }

        var silence = checkedAt - snapshot.LastHeartbeatAt;

        // Swap in an offline copy only if the entry is still the one we examined.
        // If the swap loses, a report or heartbeat just arrived, so the site
        // correctly stays online and no retry is needed.
        if (silence > allowedSilence
            && _siteStates.TryUpdate(siteId, snapshot with { IsOnline = false }, snapshot))
        {
            _logger.LogWarning(
                "Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
                snapshot.SiteId, silence.TotalSeconds, allowedSilence.TotalSeconds);
        }
    }
}
|
||||
}
|
||||
@@ -0,0 +1,144 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
|
||||
/// Central-side counterpart to <see cref="HealthReportSender"/>.
|
||||
/// Periodically builds a SiteHealthReport for the central cluster itself
|
||||
/// (siteId = <see cref="CentralSiteId"/>) and feeds it into the local
|
||||
/// CentralHealthAggregator so the UI can render central as another card
|
||||
/// on /monitoring/health. Only the cluster leader (Primary) generates
|
||||
/// reports — the standby's aggregator catches up on failover when it
|
||||
/// becomes Primary and starts its own loop.
|
||||
/// </summary>
|
||||
public class CentralHealthReportLoop : BackgroundService
|
||||
{
|
||||
/// <summary>
|
||||
/// Reserved siteId used to represent the central cluster in the
|
||||
/// shared CentralHealthAggregator keyspace.
|
||||
///
|
||||
/// HealthMonitoring-021: the value is prefixed with <c>$</c> — a character
|
||||
/// that is forbidden in real site identifiers (the configuration /
|
||||
/// repository layer only permits Sites whose <c>SiteIdentifier</c> is a
|
||||
/// plain identifier) — so the synthetic central entry cannot collide with
|
||||
/// a real site whose operator-set identifier happened to be the bare word
|
||||
/// "central". A collision would have caused the two reports to clobber
|
||||
/// each other in the aggregator keyspace via the sequence-number guard,
|
||||
/// and the real site would inherit the longer
|
||||
/// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> grace and
|
||||
/// stay falsely-online for an extra two minutes after going down.
|
||||
/// Consumers (<see cref="CentralHealthAggregator.CheckForOfflineSites"/>,
|
||||
/// the Central UI health dashboard) reference this constant rather than
|
||||
/// the literal string, so the change is local.
|
||||
/// </summary>
|
||||
public const string CentralSiteId = "$central";
|
||||
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
private readonly ICentralHealthAggregator _aggregator;
|
||||
private readonly IClusterNodeProvider _clusterNodeProvider;
|
||||
private readonly HealthMonitoringOptions _options;
|
||||
private readonly ILogger<CentralHealthReportLoop> _logger;
|
||||
|
||||
// Seeded with Unix-ms so reports from a newly-elected central leader
|
||||
// always sort after reports from any prior leader for the central site (CentralSiteId).
|
||||
// The clock is read through the injected TimeProvider so the seeding is
|
||||
// deterministically testable.
|
||||
private long _sequenceNumber;
|
||||
|
||||
/// <summary>
/// Creates the central report loop and seeds its sequence counter from the clock.
/// </summary>
/// <param name="collector">Collector that produces the central node's health report.</param>
/// <param name="aggregator">Aggregator the finished report is handed to.</param>
/// <param name="clusterNodeProvider">Tells the loop whether this node is primary and lists the cluster nodes.</param>
/// <param name="options">Options wrapper; its <c>Value</c> (including the report interval) is read once here.</param>
/// <param name="logger">Receives the loop's diagnostic messages.</param>
/// <param name="timeProvider">Clock used to seed the sequence number; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
public CentralHealthReportLoop(
    ISiteHealthCollector collector,
    ICentralHealthAggregator aggregator,
    IClusterNodeProvider clusterNodeProvider,
    IOptions<HealthMonitoringOptions> options,
    ILogger<CentralHealthReportLoop> logger,
    TimeProvider? timeProvider = null)
{
    _collector = collector;
    _aggregator = aggregator;
    _clusterNodeProvider = clusterNodeProvider;
    _options = options.Value;
    _logger = logger;

    // Seed with the current Unix time in milliseconds; the first report uses seed + 1.
    var clock = timeProvider ?? TimeProvider.System;
    var startedAt = clock.GetUtcNow();
    _sequenceNumber = startedAt.ToUnixTimeMilliseconds();
}
|
||||
|
||||
/// <summary>
/// The sequence counter's current value, read atomically. Exposed for tests.
/// </summary>
public long CurrentSequenceNumber
{
    get { return Interlocked.Read(ref _sequenceNumber); }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Ticks every <see cref="HealthMonitoringOptions.ReportInterval"/> until
/// <paramref name="stoppingToken"/> is cancelled. A failure in one cycle is
/// logged and does not stop the loop.
/// </remarks>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    _logger.LogInformation(
        "Central health report loop starting, interval {Interval}s",
        _options.ReportInterval.TotalSeconds);

    using var ticker = new PeriodicTimer(_options.ReportInterval);

    while (await ticker.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
    {
        try
        {
            RunReportCycle();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to generate central health report");
        }
    }
}

/// <summary>
/// One tick of the loop: publishes the primary flag to the collector and, on the
/// primary only, collects a report, stamps it with the next sequence number, and
/// hands it to the aggregator.
/// </summary>
private void RunReportCycle()
{
    var selfIsPrimary = _clusterNodeProvider.SelfIsPrimary;
    _collector.SetActiveNode(selfIsPrimary);

    // Only the primary produces the central report.
    if (!selfIsPrimary)
    {
        return;
    }

    _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());

    var sequence = Interlocked.Increment(ref _sequenceNumber);

    // HealthMonitoring-018: per the original notes, CollectReport reads and resets
    // the per-interval error counters. The un-stamped report is kept so that, if
    // publishing fails, the same counts can be added back to the collector and
    // roll forward into the next report instead of being lost. Increments that
    // arrive while ProcessReport runs stay on the counters and simply sum with
    // the restored values.
    var collected = _collector.CollectReport(CentralSiteId);
    var stamped = collected with { SequenceNumber = sequence };

    try
    {
        _aggregator.ProcessReport(stamped);
    }
    catch
    {
        _collector.AddIntervalCounters(
            scriptErrors: collected.ScriptErrorCount,
            alarmErrors: collected.AlarmEvaluationErrorCount,
            deadLetters: collected.DeadLetterCount,
            siteAuditWriteFailures: collected.SiteAuditWriteFailures,
            auditRedactionFailures: collected.AuditRedactionFailure);
        throw;
    }

    _logger.LogDebug("Generated central health report #{Seq}", sequence);
}
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Timing settings for health monitoring: how often health reports are produced
/// and how long a site may stay silent before it is classified as offline.
/// </summary>
public class HealthMonitoringOptions
{
    /// <summary>
    /// Interval at which health reports are produced. Used as the timer period by
    /// both <see cref="HealthReportSender"/> (site reports) and
    /// <see cref="CentralHealthReportLoop"/> (central self-report). Default: 30 seconds.
    /// </summary>
    public TimeSpan ReportInterval { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Duration of silence after which a site is classified as offline. Default: 1 minute.
    /// </summary>
    public TimeSpan OfflineTimeout { get; set; } = TimeSpan.FromMinutes(1);

    /// <summary>
    /// Offline timeout applied to the synthetic "central" site only. Real sites
    /// emit frequent heartbeats that keep <c>LastHeartbeatAt</c> fresh, so the
    /// normal <see cref="OfflineTimeout"/> only fires on genuine total loss. The
    /// "central" self-report has no heartbeat source — its only signal is the
    /// <see cref="CentralHealthReportLoop"/>, which runs once per
    /// <see cref="ReportInterval"/> — so a single skipped or late self-report
    /// (leader GC pause, brief stall, mid-failover before the new leader's loop
    /// spins up) would flap it offline under the site timeout. A longer central
    /// grace window absorbs such missed self-reports.
    /// Default: 3 minutes, i.e. six default report intervals, or three times the
    /// default <see cref="OfflineTimeout"/>.
    /// </summary>
    public TimeSpan CentralOfflineTimeout { get; set; } = TimeSpan.FromMinutes(3);
}
|
||||
@@ -0,0 +1,64 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// HealthMonitoring-014: validates <see cref="HealthMonitoringOptions"/> at
/// startup. The interval values are fed straight into <c>new PeriodicTimer(...)</c>
/// (and into a division for the offline-check cadence); a value the timer does not
/// accept makes <see cref="PeriodicTimer"/>'s constructor throw
/// <see cref="ArgumentOutOfRangeException"/>, crashing the
/// <see cref="HealthReportSender"/> / <see cref="CentralHealthReportLoop"/> /
/// <see cref="CentralHealthAggregator"/> hosted service with an opaque exception
/// that does not name the offending config key. Registered with
/// <c>ValidateOnStart()</c> so a bad <c>ScadaBridge:HealthMonitoring</c> section
/// fails fast at boot with a clear, key-naming message.
/// </summary>
/// <remarks>
/// Besides zero/negative values, <see cref="PeriodicTimer"/> also rejects periods
/// below one millisecond and above <c>uint.MaxValue - 1</c> milliseconds. Those are
/// checked here too, both for <see cref="HealthMonitoringOptions.ReportInterval"/>
/// and for the derived offline-check interval.
/// </remarks>
public sealed class HealthMonitoringOptionsValidator : IValidateOptions<HealthMonitoringOptions>
{
    /// <summary>Largest period, in whole milliseconds, that <see cref="PeriodicTimer"/> accepts.</summary>
    private const long MaxTimerPeriodMilliseconds = uint.MaxValue - 1L;

    /// <summary>
    /// Validates the health monitoring options. Fails when an interval is
    /// non-positive, when a value used as a timer period is outside the range
    /// <see cref="PeriodicTimer"/> supports, or when
    /// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> is shorter than
    /// <see cref="HealthMonitoringOptions.OfflineTimeout"/>.
    /// </summary>
    /// <param name="name">Named options instance name (unused).</param>
    /// <param name="options">The health monitoring options to validate.</param>
    /// <returns>
    /// <see cref="ValidateOptionsResult.Success"/> when every check passes; otherwise
    /// a failure result carrying one message per violated rule.
    /// </returns>
    public ValidateOptionsResult Validate(string? name, HealthMonitoringOptions options)
    {
        var failures = new List<string>();

        if (options.ReportInterval <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaBridge:HealthMonitoring:ReportInterval must be a positive duration " +
                $"(was {options.ReportInterval}); it is used directly as a PeriodicTimer period.");
        }
        else if (!IsSupportedTimerPeriod(options.ReportInterval))
        {
            // Positive, but still a value PeriodicTimer's constructor would reject.
            failures.Add(
                $"ScadaBridge:HealthMonitoring:ReportInterval ({options.ReportInterval}) must be between " +
                $"1 ms and {MaxTimerPeriodMilliseconds} ms; it is used directly as a PeriodicTimer period.");
        }

        if (options.OfflineTimeout <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaBridge:HealthMonitoring:OfflineTimeout must be a positive duration " +
                $"(was {options.OfflineTimeout}); it drives the offline-check PeriodicTimer cadence.");
        }

        if (options.CentralOfflineTimeout <= TimeSpan.Zero)
        {
            failures.Add(
                $"ScadaBridge:HealthMonitoring:CentralOfflineTimeout must be a positive duration " +
                $"(was {options.CentralOfflineTimeout}).");
        }

        // The remaining checks only make sense once both timeouts are positive;
        // otherwise the messages above already name the problem.
        if (options.OfflineTimeout > TimeSpan.Zero && options.CentralOfflineTimeout > TimeSpan.Zero)
        {
            if (options.CentralOfflineTimeout < options.OfflineTimeout)
            {
                failures.Add(
                    $"ScadaBridge:HealthMonitoring:CentralOfflineTimeout ({options.CentralOfflineTimeout}) " +
                    $"must be >= OfflineTimeout ({options.OfflineTimeout}): the synthetic 'central' site has " +
                    "no heartbeat source and is fed only by the slower self-report loop, so it needs at " +
                    "least as much offline grace as a real site.");
            }

            // Validate the exact value the aggregator will hand to PeriodicTimer,
            // using the same derivation it uses.
            var checkInterval = CentralHealthAggregator.ComputeCheckInterval(options);
            if (!IsSupportedTimerPeriod(checkInterval))
            {
                failures.Add(
                    $"ScadaBridge:HealthMonitoring:OfflineTimeout ({options.OfflineTimeout}) and " +
                    $"CentralOfflineTimeout ({options.CentralOfflineTimeout}) yield an offline-check " +
                    $"interval of {checkInterval}, which must be between 1 ms and " +
                    $"{MaxTimerPeriodMilliseconds} ms to be usable as a PeriodicTimer period.");
            }
        }

        return failures.Count > 0
            ? ValidateOptionsResult.Fail(failures)
            : ValidateOptionsResult.Success;
    }

    /// <summary>
    /// True when <paramref name="period"/>, truncated to whole milliseconds, lies in
    /// the range <see cref="PeriodicTimer"/> accepts (1 to <c>uint.MaxValue - 1</c>).
    /// </summary>
    private static bool IsSupportedTimerPeriod(TimeSpan period)
    {
        var wholeMilliseconds = (long)period.TotalMilliseconds;
        return wholeMilliseconds >= 1 && wholeMilliseconds <= MaxTimerPeriodMilliseconds;
    }
}
|
||||
@@ -0,0 +1,187 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
|
||||
/// Periodically collects a SiteHealthReport and sends it to central via Akka remoting.
|
||||
/// Sequence numbers are monotonic and reset on service restart. They are <b>not</b>
|
||||
/// zero/one-based: the per-process counter is seeded with the current Unix epoch
|
||||
/// (milliseconds) at construction so that, after a failover, reports from a
|
||||
/// freshly-active node always sort after reports from any prior active node for the
|
||||
/// same site — otherwise the central aggregator's sequence-number guard would
|
||||
/// silently reject the new active's first reports as stale.
|
||||
/// </summary>
|
||||
public class HealthReportSender : BackgroundService
|
||||
{
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
private readonly IHealthReportTransport _transport;
|
||||
private readonly HealthMonitoringOptions _options;
|
||||
private readonly ILogger<HealthReportSender> _logger;
|
||||
private readonly string _siteId;
|
||||
private readonly StoreAndForwardStorage? _sfStorage;
|
||||
private readonly IClusterNodeProvider? _clusterNodeProvider;
|
||||
|
||||
// Seeded with Unix-ms at construction so reports from a freshly-active
|
||||
// node always sort after reports from any prior active node for the same
|
||||
// site. Without this seeding, failover would silently drop the new
|
||||
// active's first reports because their per-process counter starts below
|
||||
// the prior active's last sequence number. The clock is read through the
|
||||
// injected TimeProvider so the seeding is deterministically testable.
|
||||
private long _sequenceNumber;
|
||||
|
||||
/// <summary>
/// Creates the sender and seeds its sequence counter with the current Unix time in milliseconds.
/// </summary>
/// <param name="collector">Collector that produces the report payload.</param>
/// <param name="transport">Transport the finished report is sent through.</param>
/// <param name="options">Options wrapper; its <c>Value</c> (including the report interval) is read once here.</param>
/// <param name="logger">Receives the sender's diagnostic messages.</param>
/// <param name="siteIdentityProvider">Supplies the site identifier; its <c>SiteId</c> is read once here.</param>
/// <param name="sfStorage">Optional store-and-forward storage queried for parked-message and buffer-depth metrics.</param>
/// <param name="clusterNodeProvider">Optional provider used to refresh the cluster node list before each report.</param>
/// <param name="timeProvider">Clock used to seed the sequence number; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
public HealthReportSender(
    ISiteHealthCollector collector,
    IHealthReportTransport transport,
    IOptions<HealthMonitoringOptions> options,
    ILogger<HealthReportSender> logger,
    ISiteIdentityProvider siteIdentityProvider,
    StoreAndForwardStorage? sfStorage = null,
    IClusterNodeProvider? clusterNodeProvider = null,
    TimeProvider? timeProvider = null)
{
    _collector = collector;
    _transport = transport;
    _options = options.Value;
    _logger = logger;
    _siteId = siteIdentityProvider.SiteId;
    _sfStorage = sfStorage;
    _clusterNodeProvider = clusterNodeProvider;

    // Seed with the current Unix time in milliseconds; the first report uses seed + 1.
    var clock = timeProvider ?? TimeProvider.System;
    var startedAt = clock.GetUtcNow();
    _sequenceNumber = startedAt.ToUnixTimeMilliseconds();
}
|
||||
|
||||
/// <summary>
/// The sequence counter's current value, read atomically. Exposed for tests.
/// </summary>
public long CurrentSequenceNumber
{
    get { return Interlocked.Read(ref _sequenceNumber); }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Ticks every <see cref="HealthMonitoringOptions.ReportInterval"/> until
/// <paramref name="stoppingToken"/> is cancelled. On each tick the active node
/// refreshes the collector's cluster and store-and-forward metrics, then collects
/// and sends one report. A failure in one cycle is logged and does not stop the loop.
/// </remarks>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    _logger.LogInformation(
        "Health report sender starting for site {SiteId}, interval {Interval}s",
        _siteId, _options.ReportInterval.TotalSeconds);

    using var timer = new PeriodicTimer(_options.ReportInterval);

    while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
    {
        try
        {
            // Only the active node (running the DeploymentManager singleton) sends health reports.
            // The standby node has no instance/connection data and would overwrite the active's report.
            if (!_collector.IsActiveNode)
            {
                continue;
            }

            RefreshClusterNodes();
            await RefreshStoreAndForwardMetricsAsync().ConfigureAwait(false);
            SendReport();
        }
        catch (Exception ex)
        {
            // Keep the loop alive; a single failed cycle must not stop reporting.
            _logger.LogError(ex, "Failed to send health report for site {SiteId}", _siteId);
        }
    }
}

/// <summary>
/// Pushes the current cluster node list into the collector when a provider is
/// configured. Failures are logged and swallowed so the report still ships.
/// </summary>
private void RefreshClusterNodes()
{
    if (_clusterNodeProvider == null)
    {
        return;
    }

    try
    {
        _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());
    }
    catch (Exception ex)
    {
        // Non-fatal: the collector is simply not updated this cycle.
        _logger.LogWarning(ex,
            "Failed to refresh cluster nodes for health report (site {SiteId}); using stale list",
            _siteId);
    }
}

/// <summary>
/// Pushes the parked-message count and the per-category buffer depths into the
/// collector when store-and-forward storage is configured. The two queries fail
/// independently; each failure is logged and swallowed.
/// </summary>
private async Task RefreshStoreAndForwardMetricsAsync()
{
    if (_sfStorage == null)
    {
        return;
    }

    try
    {
        var parkedCount = await _sfStorage.GetParkedMessageCountAsync().ConfigureAwait(false);
        _collector.SetParkedMessageCount(parkedCount);
    }
    catch (Exception ex)
    {
        // Non-fatal: the parked count is not updated this cycle.
        _logger.LogWarning(ex,
            "Failed to query parked message count for health report (site {SiteId})",
            _siteId);
    }

    try
    {
        // Per-category pending-message buffer depths, keyed by the category's
        // name so the central dashboard can render each depth separately.
        var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync().ConfigureAwait(false);
        var depths = depthsByCategory.ToDictionary(
            kvp => kvp.Key.ToString(),
            kvp => kvp.Value);
        _collector.SetStoreAndForwardDepths(depths);
    }
    catch (Exception ex)
    {
        // Non-fatal: the buffer depths are not updated this cycle.
        _logger.LogWarning(ex,
            "Failed to query store-and-forward buffer depths for health report (site {SiteId})",
            _siteId);
    }
}

/// <summary>
/// Collects one report, stamps it with the next sequence number, and sends it.
/// If sending throws, the report's per-interval counters are added back to the
/// collector before the exception is rethrown.
/// </summary>
private void SendReport()
{
    var seq = Interlocked.Increment(ref _sequenceNumber);

    // HealthMonitoring-017: per the original notes, CollectReport reads and resets
    // the per-interval error counters. The un-stamped report is kept so that, if
    // the send fails, the same counts can be added back to the collector and roll
    // forward into the next successful report instead of being lost. Increments
    // that arrive during the send stay on the counters and simply sum with the
    // restored values.
    var report = _collector.CollectReport(_siteId);

    // Replace the placeholder sequence number with our monotonic one.
    var reportWithSeq = report with { SequenceNumber = seq };

    try
    {
        _transport.Send(reportWithSeq);
    }
    catch
    {
        _collector.AddIntervalCounters(
            scriptErrors: report.ScriptErrorCount,
            alarmErrors: report.AlarmEvaluationErrorCount,
            deadLetters: report.DeadLetterCount,
            siteAuditWriteFailures: report.SiteAuditWriteFailures,
            auditRedactionFailures: report.AuditRedactionFailure);
        throw;
    }

    _logger.LogInformation("Sent health report #{Seq} for site {SiteId}", seq, _siteId);
}
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Central-side health aggregation contract. The Central UI reads it to render
/// the site health dashboards.
/// </summary>
public interface ICentralHealthAggregator
{
    /// <summary>
    /// Feeds one health report from a site into the aggregated state.
    /// </summary>
    /// <param name="report">The report received from the site.</param>
    void ProcessReport(SiteHealthReport report);

    /// <summary>
    /// Records that a heartbeat was received from a site, advancing its last-seen
    /// timestamp so the site stays online between full reports and a transiently
    /// delayed report does not trip the offline threshold.
    /// <para>
    /// The heartbeat cadence is owned by the Cluster Infrastructure /
    /// <c>SiteCommunicationActor</c> (the application-level heartbeat to central,
    /// sent every <c>CommunicationOptions.TransportHeartbeatInterval</c> — 5s by
    /// default), so the 60s <see cref="HealthMonitoringOptions.OfflineTimeout"/>
    /// tolerates several missed heartbeats.
    /// </para>
    /// <para>
    /// A heartbeat for a site the aggregator has no state for yet (for example
    /// right after a central restart or failover) registers that site as online
    /// with no <see cref="SiteHealthState.LatestReport"/>, so reachable sites are
    /// not shown as "unknown" during the failover window.
    /// </para>
    /// </summary>
    /// <param name="siteId">Identifier of the site that sent the heartbeat.</param>
    /// <param name="receivedAt">UTC time at which the heartbeat was received.</param>
    void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);

    /// <summary>Gets a snapshot of the health state of every site currently tracked.</summary>
    /// <returns>The tracked states, keyed by site identifier.</returns>
    IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates();

    /// <summary>Gets the current health state of one site.</summary>
    /// <param name="siteId">Identifier of the site to look up.</param>
    /// <returns>The site's state, or <c>null</c> when the site is not tracked.</returns>
    SiteHealthState? GetSiteState(string siteId);
}
|
||||
@@ -0,0 +1,20 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Source of cluster node status information used when building health reports.
/// The Host project implements it because that is where the Akka.NET actor
/// system is accessible.
/// </summary>
public interface IClusterNodeProvider
{
    /// <summary>Gets the current status of every cluster node within the provider's role scope.</summary>
    IReadOnlyList<NodeStatus> GetClusterNodes();

    /// <summary>
    /// Indicates whether this node is, right now, the cluster leader (Primary)
    /// within the provider's role scope. The central report loop consults it to
    /// pick the node that generates the "central" health report.
    /// </summary>
    bool SelfIsPrimary { get; }
}
|
||||
@@ -0,0 +1,16 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Abstraction over the channel that carries health reports to central.
/// The production implementation uses Akka remoting (Tell, fire-and-forget).
/// </summary>
public interface IHealthReportTransport
{
    /// <summary>
    /// Dispatches a health report to central without waiting for a reply (fire-and-forget).
    /// </summary>
    /// <param name="report">Site health report to dispatch.</param>
    void Send(SiteHealthReport report);
}
|
||||
@@ -0,0 +1,172 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Contract for site-side health metric collection.
/// Site Runtime actors call it to report errors, and DCL calls it to report connection health.
/// </summary>
public interface ISiteHealthCollector
{
    /// <summary>Adds one to the script error count.</summary>
    void IncrementScriptError();

    /// <summary>Adds one to the alarm error count.</summary>
    void IncrementAlarmError();

    /// <summary>Adds one to the dead letter count.</summary>
    void IncrementDeadLetter();

    /// <summary>
    /// Audit Log (#23) Bundle G — adds one to the per-interval count of
    /// <c>FallbackAuditWriter</c> primary failures. Fed by the
    /// <c>IAuditWriteFailureCounter</c> binding that
    /// <c>AddAuditLogHealthMetricsBridge()</c> registers.
    /// </summary>
    void IncrementSiteAuditWriteFailures();

    /// <summary>
    /// Audit Log (#23) M5 Bundle C — adds one to the per-interval count of
    /// payload-filter redactor over-redactions (a throw in the header / body /
    /// SQL parameter stage that was routed to the
    /// <c>&lt;redacted: redactor error&gt;</c> marker). Fed by the
    /// <c>IAuditRedactionFailureCounter</c> binding that
    /// <c>AddAuditLogHealthMetricsBridge()</c> registers.
    /// </summary>
    void IncrementAuditRedactionFailure();

    /// <summary>
    /// Audit Log (#23) M6 Bundle E (T6) — replaces the most recent site-local
    /// audit-queue backlog snapshot (pending count, oldest pending row, on-disk
    /// file bytes) that the next <see cref="CollectReport"/> call will use.
    /// The <c>SiteAuditBacklogReporter</c> hosted service refreshes it
    /// periodically, so every report carries a recent point-in-time view of the
    /// site→central drain health.
    /// </summary>
    /// <param name="snapshot">Audit backlog snapshot to store.</param>
    void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot);

    /// <summary>Records the health status of a data connection.</summary>
    /// <param name="connectionName">Name of the connection.</param>
    /// <param name="health">Health status of the connection.</param>
    void UpdateConnectionHealth(string connectionName, ConnectionHealth health);

    /// <summary>Stops tracking health for a connection.</summary>
    /// <param name="connectionName">Name of the connection.</param>
    void RemoveConnection(string connectionName);

    /// <summary>Records tag resolution metrics for a connection.</summary>
    /// <param name="connectionName">Name of the connection.</param>
    /// <param name="totalSubscribed">Total count of subscribed tags.</param>
    /// <param name="successfullyResolved">Count of tags that resolved successfully.</param>
    void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved);

    /// <summary>Records the endpoint of a connection.</summary>
    /// <param name="connectionName">Name of the connection.</param>
    /// <param name="endpoint">Endpoint of the connection.</param>
    void UpdateConnectionEndpoint(string connectionName, string endpoint);

    /// <summary>Records tag quality metrics for a connection.</summary>
    /// <param name="connectionName">Name of the connection.</param>
    /// <param name="good">Count of tags with good quality.</param>
    /// <param name="bad">Count of tags with bad quality.</param>
    /// <param name="uncertain">Count of tags with uncertain quality.</param>
    void UpdateTagQuality(string connectionName, int good, int bad, int uncertain);

    /// <summary>Stores the store-and-forward buffer depths of all categories.</summary>
    /// <param name="depths">Buffer depth per category name.</param>
    void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths);

    /// <summary>Stores how many instances are in each state.</summary>
    /// <param name="deployed">Count of deployed instances.</param>
    /// <param name="enabled">Count of enabled instances.</param>
    /// <param name="disabled">Count of disabled instances.</param>
    void SetInstanceCounts(int deployed, int enabled, int disabled);

    /// <summary>Stores the number of parked messages.</summary>
    /// <param name="count">Parked message count.</param>
    void SetParkedMessageCount(int count);

    /// <summary>Stores this node's hostname.</summary>
    /// <param name="hostname">Hostname of the node.</param>
    void SetNodeHostname(string hostname);

    /// <summary>Stores the list of cluster nodes.</summary>
    /// <param name="nodes">Statuses of the cluster nodes.</param>
    void SetClusterNodes(IReadOnlyList<Commons.Messages.Health.NodeStatus> nodes);

    /// <summary>Stores whether this node is the cluster's active node.</summary>
    /// <param name="isActive">True when this node is active; otherwise false.</param>
    void SetActiveNode(bool isActive);

    /// <summary>Gets whether this node is the cluster's active node.</summary>
    bool IsActiveNode { get; }

    /// <summary>Builds and returns a health report for a site.</summary>
    /// <param name="siteId">Identifier of the site.</param>
    /// <returns>Health report for the given site.</returns>
    SiteHealthReport CollectReport(string siteId);

    /// <summary>
    /// HealthMonitoring-017: atomically adds the supplied per-interval error
    /// counts back into the collector's accumulators. The report sender calls
    /// it when transport delivery of a freshly-collected report fails, so that
    /// the counts <see cref="CollectReport"/> already drained carry over into
    /// the next report instead of being silently lost. Increments that arrive
    /// between the failed Send and this restore are kept —
    /// <c>Interlocked.Add</c> sums correctly with them. The default interface
    /// implementation does nothing, which lets existing test fakes (the only
    /// implementations other than <see cref="SiteHealthCollector"/>) keep
    /// compiling without per-fake changes; production callers get the real
    /// behaviour through the concrete class.
    /// </summary>
    /// <param name="scriptErrors">Script error count to add back.</param>
    /// <param name="alarmErrors">Alarm evaluation error count to add back.</param>
    /// <param name="deadLetters">Dead letter count to add back.</param>
    /// <param name="siteAuditWriteFailures">Site audit write failure count to add back.</param>
    /// <param name="auditRedactionFailures">Audit redaction failure count to add back.</param>
    void AddIntervalCounters(
        int scriptErrors,
        int alarmErrors,
        int deadLetters,
        int siteAuditWriteFailures,
        int auditRedactionFailures)
    {
        // Intentionally empty: test fakes inherit this no-op, while the real
        // SiteHealthCollector supplies the Interlocked.Add restore.
    }
}
|
||||
@@ -0,0 +1,11 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Supplies the identity of the site this process belongs to.
/// The Host component implements it to expose the configuration-driven site ID.
/// </summary>
public interface ISiteIdentityProvider
{
    /// <summary>Gets the unique identifier of this site node.</summary>
    string SiteId { get; }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Dependency-injection registration helpers for the health monitoring module.
/// Every method is idempotent: calling the same method twice, or combining
/// several of them on one service collection, never adds duplicate descriptors.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Register site-side health monitoring services (metric collection + periodic reporting).
    /// Call this on site nodes only. For central, call AddCentralHealthAggregation() instead.
    /// </summary>
    /// <param name="services">The DI service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is null.</exception>
    public static IServiceCollection AddSiteHealthMonitoring(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        AddOptionsValidation(services);

        // TryAdd: AddHealthMonitoring() registers the same collector, and both
        // methods may be called on one node. A second descriptor would be redundant.
        services.TryAddSingleton<ISiteHealthCollector, SiteHealthCollector>();
        services.AddHostedService<HealthReportSender>();
        return services;
    }

    /// <summary>
    /// Register shared health monitoring services (safe for both central and site).
    /// Does not start the HealthReportSender — call AddSiteHealthMonitoring() on site nodes for that.
    /// </summary>
    /// <param name="services">The DI service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is null.</exception>
    public static IServiceCollection AddHealthMonitoring(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        AddOptionsValidation(services);
        services.TryAddSingleton<ISiteHealthCollector, SiteHealthCollector>();
        return services;
    }

    /// <summary>
    /// Register central-side health aggregation services. Includes the
    /// <see cref="CentralHealthReportLoop"/> that generates a self-report
    /// for the central cluster so it appears on /monitoring/health.
    /// </summary>
    /// <param name="services">The DI service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is null.</exception>
    public static IServiceCollection AddCentralHealthAggregation(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        AddOptionsValidation(services);

        // One aggregator instance backs three registrations: the concrete type,
        // the interface consumed by the UI, and the hosted service that runs it.
        services.TryAddSingleton<CentralHealthAggregator>();
        services.TryAddSingleton<ICentralHealthAggregator>(sp => sp.GetRequiredService<CentralHealthAggregator>());
        services.AddHostedService(sp => sp.GetRequiredService<CentralHealthAggregator>());
        services.AddHostedService<CentralHealthReportLoop>();
        return services;
    }

    /// <summary>
    /// HealthMonitoring-014: register the <see cref="HealthMonitoringOptionsValidator"/>
    /// so a misconfigured <c>ScadaBridge:HealthMonitoring</c> section (zero/negative
    /// intervals, or a <c>CentralOfflineTimeout</c> shorter than
    /// <c>OfflineTimeout</c>) is rejected with a clear, key-naming message when the
    /// hosted services resolve their options at startup — rather than crashing
    /// later inside a <see cref="PeriodicTimer"/> constructor with an opaque
    /// <see cref="ArgumentOutOfRangeException"/>. Idempotent so it is safe when
    /// more than one of the registration methods above is called.
    /// </summary>
    /// <param name="services">The DI service collection to register into.</param>
    private static void AddOptionsValidation(IServiceCollection services)
    {
        services.TryAddEnumerable(
            ServiceDescriptor.Singleton<IValidateOptions<HealthMonitoringOptions>, HealthMonitoringOptionsValidator>());
    }
}
|
||||
@@ -0,0 +1,211 @@
|
||||
using System.Collections.Concurrent;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// Collects health metrics from all site subsystems and assembles them into a
/// <see cref="SiteHealthReport"/> on demand.
/// Thread-safe: per-interval counters use Interlocked operations, per-connection
/// data uses ConcurrentDictionary, and every other value is an immutable
/// snapshot published through a volatile field, so a report never observes a
/// caller-owned collection or a half-applied update.
/// </summary>
public class SiteHealthCollector : ISiteHealthCollector
{
    /// <summary>
    /// Immutable instance-count triple. It is swapped as a single reference so a
    /// report can never combine values from two different
    /// <see cref="SetInstanceCounts"/> calls.
    /// </summary>
    private sealed record InstanceCounts(int Deployed, int Enabled, int Disabled);

    // Per-interval counters: drained to zero by CollectReport, restored by AddIntervalCounters.
    private int _scriptErrorCount;
    private int _alarmErrorCount;
    private int _deadLetterCount;
    private int _siteAuditWriteFailures;
    private int _auditRedactionFailures;

    private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog;

    // Per-connection state, keyed by connection name.
    private readonly ConcurrentDictionary<string, ConnectionHealth> _connectionStatuses = new();
    private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
    private readonly ConcurrentDictionary<string, string> _connectionEndpoints = new();
    private readonly ConcurrentDictionary<string, TagQualityCounts> _tagQualityCounts = new();

    // Private copies taken at set time; never mutated after publication.
    private volatile IReadOnlyDictionary<string, int> _sfBufferDepths = new Dictionary<string, int>();
    private volatile InstanceCounts _instanceCounts = new(0, 0, 0);
    private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;

    private int _parkedMessageCount;
    private volatile string _nodeHostname = "";
    private volatile bool _isActiveNode;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a collector. The <paramref name="timeProvider"/> stamps each
    /// report's timestamp; it defaults to <see cref="TimeProvider.System"/> and
    /// is injectable so the report timestamp is deterministically testable.
    /// </summary>
    /// <param name="timeProvider">Optional custom time provider; defaults to system time.</param>
    public SiteHealthCollector(TimeProvider? timeProvider = null)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <inheritdoc />
    public void IncrementScriptError() => Interlocked.Increment(ref _scriptErrorCount);

    /// <inheritdoc />
    public void IncrementAlarmError() => Interlocked.Increment(ref _alarmErrorCount);

    /// <inheritdoc />
    public void IncrementDeadLetter() => Interlocked.Increment(ref _deadLetterCount);

    /// <inheritdoc />
    public void IncrementSiteAuditWriteFailures() => Interlocked.Increment(ref _siteAuditWriteFailures);

    /// <inheritdoc />
    public void IncrementAuditRedactionFailure() => Interlocked.Increment(ref _auditRedactionFailures);

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is null.</exception>
    public void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot)
    {
        _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
    }

    /// <inheritdoc />
    public void UpdateConnectionHealth(string connectionName, ConnectionHealth health)
    {
        _connectionStatuses[connectionName] = health;
    }

    /// <inheritdoc />
    public void RemoveConnection(string connectionName)
    {
        // Drop the connection from every per-connection map so no stale entry
        // for it shows up in later reports.
        _connectionStatuses.TryRemove(connectionName, out _);
        _tagResolutionCounts.TryRemove(connectionName, out _);
        _connectionEndpoints.TryRemove(connectionName, out _);
        _tagQualityCounts.TryRemove(connectionName, out _);
    }

    /// <inheritdoc />
    public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved)
    {
        _tagResolutionCounts[connectionName] = new TagResolutionStatus(totalSubscribed, successfullyResolved);
    }

    /// <inheritdoc />
    public void UpdateConnectionEndpoint(string connectionName, string endpoint)
    {
        _connectionEndpoints[connectionName] = endpoint;
    }

    /// <inheritdoc />
    public void UpdateTagQuality(string connectionName, int good, int bad, int uncertain)
    {
        _tagQualityCounts[connectionName] = new TagQualityCounts(good, bad, uncertain);
    }

    /// <inheritdoc />
    public void SetParkedMessageCount(int count)
    {
        Interlocked.Exchange(ref _parkedMessageCount, count);
    }

    /// <inheritdoc />
    public void SetNodeHostname(string hostname) => _nodeHostname = hostname;

    /// <inheritdoc />
    public void SetClusterNodes(IReadOnlyList<Commons.Messages.Health.NodeStatus> nodes)
    {
        // Copy now, on the caller's thread: CollectReport enumerates this list
        // later on another thread, and the caller may keep mutating its own
        // instance. A null argument is still accepted and yields a report with
        // no cluster node list, as before.
        _clusterNodes = nodes is null ? null : nodes.ToArray();
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="depths"/> is null.</exception>
    public void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths)
    {
        // Reject null here instead of letting it surface as a failure inside
        // every later CollectReport call.
        ArgumentNullException.ThrowIfNull(depths);

        // Defensive copy for the same reason as SetClusterNodes: the report
        // thread must never enumerate a dictionary the caller can still modify.
        _sfBufferDepths = new Dictionary<string, int>(depths);
    }

    /// <inheritdoc />
    public void SetInstanceCounts(int deployed, int enabled, int disabled)
    {
        // Single reference swap: the three values are always observed together.
        _instanceCounts = new InstanceCounts(deployed, enabled, disabled);
    }

    /// <inheritdoc />
    public void SetActiveNode(bool isActive) => _isActiveNode = isActive;

    /// <inheritdoc />
    public bool IsActiveNode => _isActiveNode;

    /// <inheritdoc />
    public void AddIntervalCounters(
        int scriptErrors,
        int alarmErrors,
        int deadLetters,
        int siteAuditWriteFailures,
        int auditRedactionFailures)
    {
        // HealthMonitoring-017: each counter is restored atomically via
        // Interlocked.Add so an increment that arrived during the failed Send
        // (and therefore accumulated against the zero left by CollectReport's
        // Exchange) is correctly summed with the values being put back. No
        // ordering between the five Adds is required — they target independent
        // fields.
        if (scriptErrors != 0) Interlocked.Add(ref _scriptErrorCount, scriptErrors);
        if (alarmErrors != 0) Interlocked.Add(ref _alarmErrorCount, alarmErrors);
        if (deadLetters != 0) Interlocked.Add(ref _deadLetterCount, deadLetters);
        if (siteAuditWriteFailures != 0) Interlocked.Add(ref _siteAuditWriteFailures, siteAuditWriteFailures);
        if (auditRedactionFailures != 0) Interlocked.Add(ref _auditRedactionFailures, auditRedactionFailures);
    }

    /// <inheritdoc />
    public SiteHealthReport CollectReport(string siteId)
    {
        // Take every non-destructive snapshot first. Should any of this throw,
        // the per-interval counters are still intact for the next attempt.
        var connectionStatuses = new Dictionary<string, ConnectionHealth>(_connectionStatuses);
        var tagResolution = new Dictionary<string, TagResolutionStatus>(_tagResolutionCounts);
        var connectionEndpoints = new Dictionary<string, string>(_connectionEndpoints);
        var tagQuality = new Dictionary<string, TagQualityCounts>(_tagQualityCounts);

        // Each report gets its own copies so receivers never share a collection.
        var sfBufferDepths = new Dictionary<string, int>(_sfBufferDepths);
        var clusterNodes = _clusterNodes?.ToList();

        var instanceCounts = _instanceCounts;
        var parkedMessages = Volatile.Read(ref _parkedMessageCount);

        // Node role is derived from the active/standby flag.
        var nodeRole = _isActiveNode ? "Active" : "Standby";
        var reportTimestamp = _timeProvider.GetUtcNow();

        // Drain the per-interval counters last: read and reset in one atomic step each.
        var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0);
        var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0);
        var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0);
        var siteAuditWriteFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0);
        var auditRedactionFailures = Interlocked.Exchange(ref _auditRedactionFailures, 0);

        return new SiteHealthReport(
            SiteId: siteId,
            SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
            ReportTimestamp: reportTimestamp,
            DataConnectionStatuses: connectionStatuses,
            TagResolutionCounts: tagResolution,
            ScriptErrorCount: scriptErrors,
            AlarmEvaluationErrorCount: alarmErrors,
            StoreAndForwardBufferDepths: sfBufferDepths,
            DeadLetterCount: deadLetters,
            DeployedInstanceCount: instanceCounts.Deployed,
            EnabledInstanceCount: instanceCounts.Enabled,
            DisabledInstanceCount: instanceCounts.Disabled,
            NodeRole: nodeRole,
            NodeHostname: _nodeHostname,
            DataConnectionEndpoints: connectionEndpoints,
            DataConnectionTagQuality: tagQuality,
            ParkedMessageCount: parkedMessages,
            ClusterNodes: clusterNodes,
            SiteAuditWriteFailures: siteAuditWriteFailures,
            AuditRedactionFailure: auditRedactionFailures,
            SiteAuditBacklog: _siteAuditBacklog);
    }
}
|
||||
@@ -0,0 +1,47 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
/// <summary>
/// The central aggregator's in-memory view of one site's health.
/// Instances are immutable: each state transition creates a new instance, which
/// the aggregator installs into its <c>ConcurrentDictionary</c> via an atomic
/// compare-and-swap. Handing the reference directly to UI callers is therefore
/// safe — a consumer can never see a torn or half-applied update.
/// </summary>
public sealed record SiteHealthState
{
    /// <summary>Gets the unique identifier of the site this state belongs to.</summary>
    public required string SiteId { get; init; }

    /// <summary>
    /// Gets the most recent full <see cref="SiteHealthReport"/> received for the
    /// site. It is <c>null</c> when the site is known only through heartbeats
    /// and has not sent a report yet.
    /// </summary>
    public SiteHealthReport? LatestReport { get; init; }

    /// <summary>
    /// Gets the time at which the most recent full <see cref="SiteHealthReport"/>
    /// was processed. It is <c>null</c> when the site is known only through
    /// heartbeats and has not sent a report yet. The UI uses it to show report
    /// staleness during failover; the <c>null</c> case must be rendered as
    /// "no report yet" and not as a timestamp (a <c>default</c> sentinel would
    /// display as year-0001).
    /// </summary>
    public DateTimeOffset? LastReportReceivedAt { get; init; }

    /// <summary>
    /// Gets the time at which the latest signal of any kind (full report OR
    /// heartbeat) was received. Offline detection is driven by this value —
    /// heartbeats from the standby keep the site marked online even while the
    /// active node cannot produce a report (mid-failover, brief stalls). The
    /// heartbeat cadence belongs to the Cluster Infrastructure /
    /// SiteCommunicationActor (every
    /// CommunicationOptions.TransportHeartbeatInterval — 5s by default).
    /// </summary>
    public DateTimeOffset LastHeartbeatAt { get; init; }

    /// <summary>
    /// Gets the sequence number of the last health report that was accepted;
    /// it is used to reject out-of-order duplicates.
    /// </summary>
    public long LastSequenceNumber { get; init; }

    /// <summary>Gets whether the site is currently considered online.</summary>
    public bool IsOnline { get; init; }
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests" />
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.IntegrationTests" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user