7b0b9c7365
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
145 lines
6.7 KiB
C#
145 lines
6.7 KiB
C#
using Microsoft.Extensions.Hosting;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
|
|
|
/// <summary>
/// Central-side counterpart to <see cref="HealthReportSender"/>.
/// On every report interval it builds a SiteHealthReport describing the
/// central cluster itself (siteId = <see cref="CentralSiteId"/>) and hands
/// it to the local CentralHealthAggregator, so the UI can show central as
/// one more card on /monitoring/health. Reports are produced only by the
/// cluster leader (Primary); the standby's aggregator catches up on
/// failover, once that node becomes Primary and starts its own loop.
/// </summary>
public class CentralHealthReportLoop : BackgroundService
{
    /// <summary>
    /// Reserved siteId under which the central cluster is stored in the
    /// shared CentralHealthAggregator keyspace.
    ///
    /// HealthMonitoring-021: the value starts with <c>$</c>, a character
    /// that real site identifiers may not contain (the configuration /
    /// repository layer only permits Sites whose <c>SiteIdentifier</c> is a
    /// plain identifier). The synthetic central entry therefore cannot
    /// collide with a real site whose operator-chosen identifier happens to
    /// be the bare word "central". Such a collision would have made the two
    /// reports clobber each other in the aggregator keyspace via the
    /// sequence-number guard, and the real site would inherit the longer
    /// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> grace and
    /// stay falsely-online for an extra two minutes after going down.
    /// Consumers (<see cref="CentralHealthAggregator.CheckForOfflineSites"/>,
    /// the Central UI health dashboard) reference this constant instead of
    /// the literal string, so the change stays local.
    /// </summary>
    public const string CentralSiteId = "$central";

    private readonly ISiteHealthCollector _collector;
    private readonly ICentralHealthAggregator _aggregator;
    private readonly IClusterNodeProvider _clusterNodeProvider;
    private readonly HealthMonitoringOptions _options;
    private readonly ILogger<CentralHealthReportLoop> _logger;

    // Starts at the current Unix time in milliseconds so that reports from a
    // newly-elected central leader always sort after reports published by
    // any prior leader for the CentralSiteId entry. The clock is read
    // through the injected TimeProvider, which keeps the seeding
    // deterministically testable.
    private long _sequenceNumber;

    /// <summary>
    /// Creates the central health report loop and seeds its sequence number
    /// from the supplied clock.
    /// </summary>
    /// <param name="collector">Local health metrics collector for the central node.</param>
    /// <param name="aggregator">Aggregator that stores reports for the Central UI health dashboard.</param>
    /// <param name="clusterNodeProvider">Provider used to determine whether this node is primary.</param>
    /// <param name="options">Health monitoring configuration (report interval, offline threshold).</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <param name="timeProvider">Optional time provider; defaults to <see cref="TimeProvider.System"/>.</param>
    public CentralHealthReportLoop(
        ISiteHealthCollector collector,
        ICentralHealthAggregator aggregator,
        IClusterNodeProvider clusterNodeProvider,
        IOptions<HealthMonitoringOptions> options,
        ILogger<CentralHealthReportLoop> logger,
        TimeProvider? timeProvider = null)
    {
        _collector = collector;
        _aggregator = aggregator;
        _clusterNodeProvider = clusterNodeProvider;
        _options = options.Value;
        _logger = logger;

        var clock = timeProvider ?? TimeProvider.System;
        _sequenceNumber = clock.GetUtcNow().ToUnixTimeMilliseconds();
    }

    /// <summary>
    /// Current sequence number (for testing).
    /// </summary>
    public long CurrentSequenceNumber => Interlocked.Read(ref _sequenceNumber);

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Central health report loop starting, interval {Interval}s",
            _options.ReportInterval.TotalSeconds);

        using var ticker = new PeriodicTimer(_options.ReportInterval);

        while (await ticker.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            try
            {
                RunReportCycle();
            }
            catch (Exception ex)
            {
                // A failed cycle is logged and the loop carries on with the
                // next tick.
                _logger.LogError(ex, "Failed to generate central health report");
            }
        }
    }

    /// <summary>
    /// Performs the work of a single timer tick: refreshes the collector's
    /// active-node flag and, when this node is Primary, collects a report
    /// for <see cref="CentralSiteId"/> and publishes it to the aggregator.
    /// Exceptions propagate to the caller.
    /// </summary>
    private void RunReportCycle()
    {
        var selfIsPrimary = _clusterNodeProvider.SelfIsPrimary;
        _collector.SetActiveNode(selfIsPrimary);

        // Only the Primary publishes; a standby stops after updating the flag.
        if (!selfIsPrimary)
        {
            return;
        }

        _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());

        var sequence = Interlocked.Increment(ref _sequenceNumber);

        // HealthMonitoring-018: CollectReport atomically reads and resets
        // the per-interval error counters via Interlocked.Exchange. Should
        // ProcessReport throw (or anything else fail between collecting and
        // publishing), those counts would otherwise vanish: they would be
        // neither in the unpublished report nor in the now-zeroed
        // collector. The freshly collected report is therefore kept as a
        // snapshot so that, on a publish failure, its counts can be added
        // back into the shared SiteHealthCollector via Interlocked.Add.
        // Increments arriving concurrently during ProcessReport remain on
        // the counter, and the restoring Add simply sums with them. Same
        // shape as the HealthMonitoring-017 fix in HealthReportSender.
        var snapshot = _collector.CollectReport(CentralSiteId);
        var stamped = snapshot with { SequenceNumber = sequence };

        try
        {
            _aggregator.ProcessReport(stamped);
        }
        catch
        {
            // Put the captured per-interval counters back so they roll
            // forward into the next report (HealthMonitoring-018), then let
            // the failure surface to the caller.
            _collector.AddIntervalCounters(
                scriptErrors: snapshot.ScriptErrorCount,
                alarmErrors: snapshot.AlarmEvaluationErrorCount,
                deadLetters: snapshot.DeadLetterCount,
                siteAuditWriteFailures: snapshot.SiteAuditWriteFailures,
                auditRedactionFailures: snapshot.AuditRedactionFailure);
            throw;
        }

        _logger.LogDebug("Generated central health report #{Seq}", sequence);
    }
}
|