refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,144 @@
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Central-side counterpart to <see cref="HealthReportSender"/>.
+/// Periodically builds a SiteHealthReport for the central cluster itself
+/// (siteId = <see cref="CentralSiteId"/>) and feeds it into the local
+/// CentralHealthAggregator so the UI can render central as another card
+/// on /monitoring/health. Only the cluster leader (Primary) generates
+/// reports; on a standby node the loop keeps ticking but only records that
+/// the node is not active, and it begins publishing on the first tick after
+/// the node becomes Primary.
+/// </summary>
+public class CentralHealthReportLoop : BackgroundService
+{
+    /// <summary>
+    /// Reserved siteId used to represent the central cluster in the
+    /// shared CentralHealthAggregator keyspace.
+    ///
+    /// HealthMonitoring-021: the value is prefixed with <c>$</c> — a character
+    /// that is forbidden in real site identifiers (the configuration /
+    /// repository layer only permits Sites whose <c>SiteIdentifier</c> is a
+    /// plain identifier) — so the synthetic central entry cannot collide with
+    /// a real site whose operator-set identifier happened to be the bare word
+    /// "central". A collision would have caused the two reports to clobber
+    /// each other in the aggregator keyspace via the sequence-number guard,
+    /// and the real site would inherit the longer
+    /// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> grace and
+    /// stay falsely-online for an extra two minutes after going down.
+    /// Consumers (<see cref="CentralHealthAggregator.CheckForOfflineSites"/>,
+    /// the Central UI health dashboard) reference this constant rather than
+    /// the literal string, so the change is local.
+    /// </summary>
+    public const string CentralSiteId = "$central";
+
+    private readonly ISiteHealthCollector _collector;
+    private readonly ICentralHealthAggregator _aggregator;
+    private readonly IClusterNodeProvider _clusterNodeProvider;
+    private readonly HealthMonitoringOptions _options;
+    private readonly ILogger<CentralHealthReportLoop> _logger;
+
+    // Seeded with Unix-ms so reports from a newly-elected central leader
+    // sort after reports from any prior leader for the CentralSiteId entry.
+    // The clock is read through the injected TimeProvider so the seeding is
+    // deterministically testable.
+    private long _sequenceNumber;
+
+    /// <summary>
+    /// Initializes the central health report loop.
+    /// </summary>
+    /// <param name="collector">Local health metrics collector for the central node.</param>
+    /// <param name="aggregator">Aggregator that stores reports for the Central UI health dashboard.</param>
+    /// <param name="clusterNodeProvider">Provider used to determine whether this node is primary.</param>
+    /// <param name="options">Health monitoring configuration (report interval, offline threshold).</param>
+    /// <param name="logger">Logger for diagnostics.</param>
+    /// <param name="timeProvider">Optional time provider; defaults to <see cref="TimeProvider.System"/>.</param>
+    /// <exception cref="ArgumentNullException">
+    /// Any argument other than <paramref name="timeProvider"/> is <see langword="null"/>.
+    /// </exception>
+    public CentralHealthReportLoop(
+        ISiteHealthCollector collector,
+        ICentralHealthAggregator aggregator,
+        IClusterNodeProvider clusterNodeProvider,
+        IOptions<HealthMonitoringOptions> options,
+        ILogger<CentralHealthReportLoop> logger,
+        TimeProvider? timeProvider = null)
+    {
+        // Fail fast with a named argument: a missing dependency would
+        // otherwise surface as a NullReferenceException, in most cases only
+        // once the loop is already running.
+        ArgumentNullException.ThrowIfNull(collector);
+        ArgumentNullException.ThrowIfNull(aggregator);
+        ArgumentNullException.ThrowIfNull(clusterNodeProvider);
+        ArgumentNullException.ThrowIfNull(options);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _collector = collector;
+        _aggregator = aggregator;
+        _clusterNodeProvider = clusterNodeProvider;
+        _options = options.Value;
+        _logger = logger;
+        _sequenceNumber = (timeProvider ?? TimeProvider.System).GetUtcNow().ToUnixTimeMilliseconds();
+    }
+
+    /// <summary>
+    /// Current sequence number (for testing).
+    /// </summary>
+    public long CurrentSequenceNumber => Interlocked.Read(ref _sequenceNumber);
+
+    /// <summary>
+    /// Runs the report loop: once per <see cref="HealthMonitoringOptions.ReportInterval"/>
+    /// tick it attempts to publish a central health report. A failure in a
+    /// single tick is logged and the loop continues with the next tick.
+    /// </summary>
+    /// <param name="stoppingToken">Token passed to the timer wait; cancelling it ends the loop.</param>
+    /// <returns>A task that completes when the loop stops.</returns>
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        _logger.LogInformation(
+            "Central health report loop starting, interval {Interval}s",
+            _options.ReportInterval.TotalSeconds);
+
+        using var timer = new PeriodicTimer(_options.ReportInterval);
+
+        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
+        {
+            try
+            {
+                PublishReportIfPrimary();
+            }
+            catch (Exception ex)
+            {
+                // One failed tick must not end the loop; log and wait for
+                // the next tick.
+                _logger.LogError(ex, "Failed to generate central health report");
+            }
+        }
+    }
+
+    /// <summary>
+    /// Performs the work of one timer tick: records whether this node is the
+    /// active (primary) node and, if it is, collects a report for
+    /// <see cref="CentralSiteId"/>, stamps it with the next sequence number
+    /// and hands it to the aggregator. Exceptions propagate to the caller.
+    /// </summary>
+    private void PublishReportIfPrimary()
+    {
+        var isPrimary = _clusterNodeProvider.SelfIsPrimary;
+        _collector.SetActiveNode(isPrimary);
+
+        if (!isPrimary)
+        {
+            // Standby node: nothing is collected or published this tick.
+            return;
+        }
+
+        _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());
+
+        var seq = Interlocked.Increment(ref _sequenceNumber);
+
+        // HealthMonitoring-018: CollectReport atomically read-and-resets
+        // the per-interval error counters via Interlocked.Exchange. If
+        // ProcessReport throws (or any other failure occurs between the
+        // collect and the publish), those counts would otherwise be
+        // lost — neither in the un-published report nor in the
+        // now-zeroed collector. Snapshot the freshly-collected report
+        // so that on a publish failure we can atomically restore the
+        // counts back into the shared SiteHealthCollector via
+        // Interlocked.Add. Concurrent increments arriving during the
+        // ProcessReport call are preserved on the counter; the restore
+        // Add safely sums with any such concurrent increments. Same
+        // shape as the HealthMonitoring-017 fix in HealthReportSender.
+        var report = _collector.CollectReport(CentralSiteId);
+        var reportWithSeq = report with { SequenceNumber = seq };
+
+        try
+        {
+            _aggregator.ProcessReport(reportWithSeq);
+        }
+        catch
+        {
+            // Restore the captured per-interval counters atomically so
+            // they roll forward into the next report — see
+            // HealthMonitoring-018.
+            _collector.AddIntervalCounters(
+                scriptErrors: report.ScriptErrorCount,
+                alarmErrors: report.AlarmEvaluationErrorCount,
+                deadLetters: report.DeadLetterCount,
+                siteAuditWriteFailures: report.SiteAuditWriteFailures,
+                auditRedactionFailures: report.AuditRedactionFailure);
+            throw;
+        }
+
+        _logger.LogDebug("Generated central health report #{Seq}", seq);
+    }
+}