refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj files, namespaces, and the ScadaLinkDbContext → ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,187 @@
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.StoreAndForward;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+/// <summary>
+/// Periodically collects a SiteHealthReport and sends it to central via Akka remoting.
+/// Sequence numbers are monotonic within a process and re-seeded on service restart.
+/// They are <b>not</b> zero/one-based: the per-process counter is seeded with the
+/// current Unix epoch (milliseconds) at construction so that, after a failover,
+/// reports from a freshly-active node always sort after reports from any prior
+/// active node for the same site — otherwise the central aggregator's sequence-number
+/// guard would silently reject the new active's first reports as stale.
+/// </summary>
+public class HealthReportSender : BackgroundService
+{
+    private readonly ISiteHealthCollector _collector;
+    private readonly IHealthReportTransport _transport;
+    private readonly HealthMonitoringOptions _options;
+    private readonly ILogger<HealthReportSender> _logger;
+    private readonly string _siteId;
+    private readonly StoreAndForwardStorage? _sfStorage;
+    private readonly IClusterNodeProvider? _clusterNodeProvider;
+
+    // Seeded with Unix-ms at construction so reports from a freshly-active
+    // node always sort after reports from any prior active node for the same
+    // site. Without this seeding, failover would silently drop the new
+    // active's first reports because their per-process counter starts below
+    // the prior active's last sequence number. The clock is read through the
+    // injected TimeProvider so the seeding is deterministically testable.
+    private long _sequenceNumber;
+
+    /// <summary>Initializes the sender and seeds the monotonic sequence number from the current Unix timestamp (milliseconds).</summary>
+    /// <param name="collector">Site health metric collector supplying the report payload.</param>
+    /// <param name="transport">Transport used to send the health report to central.</param>
+    /// <param name="options">Health monitoring options including the report interval.</param>
+    /// <param name="logger">Logger instance.</param>
+    /// <param name="siteIdentityProvider">Provides the site identifier embedded in each report.</param>
+    /// <param name="sfStorage">Optional store-and-forward storage for queue depth metrics.</param>
+    /// <param name="clusterNodeProvider">Optional cluster node provider for active-node detection.</param>
+    /// <param name="timeProvider">Optional time provider used to seed the sequence number; defaults to <see cref="TimeProvider.System"/>.</param>
+    public HealthReportSender(
+        ISiteHealthCollector collector,
+        IHealthReportTransport transport,
+        IOptions<HealthMonitoringOptions> options,
+        ILogger<HealthReportSender> logger,
+        ISiteIdentityProvider siteIdentityProvider,
+        StoreAndForwardStorage? sfStorage = null,
+        IClusterNodeProvider? clusterNodeProvider = null,
+        TimeProvider? timeProvider = null)
+    {
+        _collector = collector;
+        _transport = transport;
+        _options = options.Value;
+        _logger = logger;
+        _siteId = siteIdentityProvider.SiteId;
+        _sfStorage = sfStorage;
+        _clusterNodeProvider = clusterNodeProvider;
+        _sequenceNumber = (timeProvider ?? TimeProvider.System).GetUtcNow().ToUnixTimeMilliseconds();
+    }
+
+    /// <summary>
+    /// Current sequence number (for testing): the seed value until a report is attempted, afterwards the number of the most recent attempt.
+    /// </summary>
+    public long CurrentSequenceNumber => Interlocked.Read(ref _sequenceNumber);
+
+    /// <inheritdoc />
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        _logger.LogInformation(
+            "Health report sender starting for site {SiteId}, interval {Interval}s",
+            _siteId, _options.ReportInterval.TotalSeconds);
+
+        using var timer = new PeriodicTimer(_options.ReportInterval);
+
+        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
+        {
+            try
+            {
+                // Only the active node (running the DeploymentManager singleton) sends health reports.
+                // The standby node has no instance/connection data and would overwrite the active's report.
+                if (!_collector.IsActiveNode)
+                    continue;
+
+                if (_clusterNodeProvider is not null)
+                {
+                    try
+                    {
+                        _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes());
+                    }
+                    catch (Exception ex)
+                    {
+                        // Non-fatal — the report ships with the previous cluster
+                        // node list. Logged so a persistent failure is diagnosable.
+                        _logger.LogWarning(ex,
+                            "Failed to refresh cluster nodes for health report (site {SiteId}); using stale list",
+                            _siteId);
+                    }
+                }
+
+                if (_sfStorage is not null)
+                {
+                    try
+                    {
+                        var parkedCount = await _sfStorage.GetParkedMessageCountAsync().ConfigureAwait(false);
+                        _collector.SetParkedMessageCount(parkedCount);
+                    }
+                    catch (Exception ex)
+                    {
+                        // Non-fatal — parked count will be 0 in this report.
+                        _logger.LogWarning(ex,
+                            "Failed to query parked message count for health report (site {SiteId})",
+                            _siteId);
+                    }
+
+                    try
+                    {
+                        // Per-category pending-message buffer depths (the documented
+                        // "store-and-forward buffer depth" triage metric). Keyed by
+                        // StoreAndForwardCategory name so the central dashboard can
+                        // render external/notification/DB-write depths separately.
+                        var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync().ConfigureAwait(false);
+                        var depths = depthsByCategory.ToDictionary(
+                            kvp => kvp.Key.ToString(),
+                            kvp => kvp.Value);
+                        _collector.SetStoreAndForwardDepths(depths);
+                    }
+                    catch (Exception ex)
+                    {
+                        // Non-fatal — buffer depths will be empty in this report.
+                        _logger.LogWarning(ex,
+                            "Failed to query store-and-forward buffer depths for health report (site {SiteId})",
+                            _siteId);
+                    }
+                }
+
+                // Claimed before collecting/sending: a failed attempt still consumes
+                // its number, leaving a gap rather than ever reusing a sequence number.
+                var seq = Interlocked.Increment(ref _sequenceNumber);
+
+                // HealthMonitoring-017: CollectReport atomically reads and resets the
+                // per-interval error counters via Interlocked.Exchange. If the Send
+                // below throws, those counts would otherwise be lost forever — neither
+                // in the un-sent report nor in the now-zeroed collector. Keep the
+                // freshly-collected report so that on a transport failure the counts
+                // can be restored into the collector via Interlocked.Add and the next
+                // successful report includes them. Concurrent increments arriving
+                // during the Send stay on the counter (they accumulate against zero);
+                // the restore Add safely sums with any such concurrent increments.
+                var report = _collector.CollectReport(_siteId);
+
+                // Replace the placeholder sequence number with our monotonic one
+                var reportWithSeq = report with { SequenceNumber = seq };
+
+                try
+                {
+                    _transport.Send(reportWithSeq);
+                }
+                catch
+                {
+                    // Restore the captured per-interval counters atomically so
+                    // they roll forward into the next report — see
+                    // HealthMonitoring-017. Any concurrent increment that
+                    // arrived during the failed Send remains on the counter;
+                    // Interlocked.Add sums correctly with it.
+                    _collector.AddIntervalCounters(
+                        scriptErrors: report.ScriptErrorCount,
+                        alarmErrors: report.AlarmEvaluationErrorCount,
+                        deadLetters: report.DeadLetterCount,
+                        siteAuditWriteFailures: report.SiteAuditWriteFailures,
+                        auditRedactionFailures: report.AuditRedactionFailure);
+                    throw;
+                }
+
+                _logger.LogInformation("Sent health report #{Seq} for site {SiteId}", seq, _siteId);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Failed to send health report for site {SiteId}", _siteId);
+                // Continue sending — don't let a single failure stop reporting
+            }
+        }
+    }
+}