fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage

2026-05-16 21:11:24 -04:00
parent 2502e4d10a
commit 9f634e37c3
7 changed files with 470 additions and 29 deletions
--- a/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs
+++ b/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs
@@ -103,17 +103,42 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
    }

    /// <summary>
-    /// Bumps the last-seen timestamp for a site already known via a prior
-    /// SiteHealthReport. Heartbeats from sites we have not yet received a
-    /// full report from are ignored — registration only happens on report.
-    /// The update is an atomic compare-and-swap of the immutable state.
+    /// Bumps the last-seen timestamp for a site. If a heartbeat arrives for a
+    /// site the aggregator has no state for yet (e.g. immediately after a central
+    /// restart/failover, when in-memory state is empty), the site is registered
+    /// as online with no <see cref="SiteHealthState.LatestReport"/> — heartbeats
+    /// prove the site is reachable, so it shows online straight away rather than
+    /// as "unknown" for up to a full report interval. The update is an atomic
+    /// compare-and-swap of the immutable state.
    /// </summary>
    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
    {
        while (true)
        {
            if (!_siteStates.TryGetValue(siteId, out var existing))
-                return;
+            {
+                // Unknown site — register it as online, awaiting its first
+                // full report. LatestReport stays null until ProcessReport runs.
+                var registered = new SiteHealthState
+                {
+                    SiteId = siteId,
+                    LatestReport = null,
+                    LastReportReceivedAt = default,
+                    LastHeartbeatAt = receivedAt,
+                    LastSequenceNumber = 0,
+                    IsOnline = true
+                };
+
+                if (_siteStates.TryAdd(siteId, registered))
+                {
+                    _logger.LogInformation(
+                        "Site {SiteId} registered online via heartbeat (awaiting first report)", siteId);
+                    return;
+                }
+
+                // Lost the race — another thread registered first; retry as an update.
+                continue;
+            }

            var newHeartbeat = receivedAt > existing.LastHeartbeatAt
                ? receivedAt
@@ -163,10 +188,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
-            "Central health aggregator started, offline timeout {Timeout}s",
-            _options.OfflineTimeout.TotalSeconds);
+            "Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
+            _options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);

-        // Check at half the offline timeout interval for timely detection
+        // Check at half of OfflineTimeout (the shorter of the two timeouts by default) for timely detection
        var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
        using var timer = new PeriodicTimer(checkInterval);

@@ -189,8 +214,17 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
            // healthy site node (cadence owned by Cluster Infrastructure /
            // SiteCommunicationActor), so OfflineTimeout only fires when no
            // node can reach central, not during single-node failovers.
+            //
+            // The synthetic "central" site has no heartbeat source — its only
+            // signal is the 30s CentralHealthReportLoop self-report — so it gets
+            // a longer grace window (CentralOfflineTimeout) to survive a single
+            // skipped/late self-report.
+            var timeout = kvp.Key == CentralHealthReportLoop.CentralSiteId
+                ? _options.CentralOfflineTimeout
+                : _options.OfflineTimeout;
+
            var elapsed = now - state.LastHeartbeatAt;
-            if (elapsed <= _options.OfflineTimeout)
+            if (elapsed <= timeout)
                continue;

            // Atomically swap to an offline copy. If the CAS loses to a
@@ -201,7 +235,7 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
            {
                _logger.LogWarning(
                    "Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
-                    state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
+                    state.SiteId, elapsed.TotalSeconds, timeout.TotalSeconds);
            }
        }
    }
--- a/src/ScadaLink.HealthMonitoring/HealthMonitoringOptions.cs
+++ b/src/ScadaLink.HealthMonitoring/HealthMonitoringOptions.cs
@@ -4,4 +4,17 @@ public class HealthMonitoringOptions
 {
    public TimeSpan ReportInterval { get; set; } = TimeSpan.FromSeconds(30);
    public TimeSpan OfflineTimeout { get; set; } = TimeSpan.FromMinutes(1);
+
+    /// <summary>
+    /// Offline timeout applied to the synthetic "central" site only. Real sites
+    /// emit frequent heartbeats that keep <c>LastHeartbeatAt</c> fresh, so the
+    /// normal <see cref="OfflineTimeout"/> only fires on genuine total loss. The
+    /// "central" self-report has no heartbeat source — its only signal is the
+    /// 30s <see cref="CentralHealthReportLoop"/>, so a single skipped/late
+    /// self-report (leader GC pause, brief stall, mid-failover before the new
+    /// leader's loop spins up) would flap it offline under the 60s site timeout.
+    /// A longer central grace gives the equivalent of "one missed report" that
+    /// the design doc grants real sites. Default: 3 minutes (6x the default 30s report interval).
+    /// </summary>
+    public TimeSpan CentralOfflineTimeout { get; set; } = TimeSpan.FromMinutes(3);
 }
--- a/src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs
+++ b/src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs
@@ -11,10 +11,13 @@ public interface ICentralHealthAggregator
    void ProcessReport(SiteHealthReport report);

    /// <summary>
-    /// Bumps the last-seen timestamp for a site already known via a prior
-    /// SiteHealthReport. Used to keep a site marked online between full
-    /// 30s reports when ~2s heartbeats are arriving — protects against the
-    /// 60s offline threshold firing on a transiently delayed report.
+    /// Bumps the last-seen timestamp for a site, keeping it marked online
+    /// between full 30s reports when heartbeats are arriving — protects against
+    /// the offline threshold firing on a transiently delayed report. A heartbeat
+    /// for a site with no aggregator state yet (e.g. just after a central
+    /// restart/failover) registers that site as online with no
+    /// <see cref="SiteHealthState.LatestReport"/>, so reachable sites are not
+    /// shown as "unknown" during the failover window.
    /// </summary>
    void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);