fix(health-monitoring): resolve HealthMonitoring-001/002 — populate S&F buffer depth, make SiteHealthState immutable

2026-05-16 19:40:40 -04:00
parent 340a70f0e6
commit 7d7214a4ca
7 changed files with 287 additions and 60 deletions
--- a/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs
+++ b/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs
@@ -33,16 +33,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
    /// Only replaces stored state if incoming sequence number is greater than last received.
    /// Auto-marks previously offline sites as online.
    /// </summary>
+    /// <remarks>
+    /// <see cref="SiteHealthState"/> is immutable: each transition produces a brand-new
+    /// instance, and the dictionary entry is replaced atomically. The mutation is
+    /// performed in a compare-and-swap retry loop rather than via the
+    /// <c>AddOrUpdate</c> update delegate so the sequence-number guard and the field
+    /// writes are evaluated as a single atomic step against the value actually
+    /// installed — the <c>AddOrUpdate</c> delegate may be invoked more than once
+    /// under contention and could otherwise act on a value that is then discarded.
+    /// </remarks>
    public void ProcessReport(SiteHealthReport report)
    {
        var now = _timeProvider.GetUtcNow();

-        _siteStates.AddOrUpdate(
-            report.SiteId,
-            _ =>
+        while (true)
+        {
+            if (!_siteStates.TryGetValue(report.SiteId, out var existing))
            {
-                _logger.LogInformation("Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
-                return new SiteHealthState
+                var registered = new SiteHealthState
                {
                    SiteId = report.SiteId,
                    LatestReport = report,
@@ -51,50 +59,84 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
                    LastSequenceNumber = report.SequenceNumber,
                    IsOnline = true
                };
-            },
-            (_, existing) =>
+
+                if (_siteStates.TryAdd(report.SiteId, registered))
+                {
+                    _logger.LogInformation(
+                        "Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
+                    return;
+                }
+
+                // Lost the race — another thread registered first; retry as an update.
+                continue;
+            }
+
+            if (report.SequenceNumber <= existing.LastSequenceNumber)
            {
-                if (report.SequenceNumber <= existing.LastSequenceNumber)
+                _logger.LogDebug(
+                    "Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
+                    report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
+                return;
+            }
+
+            var updated = existing with
+            {
+                LatestReport = report,
+                LastReportReceivedAt = now,
+                LastHeartbeatAt = now,
+                LastSequenceNumber = report.SequenceNumber,
+                IsOnline = true
+            };
+
+            if (_siteStates.TryUpdate(report.SiteId, updated, existing))
+            {
+                if (!existing.IsOnline)
                {
-                    _logger.LogDebug(
-                        "Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
-                        report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
-                    return existing;
+                    _logger.LogInformation(
+                        "Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
                }
+                return;
+            }

-                var wasOffline = !existing.IsOnline;
-                existing.LatestReport = report;
-                existing.LastReportReceivedAt = now;
-                existing.LastHeartbeatAt = now;
-                existing.LastSequenceNumber = report.SequenceNumber;
-                existing.IsOnline = true;
-
-                if (wasOffline)
-                {
-                    _logger.LogInformation("Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
-                }
-
-                return existing;
-            });
+            // CAS lost — the entry changed under us; retry with the fresh value.
+        }
    }

    /// <summary>
    /// Bumps the last-seen timestamp for a site already known via a prior
    /// SiteHealthReport. Heartbeats from sites we have not yet received a
    /// full report from are ignored — registration only happens on report.
+    /// The update is an atomic compare-and-swap of the immutable state.
    /// </summary>
    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
    {
-        if (!_siteStates.TryGetValue(siteId, out var state))
-            return;
-
-        if (receivedAt > state.LastHeartbeatAt)
-            state.LastHeartbeatAt = receivedAt;
-
-        if (!state.IsOnline)
+        while (true)
        {
-            state.IsOnline = true;
-            _logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
+            if (!_siteStates.TryGetValue(siteId, out var existing))
+                return;
+
+            var newHeartbeat = receivedAt > existing.LastHeartbeatAt
+                ? receivedAt
+                : existing.LastHeartbeatAt;
+
+            // Nothing to change — avoid a needless swap.
+            if (newHeartbeat == existing.LastHeartbeatAt && existing.IsOnline)
+                return;
+
+            var updated = existing with
+            {
+                LastHeartbeatAt = newHeartbeat,
+                IsOnline = true
+            };
+
+            if (_siteStates.TryUpdate(siteId, updated, existing))
+            {
+                if (!existing.IsOnline)
+                    _logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
+                return;
+            }
+
+            // CAS lost — retry with the fresh value.
        }
    }

@@ -143,13 +185,20 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
            var state = kvp.Value;
            if (!state.IsOnline) continue;

-            // Use LastHeartbeatAt — heartbeats arrive every ~5s from any
-            // healthy site node, so OfflineTimeout only fires when no node
-            // can reach central, not during single-node failovers.
+            // Use LastHeartbeatAt — heartbeats arrive frequently from any
+            // healthy site node (cadence owned by Cluster Infrastructure /
+            // SiteCommunicationActor), so OfflineTimeout only fires when no
+            // node can reach central, not during single-node failovers.
            var elapsed = now - state.LastHeartbeatAt;
-            if (elapsed > _options.OfflineTimeout)
+            if (elapsed <= _options.OfflineTimeout)
+                continue;
+
+            // Atomically swap to an offline copy. If the CAS loses to a
+            // concurrent report/heartbeat, the site was just heard from, so
+            // leaving it online is the correct outcome — no retry needed.
+            var offline = state with { IsOnline = false };
+            if (_siteStates.TryUpdate(kvp.Key, offline, state))
            {
-                state.IsOnline = false;
                _logger.LogWarning(
                    "Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
                    state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
--- a/src/ScadaLink.HealthMonitoring/HealthReportSender.cs
+++ b/src/ScadaLink.HealthMonitoring/HealthReportSender.cs
@@ -84,6 +84,20 @@ public class HealthReportSender : BackgroundService
                        _collector.SetParkedMessageCount(parkedCount);
                    }
                    catch { /* Non-fatal — parked count will be 0 */ }
+
+                    try
+                    {
+                        // Per-category pending-message buffer depths (the documented
+                        // "store-and-forward buffer depth" triage metric). Keyed by
+                        // StoreAndForwardCategory name so the central dashboard can
+                        // render external/notification/DB-write depths separately.
+                        var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync();
+                        var depths = depthsByCategory.ToDictionary(
+                            kvp => kvp.Key.ToString(),
+                            kvp => kvp.Value);
+                        _collector.SetStoreAndForwardDepths(depths);
+                    }
+                    catch { /* Non-fatal — buffer depths will be empty */ }
                }

                var seq = Interlocked.Increment(ref _sequenceNumber);
--- a/src/ScadaLink.HealthMonitoring/SiteHealthState.cs
+++ b/src/ScadaLink.HealthMonitoring/SiteHealthState.cs
@@ -4,26 +4,37 @@ namespace ScadaLink.HealthMonitoring;

 /// <summary>
 /// In-memory state for a single site's health, stored by the central aggregator.
+/// Immutable: every state transition produces a new instance which the aggregator
+/// installs into its <c>ConcurrentDictionary</c> via an atomic compare-and-swap.
+/// This makes handing the reference straight to UI callers safe — a consumer can
+/// never observe a torn or half-applied update.
 /// </summary>
-public class SiteHealthState
+public sealed record SiteHealthState
 {
    public required string SiteId { get; init; }
-    public SiteHealthReport LatestReport { get; set; } = null!;
+
+    /// <summary>
+    /// The latest full <see cref="SiteHealthReport"/> received for the site, or
+    /// <c>null</c> if none has been recorded. The aggregator registers a site only
+    /// on its first report, so heartbeats alone never create a state.
+    /// </summary>
+    public SiteHealthReport? LatestReport { get; init; }

    /// <summary>
    /// Time the latest full <see cref="SiteHealthReport"/> was processed.
    /// Used by the UI to surface report staleness during failover.
    /// </summary>
-    public DateTimeOffset LastReportReceivedAt { get; set; }
+    public DateTimeOffset LastReportReceivedAt { get; init; }

    /// <summary>
-    /// Time the most recent signal of any kind (full report OR ~5s heartbeat)
-    /// was received. Drives offline detection — heartbeats from the standby
-    /// keep the site marked online even when the active node is unable to
-    /// produce a report (mid-failover, brief stalls).
+    /// Time the most recent signal of any kind (full report OR heartbeat) was
+    /// received. Drives offline detection — heartbeats from the standby keep the
+    /// site marked online even when the active node is unable to produce a report
+    /// (mid-failover, brief stalls). See the heartbeat scheduler owned by the
+    /// Cluster Infrastructure / SiteCommunicationActor for the actual cadence.
    /// </summary>
-    public DateTimeOffset LastHeartbeatAt { get; set; }
+    public DateTimeOffset LastHeartbeatAt { get; init; }

-    public long LastSequenceNumber { get; set; }
-    public bool IsOnline { get; set; }
+    public long LastSequenceNumber { get; init; }
+    public bool IsOnline { get; init; }
 }