fix(health-monitoring): resolve HealthMonitoring-001/002 — populate S&F buffer depth, make SiteHealthState immutable

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent 340a70f0e6
commit 7d7214a4ca
7 changed files with 287 additions and 60 deletions

View File

@@ -33,16 +33,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
/// Only replaces stored state if incoming sequence number is greater than last received.
/// Auto-marks previously offline sites as online.
/// </summary>
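/// <remarks>
/// <see cref="SiteHealthState"/> is immutable: each transition produces a brand-new
/// instance, and the dictionary entry is replaced atomically. The replacement is
/// performed in a compare-and-swap retry loop rather than via the
/// <c>AddOrUpdate</c> update delegate so the sequence-number guard and the field
/// writes are evaluated as a single atomic step against the value actually
/// installed — the <c>AddOrUpdate</c> delegate may be invoked more than once
/// under contention and could otherwise act on a value that is then discarded.
/// Because <see cref="SiteHealthState"/> is a record, <c>TryUpdate</c> compares the
/// installed entry with the snapshot by value equality rather than by reference;
/// a swap therefore also succeeds over an equal-valued entry, which is harmless
/// because the state being replaced is indistinguishable from the one that was read.
/// </remarks>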
public void ProcessReport(SiteHealthReport report)
{
var now = _timeProvider.GetUtcNow();
_siteStates.AddOrUpdate(
report.SiteId,
_ =>
while (true)
{
if (!_siteStates.TryGetValue(report.SiteId, out var existing))
{
_logger.LogInformation("Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
return new SiteHealthState
var registered = new SiteHealthState
{
SiteId = report.SiteId,
LatestReport = report,
@@ -51,50 +59,84 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
LastSequenceNumber = report.SequenceNumber,
IsOnline = true
};
},
(_, existing) =>
if (_siteStates.TryAdd(report.SiteId, registered))
{
_logger.LogInformation(
"Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
return;
}
// Lost the race — another thread registered first; retry as an update.
continue;
}
if (report.SequenceNumber <= existing.LastSequenceNumber)
{
if (report.SequenceNumber <= existing.LastSequenceNumber)
_logger.LogDebug(
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
return;
}
var updated = existing with
{
LatestReport = report,
LastReportReceivedAt = now,
LastHeartbeatAt = now,
LastSequenceNumber = report.SequenceNumber,
IsOnline = true
};
if (_siteStates.TryUpdate(report.SiteId, updated, existing))
{
if (!existing.IsOnline)
{
_logger.LogDebug(
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
return existing;
_logger.LogInformation(
"Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
}
return;
}
var wasOffline = !existing.IsOnline;
existing.LatestReport = report;
existing.LastReportReceivedAt = now;
existing.LastHeartbeatAt = now;
existing.LastSequenceNumber = report.SequenceNumber;
existing.IsOnline = true;
if (wasOffline)
{
_logger.LogInformation("Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
}
return existing;
});
// CAS lost — the entry changed under us; retry with the fresh value.
}
}
/// <summary>
/// Bumps the last-seen timestamp for a site already known via a prior
/// SiteHealthReport. Heartbeats from sites we have not yet received a
/// full report from are ignored — registration only happens on report.
/// The update is an atomic compare-and-swap of the immutable state.
/// </summary>
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
{
if (!_siteStates.TryGetValue(siteId, out var state))
return;
if (receivedAt > state.LastHeartbeatAt)
state.LastHeartbeatAt = receivedAt;
if (!state.IsOnline)
while (true)
{
state.IsOnline = true;
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
if (!_siteStates.TryGetValue(siteId, out var existing))
return;
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
? receivedAt
: existing.LastHeartbeatAt;
// Nothing to change — avoid a needless swap.
if (newHeartbeat == existing.LastHeartbeatAt && existing.IsOnline)
return;
var updated = existing with
{
LastHeartbeatAt = newHeartbeat,
IsOnline = true
};
if (_siteStates.TryUpdate(siteId, updated, existing))
{
if (!existing.IsOnline)
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
return;
}
// CAS lost — retry with the fresh value.
}
}
@@ -143,13 +185,20 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
var state = kvp.Value;
if (!state.IsOnline) continue;
// Use LastHeartbeatAt — heartbeats arrive every ~5s from any
// healthy site node, so OfflineTimeout only fires when no node
// can reach central, not during single-node failovers.
// Use LastHeartbeatAt — heartbeats arrive frequently from any
// healthy site node (cadence owned by Cluster Infrastructure /
// SiteCommunicationActor), so OfflineTimeout only fires when no
// node can reach central, not during single-node failovers.
var elapsed = now - state.LastHeartbeatAt;
if (elapsed > _options.OfflineTimeout)
if (elapsed <= _options.OfflineTimeout)
continue;
// Atomically swap to an offline copy. If the CAS loses to a
// concurrent report/heartbeat the site was just heard from, so
// leaving it online is the correct outcome — no retry needed.
var offline = state with { IsOnline = false };
if (_siteStates.TryUpdate(kvp.Key, offline, state))
{
state.IsOnline = false;
_logger.LogWarning(
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);

View File

@@ -84,6 +84,20 @@ public class HealthReportSender : BackgroundService
_collector.SetParkedMessageCount(parkedCount);
}
catch { /* Non-fatal — parked count will be 0 */ }
try
{
// Per-category pending-message buffer depths (the documented
// "store-and-forward buffer depth" triage metric). Keyed by
// StoreAndForwardCategory name so the central dashboard can
// render external/notification/DB-write depths separately.
var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync();
var depths = depthsByCategory.ToDictionary(
kvp => kvp.Key.ToString(),
kvp => kvp.Value);
_collector.SetStoreAndForwardDepths(depths);
}
catch { /* Non-fatal — buffer depths will be empty */ }
}
var seq = Interlocked.Increment(ref _sequenceNumber);

View File

@@ -4,26 +4,37 @@ namespace ScadaLink.HealthMonitoring;
/// <summary>
/// In-memory state for a single site's health, stored by the central aggregator.
/// Immutable: every state transition produces a new instance which the aggregator
/// installs into its <c>ConcurrentDictionary</c> via an atomic compare-and-swap.
/// This makes handing the reference straight to UI callers safe — a consumer can
/// never observe a torn or half-applied update.
/// </summary>
public class SiteHealthState
public sealed record SiteHealthState
{
public required string SiteId { get; init; }
public SiteHealthReport LatestReport { get; set; } = null!;
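/// <summary>
/// The latest full <see cref="SiteHealthReport"/> received for the site, or
/// <c>null</c> if no report has been stored yet. The aggregator registers a site
/// only when its first report arrives (heartbeats from unregistered sites are
/// ignored), so entries created by report processing always carry a report.
/// </summary>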
public SiteHealthReport? LatestReport { get; init; }
/// <summary>
/// Time the latest full <see cref="SiteHealthReport"/> was processed.
/// Used by the UI to surface report staleness during failover.
/// </summary>
public DateTimeOffset LastReportReceivedAt { get; set; }
public DateTimeOffset LastReportReceivedAt { get; init; }
/// <summary>
/// Time the most recent signal of any kind (full report OR ~5s heartbeat)
/// was received. Drives offline detection — heartbeats from the standby
/// keep the site marked online even when the active node is unable to
/// produce a report (mid-failover, brief stalls).
/// Time the most recent signal of any kind (full report OR heartbeat) was
/// received. Drives offline detection — heartbeats from the standby keep the
/// site marked online even when the active node is unable to produce a report
/// (mid-failover, brief stalls). See the heartbeat scheduler owned by the
/// Cluster Infrastructure / SiteCommunicationActor for the actual cadence.
/// </summary>
public DateTimeOffset LastHeartbeatAt { get; set; }
public DateTimeOffset LastHeartbeatAt { get; init; }
public long LastSequenceNumber { get; set; }
public bool IsOnline { get; set; }
public long LastSequenceNumber { get; init; }
public bool IsOnline { get; init; }
}