fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage

This commit is contained in:
Joseph Doherty
2026-05-16 21:11:24 -04:00
parent 2502e4d10a
commit 9f634e37c3
7 changed files with 470 additions and 29 deletions

View File

@@ -103,17 +103,42 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
}
/// <summary>
/// Bumps the last-seen timestamp for a site already known via a prior
/// SiteHealthReport. Heartbeats from sites we have not yet received a
/// full report from are ignored — registration only happens on report.
/// The update is an atomic compare-and-swap of the immutable state.
/// Bumps the last-seen timestamp for a site. If a heartbeat arrives for a
/// site the aggregator has no state for yet (e.g. immediately after a central
/// restart/failover, when in-memory state is empty), the site is registered
/// as online with no <see cref="SiteHealthState.LatestReport"/> — heartbeats
/// prove the site is reachable, so it shows online straight away rather than
/// as "unknown" for up to a full report interval. The update is an atomic
/// compare-and-swap of the immutable state.
/// </summary>
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
{
while (true)
{
if (!_siteStates.TryGetValue(siteId, out var existing))
return;
{
// Unknown site — register it as online, awaiting its first
// full report. LatestReport stays null until ProcessReport runs.
var registered = new SiteHealthState
{
SiteId = siteId,
LatestReport = null,
LastReportReceivedAt = default,
LastHeartbeatAt = receivedAt,
LastSequenceNumber = 0,
IsOnline = true
};
if (_siteStates.TryAdd(siteId, registered))
{
_logger.LogInformation(
"Site {SiteId} registered online via heartbeat (awaiting first report)", siteId);
return;
}
// Lost the race — another thread registered first; retry as an update.
continue;
}
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
? receivedAt
@@ -163,10 +188,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation(
"Central health aggregator started, offline timeout {Timeout}s",
_options.OfflineTimeout.TotalSeconds);
"Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
_options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);
// Check at half the offline timeout interval for timely detection
// Check at half of OfflineTimeout (the shorter of the two timeouts; CentralOfflineTimeout is the longer grace window) for timely detection
var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
using var timer = new PeriodicTimer(checkInterval);
@@ -189,8 +214,17 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
// healthy site node (cadence owned by Cluster Infrastructure /
// SiteCommunicationActor), so OfflineTimeout only fires when no
// node can reach central, not during single-node failovers.
//
// The synthetic "central" site has no heartbeat source — its only
// signal is the 30s CentralHealthReportLoop self-report — so it gets
// a longer grace window (CentralOfflineTimeout) to survive a single
// skipped/late self-report.
var timeout = kvp.Key == CentralHealthReportLoop.CentralSiteId
? _options.CentralOfflineTimeout
: _options.OfflineTimeout;
var elapsed = now - state.LastHeartbeatAt;
if (elapsed <= _options.OfflineTimeout)
if (elapsed <= timeout)
continue;
// Atomically swap to an offline copy. If the CAS loses to a
@@ -201,7 +235,7 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
{
_logger.LogWarning(
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
state.SiteId, elapsed.TotalSeconds, timeout.TotalSeconds);
}
}
}