fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage
This commit is contained in:
@@ -103,17 +103,42 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||
/// SiteHealthReport. Heartbeats from sites we have not yet received a
|
||||
/// full report from are ignored — registration only happens on report.
|
||||
/// The update is an atomic compare-and-swap of the immutable state.
|
||||
/// Bumps the last-seen timestamp for a site. If a heartbeat arrives for a
|
||||
/// site the aggregator has no state for yet (e.g. immediately after a central
|
||||
/// restart/failover, when in-memory state is empty), the site is registered
|
||||
/// as online with no <see cref="SiteHealthState.LatestReport"/> — heartbeats
|
||||
/// prove the site is reachable, so it shows online straight away rather than
|
||||
/// as "unknown" for up to a full report interval. The update is an atomic
|
||||
/// compare-and-swap of the immutable state.
|
||||
/// </summary>
|
||||
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
if (!_siteStates.TryGetValue(siteId, out var existing))
|
||||
return;
|
||||
{
|
||||
// Unknown site — register it as online, awaiting its first
|
||||
// full report. LatestReport stays null until ProcessReport runs.
|
||||
var registered = new SiteHealthState
|
||||
{
|
||||
SiteId = siteId,
|
||||
LatestReport = null,
|
||||
LastReportReceivedAt = default,
|
||||
LastHeartbeatAt = receivedAt,
|
||||
LastSequenceNumber = 0,
|
||||
IsOnline = true
|
||||
};
|
||||
|
||||
if (_siteStates.TryAdd(siteId, registered))
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Site {SiteId} registered online via heartbeat (awaiting first report)", siteId);
|
||||
return;
|
||||
}
|
||||
|
||||
// Lost the race — another thread registered first; retry as an update.
|
||||
continue;
|
||||
}
|
||||
|
||||
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
|
||||
? receivedAt
|
||||
@@ -163,10 +188,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Central health aggregator started, offline timeout {Timeout}s",
|
||||
_options.OfflineTimeout.TotalSeconds);
|
||||
"Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
|
||||
_options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);
|
||||
|
||||
// Check at half the offline timeout interval for timely detection
|
||||
// Check at half the site OfflineTimeout (the shorter of the two timeouts under default settings) for timely detection
|
||||
var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
|
||||
using var timer = new PeriodicTimer(checkInterval);
|
||||
|
||||
@@ -189,8 +214,17 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
// healthy site node (cadence owned by Cluster Infrastructure /
|
||||
// SiteCommunicationActor), so OfflineTimeout only fires when no
|
||||
// node can reach central, not during single-node failovers.
|
||||
//
|
||||
// The synthetic "central" site has no heartbeat source — its only
|
||||
// signal is the 30s CentralHealthReportLoop self-report — so it gets
|
||||
// a longer grace window (CentralOfflineTimeout) to survive a single
|
||||
// skipped/late self-report.
|
||||
var timeout = kvp.Key == CentralHealthReportLoop.CentralSiteId
|
||||
? _options.CentralOfflineTimeout
|
||||
: _options.OfflineTimeout;
|
||||
|
||||
var elapsed = now - state.LastHeartbeatAt;
|
||||
if (elapsed <= _options.OfflineTimeout)
|
||||
if (elapsed <= timeout)
|
||||
continue;
|
||||
|
||||
// Atomically swap to an offline copy. If the CAS loses to a
|
||||
@@ -201,7 +235,7 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
|
||||
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
|
||||
state.SiteId, elapsed.TotalSeconds, timeout.TotalSeconds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,4 +4,17 @@ public class HealthMonitoringOptions
|
||||
{
|
||||
public TimeSpan ReportInterval { get; set; } = TimeSpan.FromSeconds(30);
|
||||
public TimeSpan OfflineTimeout { get; set; } = TimeSpan.FromMinutes(1);
|
||||
|
||||
/// <summary>
|
||||
/// Offline timeout applied to the synthetic "central" site only. Real sites
|
||||
/// emit frequent heartbeats that keep <c>LastHeartbeatAt</c> fresh, so the
|
||||
/// normal <see cref="OfflineTimeout"/> only fires on genuine total loss. The
|
||||
/// "central" self-report has no heartbeat source — its only signal is the
|
||||
/// 30s <see cref="CentralHealthReportLoop"/>, so a single skipped/late
|
||||
/// self-report (leader GC pause, brief stall, mid-failover before the new
|
||||
/// leader's loop spins up) would flap it offline under the 60s site timeout.
|
||||
/// A longer central grace gives the equivalent of "one missed report" that
|
||||
/// the design doc grants real sites. Default: 3 minutes, i.e. 3x the default <see cref="OfflineTimeout"/> (6x the 30s report interval).
|
||||
/// </summary>
|
||||
public TimeSpan CentralOfflineTimeout { get; set; } = TimeSpan.FromMinutes(3);
|
||||
}
|
||||
|
||||
@@ -11,10 +11,13 @@ public interface ICentralHealthAggregator
|
||||
void ProcessReport(SiteHealthReport report);
|
||||
|
||||
/// <summary>
|
||||
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||
/// SiteHealthReport. Used to keep a site marked online between full
|
||||
/// 30s reports when ~2s heartbeats are arriving — protects against the
|
||||
/// 60s offline threshold firing on a transiently delayed report.
|
||||
/// Bumps the last-seen timestamp for a site, keeping it marked online
|
||||
/// between full 30s reports when heartbeats are arriving — protects against
|
||||
/// the offline threshold firing on a transiently delayed report. A heartbeat
|
||||
/// for a site with no aggregator state yet (e.g. just after a central
|
||||
/// restart/failover) registers that site as online with no
|
||||
/// <see cref="SiteHealthState.LatestReport"/>, so reachable sites are not
|
||||
/// shown as "unknown" during the failover window.
|
||||
/// </summary>
|
||||
void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user