fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage

This commit is contained in:
Joseph Doherty
2026-05-16 21:11:24 -04:00
parent 2502e4d10a
commit 9f634e37c3
7 changed files with 470 additions and 29 deletions

View File

@@ -4,4 +4,17 @@ public class HealthMonitoringOptions
{
public TimeSpan ReportInterval { get; set; } = TimeSpan.FromSeconds(30);
public TimeSpan OfflineTimeout { get; set; } = TimeSpan.FromMinutes(1);
/// <summary>
/// Offline timeout applied to the synthetic "central" site only. Real sites
/// emit frequent heartbeats that keep <c>LastHeartbeatAt</c> fresh, so the
/// normal <see cref="OfflineTimeout"/> only fires on genuine total loss. The
/// "central" self-report has no heartbeat source — its only signal is the
/// 30s <see cref="CentralHealthReportLoop"/>, so a single skipped/late
/// self-report (leader GC pause, brief stall, mid-failover before the new
/// leader's loop spins up) would flap it offline under the 60s site timeout.
/// A longer central grace gives the equivalent of "one missed report" that
/// the design doc grants real sites. Default: 3x the report interval.
/// </summary>
public TimeSpan CentralOfflineTimeout { get; set; } = TimeSpan.FromMinutes(3);
}