fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage
This commit is contained in:
@@ -219,6 +219,94 @@ public class CentralHealthAggregatorTests
|
||||
Assert.True(final.IsOnline);
|
||||
}
|
||||
|
||||
/// <summary>
/// Regression test for HealthMonitoring-007. When a heartbeat arrives for a site
/// that has never delivered a full report — for example right after a central
/// restart/failover, when the aggregator's in-memory state is empty — the
/// aggregator must register that site as online instead of silently dropping
/// the heartbeat. Without this, reachable sites would appear as "unknown" for
/// up to a full report interval during the failover window.
/// </summary>
[Fact]
public void MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport()
{
    const string SiteId = "site-new";
    var heartbeatAt = _timeProvider.GetUtcNow();

    _aggregator.MarkHeartbeat(SiteId, heartbeatAt);

    // The site is now known and online, but has no report yet; only the
    // heartbeat timestamp has been recorded.
    var registered = _aggregator.GetSiteState(SiteId);
    Assert.NotNull(registered);
    Assert.True(registered.IsOnline);
    Assert.Null(registered.LatestReport);
    Assert.Equal(heartbeatAt, registered.LastHeartbeatAt);
}
|
||||
|
||||
/// <summary>
/// A site that has sent one report and then only heartbeats must still be
/// online when the offline sweep runs, even though the time elapsed since
/// that report exceeds the offline timeout.
/// </summary>
[Fact]
public void MarkHeartbeat_KeepsSiteOnline_BetweenReports()
{
    const string SiteId = "site-1";
    var heartbeatGap = TimeSpan.FromSeconds(45);

    _aggregator.ProcessReport(MakeReport(SiteId, 1));

    // Two 45s gaps put the clock 90s past the report — beyond the offline
    // timeout — but a heartbeat arrives at the end of each gap.
    for (var beat = 0; beat < 2; beat++)
    {
        _timeProvider.Advance(heartbeatGap);
        _aggregator.MarkHeartbeat(SiteId, _timeProvider.GetUtcNow());
    }

    _aggregator.CheckForOfflineSites();

    var afterSweep = _aggregator.GetSiteState(SiteId)!;
    Assert.True(afterSweep.IsOnline);
}
|
||||
|
||||
/// <summary>
/// A site that the offline sweep has marked offline after 61s of silence
/// must be reported online again as soon as a heartbeat is recorded for it.
/// </summary>
[Fact]
public void MarkHeartbeat_BringsOfflineSiteBackOnline()
{
    const string SiteId = "site-1";

    _aggregator.ProcessReport(MakeReport(SiteId, 1));

    // Let the site go silent for 61s and run the sweep: it must be offline.
    _timeProvider.Advance(TimeSpan.FromSeconds(61));
    _aggregator.CheckForOfflineSites();
    var whileSilent = _aggregator.GetSiteState(SiteId)!;
    Assert.False(whileSilent.IsOnline);

    // A single heartbeat is enough to flip it back to online.
    var heartbeatAt = _timeProvider.GetUtcNow();
    _aggregator.MarkHeartbeat(SiteId, heartbeatAt);
    var afterHeartbeat = _aggregator.GetSiteState(SiteId)!;
    Assert.True(afterHeartbeat.IsOnline);
}
|
||||
|
||||
/// <summary>
/// Regression test for HealthMonitoring-005. The synthetic "central" site has
/// no heartbeat source: its LastHeartbeatAt only moves when the 30s
/// CentralHealthReportLoop self-report arrives. One skipped or late
/// self-report (leader GC pause, brief stall, mid-failover) would leave it
/// without a signal for more than 60s and flap it offline while the central
/// cluster is actually healthy. The "central" entry therefore needs a longer
/// offline grace than real sites.
/// </summary>
[Fact]
public void OfflineDetection_CentralSite_HasLongerGraceThanRealSites()
{
    const string RealSiteId = "site-1";
    var centralSiteId = CentralHealthReportLoop.CentralSiteId;

    _aggregator.ProcessReport(MakeReport(centralSiteId, 1));
    _aggregator.ProcessReport(MakeReport(RealSiteId, 1));

    // 75s of silence is past the normal 60s site timeout, so a real site is
    // offline by now. Central only self-reports every 30s, so 60s is barely
    // two reports; a single missed self-report must not take it offline.
    var silence = TimeSpan.FromSeconds(75);
    _timeProvider.Advance(silence);
    _aggregator.CheckForOfflineSites();

    var realSite = _aggregator.GetSiteState(RealSiteId)!;
    Assert.False(realSite.IsOnline);

    var central = _aggregator.GetSiteState(centralSiteId)!;
    Assert.True(central.IsOnline, "central must survive a single missed self-report");
}
|
||||
|
||||
/// <summary>
/// The longer grace for the "central" site is not unlimited: after ten
/// minutes without any signal the offline sweep must mark it offline.
/// </summary>
[Fact]
public void OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss()
{
    var centralSiteId = CentralHealthReportLoop.CentralSiteId;
    _aggregator.ProcessReport(MakeReport(centralSiteId, 1));

    // Ten minutes is far outside even the central grace window, i.e. a
    // genuine total loss rather than one missed self-report.
    var outage = TimeSpan.FromMinutes(10);
    _timeProvider.Advance(outage);
    _aggregator.CheckForOfflineSites();

    var central = _aggregator.GetSiteState(centralSiteId)!;
    Assert.False(central.IsOnline);
}
|
||||
|
||||
[Fact]
|
||||
public void SequenceNumberReset_RejectedUntilExceedsPrevMax()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user