fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage

This commit is contained in:
Joseph Doherty
2026-05-16 21:11:24 -04:00
parent 2502e4d10a
commit 9f634e37c3
7 changed files with 470 additions and 29 deletions

View File

@@ -219,6 +219,94 @@ public class CentralHealthAggregatorTests
Assert.True(final.IsOnline);
}
/// <summary>
/// Regression guard for HealthMonitoring-007. A heartbeat may arrive for a site
/// that the aggregator has not yet received a full report from (for example
/// right after a central restart/failover, when its in-memory state is empty).
/// That heartbeat must register the site as online instead of being dropped;
/// otherwise a reachable site would show as "unknown" for up to a full report
/// interval during the failover window.
/// </summary>
[Fact]
public void MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport()
{
    // Arrange: a site id the aggregator has never seen.
    const string siteId = "site-new";
    var heartbeatAt = _timeProvider.GetUtcNow();

    // Act
    _aggregator.MarkHeartbeat(siteId, heartbeatAt);

    // Assert: the site exists, carries the heartbeat timestamp, has no report
    // yet, and is considered online.
    var registered = _aggregator.GetSiteState(siteId);
    Assert.NotNull(registered);
    Assert.Equal(heartbeatAt, registered.LastHeartbeatAt);
    Assert.Null(registered.LatestReport);
    Assert.True(registered.IsOnline);
}
/// <summary>
/// A site that keeps sending heartbeats must stay online even when no new full
/// report arrives: after one report, two heartbeats spaced 45s apart are
/// followed by an offline sweep, and the site must still be online.
/// </summary>
[Fact]
public void MarkHeartbeat_KeepsSiteOnline_BetweenReports()
{
    const string siteId = "site-1";
    var heartbeatGap = TimeSpan.FromSeconds(45);
    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // Total elapsed time (2 x 45s) goes past the offline timeout, but a
    // heartbeat lands at the end of each gap.
    for (var beat = 0; beat < 2; beat++)
    {
        _timeProvider.Advance(heartbeatGap);
        _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
    }

    _aggregator.CheckForOfflineSites();

    var siteState = _aggregator.GetSiteState(siteId);
    Assert.True(siteState!.IsOnline);
}
/// <summary>
/// A site that the offline sweep has marked offline must flip back to online
/// as soon as a heartbeat for it is recorded.
/// </summary>
[Fact]
public void MarkHeartbeat_BringsOfflineSiteBackOnline()
{
    const string siteId = "site-1";
    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // 61s of silence followed by a sweep: the site is expected to be offline.
    _timeProvider.Advance(TimeSpan.FromSeconds(61));
    _aggregator.CheckForOfflineSites();
    var beforeHeartbeat = _aggregator.GetSiteState(siteId);
    Assert.False(beforeHeartbeat!.IsOnline);

    // A single heartbeat must bring it back.
    _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
    var afterHeartbeat = _aggregator.GetSiteState(siteId);
    Assert.True(afterHeartbeat!.IsOnline);
}
/// <summary>
/// Regression guard for HealthMonitoring-005. The synthetic "central" site has
/// no heartbeat source; its LastHeartbeatAt only moves when the 30s
/// CentralHealthReportLoop self-report runs. One skipped or late self-report
/// (leader GC pause, brief stall, mid-failover) would leave it without a signal
/// for more than 60s and flap it offline while the central cluster is healthy.
/// The "central" entry therefore needs a longer offline grace than real sites.
/// </summary>
[Fact]
public void OfflineDetection_CentralSite_HasLongerGraceThanRealSites()
{
    const string realSiteId = "site-1";
    var centralSiteId = CentralHealthReportLoop.CentralSiteId;

    _aggregator.ProcessReport(MakeReport(centralSiteId, 1));
    _aggregator.ProcessReport(MakeReport(realSiteId, 1));

    // 75s of silence: beyond the normal 60s site timeout, and equivalent to
    // central missing a single ~30s self-report. A real site is offline at
    // this point; central must not be.
    _timeProvider.Advance(TimeSpan.FromSeconds(75));
    _aggregator.CheckForOfflineSites();

    var realSite = _aggregator.GetSiteState(realSiteId);
    Assert.False(realSite!.IsOnline);

    var central = _aggregator.GetSiteState(centralSiteId);
    Assert.True(central!.IsOnline, "central must survive a single missed self-report");
}
/// <summary>
/// The extended grace for the "central" entry must not make it immortal:
/// after ten minutes without any self-report the offline sweep must mark it
/// offline.
/// </summary>
[Fact]
public void OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss()
{
    var centralSiteId = CentralHealthReportLoop.CentralSiteId;
    _aggregator.ProcessReport(MakeReport(centralSiteId, 1));

    // Ten minutes of silence is far past the central grace window.
    var silence = TimeSpan.FromMinutes(10);
    _timeProvider.Advance(silence);
    _aggregator.CheckForOfflineSites();

    var central = _aggregator.GetSiteState(centralSiteId);
    Assert.False(central!.IsOnline);
}
[Fact]
public void SequenceNumberReset_RejectedUntilExceedsPrevMax()
{

View File

@@ -0,0 +1,134 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Messages.Health;
namespace ScadaLink.HealthMonitoring.Tests;
/// <summary>
/// HealthMonitoring-009 regression: the central self-report loop had no test
/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
/// self-report generation for siteId="central", and monotonic sequence
/// assignment.
/// </summary>
public class CentralHealthReportLoopTests
{
    /// <summary>
    /// Report interval handed to the loop in every test. Kept short so that
    /// several ticks fit inside the brief real-time run windows used below.
    /// </summary>
    private static readonly TimeSpan TestReportInterval = TimeSpan.FromMilliseconds(50);

    /// <summary>
    /// Cluster node provider stub whose primary flag and node list are set
    /// directly by the test.
    /// </summary>
    private sealed class FakeClusterNodeProvider : IClusterNodeProvider
    {
        public bool SelfIsPrimary { get; set; }
        public IReadOnlyList<NodeStatus> Nodes { get; set; } = [];
        public IReadOnlyList<NodeStatus> GetClusterNodes() => Nodes;
    }

    /// <summary>
    /// Aggregator stub that records every report passed to
    /// <see cref="ProcessReport"/>; all other members are inert.
    /// </summary>
    private sealed class RecordingAggregator : ICentralHealthAggregator
    {
        public List<SiteHealthReport> Processed { get; } = [];
        public void ProcessReport(SiteHealthReport report) => Processed.Add(report);
        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }
        public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
            new Dictionary<string, SiteHealthState>();
        public SiteHealthState? GetSiteState(string siteId) => null;
    }

    /// <summary>
    /// Builds a loop wired to the given collector and aggregator, a fake
    /// cluster node provider reporting <paramref name="selfIsPrimary"/>, the
    /// shared <see cref="TestReportInterval"/>, and a null logger.
    /// </summary>
    /// <param name="collector">Collector the loop reads from and updates.</param>
    /// <param name="aggregator">Recording aggregator that receives generated reports.</param>
    /// <param name="selfIsPrimary">Value the fake provider returns for SelfIsPrimary.</param>
    /// <returns>A loop instance that has not been started.</returns>
    private static CentralHealthReportLoop CreateLoop(
        SiteHealthCollector collector,
        RecordingAggregator aggregator,
        bool selfIsPrimary)
    {
        var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = selfIsPrimary };
        var options = Options.Create(new HealthMonitoringOptions
        {
            ReportInterval = TestReportInterval
        });

        return new CentralHealthReportLoop(
            collector, aggregator, clusterNodes, options,
            NullLogger<CentralHealthReportLoop>.Instance);
    }

    /// <summary>
    /// Starts the loop, lets it run for <paramref name="runForMs"/> milliseconds
    /// of real time, then stops it. The token passed to StartAsync cancels
    /// 100ms after the run window as a safety net; an
    /// <see cref="OperationCanceledException"/> raised along the way is swallowed.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this relies on wall-clock delays, so the tests can be
    /// timing-sensitive on a heavily loaded machine.
    /// </remarks>
    private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(runForMs + 100));
        try
        {
            await loop.StartAsync(cts.Token);
            await Task.Delay(runForMs, CancellationToken.None);
            await loop.StopAsync(CancellationToken.None);
        }
        catch (OperationCanceledException) { }
    }

    /// <summary>
    /// When this node is primary, the loop must produce reports and every one
    /// must carry the central site id.
    /// </summary>
    [Fact]
    public async Task GeneratesCentralReports_WhenSelfIsPrimary()
    {
        var aggregator = new RecordingAggregator();
        var loop = CreateLoop(new SiteHealthCollector(), aggregator, selfIsPrimary: true);

        await RunLoopBriefly(loop, 250);

        Assert.NotEmpty(aggregator.Processed);
        Assert.All(aggregator.Processed,
            r => Assert.Equal(CentralHealthReportLoop.CentralSiteId, r.SiteId));
    }

    /// <summary>
    /// When this node is not primary, the loop must not hand any report to the
    /// aggregator.
    /// </summary>
    [Fact]
    public async Task GeneratesNoReports_WhenNotPrimary()
    {
        var aggregator = new RecordingAggregator();
        var loop = CreateLoop(new SiteHealthCollector(), aggregator, selfIsPrimary: false);

        await RunLoopBriefly(loop, 250);

        Assert.Empty(aggregator.Processed);
    }

    /// <summary>
    /// Consecutive self-reports must carry strictly increasing sequence numbers.
    /// </summary>
    [Fact]
    public async Task AssignsMonotonicSequenceNumbers()
    {
        var aggregator = new RecordingAggregator();
        var loop = CreateLoop(new SiteHealthCollector(), aggregator, selfIsPrimary: true);

        await RunLoopBriefly(loop, 300);

        Assert.True(aggregator.Processed.Count >= 2,
            $"Expected at least 2 reports, got {aggregator.Processed.Count}");
        for (int i = 1; i < aggregator.Processed.Count; i++)
        {
            Assert.True(
                aggregator.Processed[i].SequenceNumber > aggregator.Processed[i - 1].SequenceNumber,
                $"Sequence numbers not strictly increasing at index {i}");
        }
    }

    /// <summary>
    /// The loop must still report the node's role to the collector when it is
    /// the standby, so the standby's own node card shows the correct role.
    /// </summary>
    [Fact]
    public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
    {
        var collector = new SiteHealthCollector();
        // Prime the flag with the opposite value. Without this, the final
        // assertion could pass on a collector the loop never touched (if the
        // flag simply starts out false), so the test would prove nothing.
        collector.SetActiveNode(true);
        var loop = CreateLoop(collector, new RecordingAggregator(), selfIsPrimary: false);

        await RunLoopBriefly(loop, 150);

        Assert.False(collector.IsActiveNode);
    }
}

View File

@@ -138,6 +138,111 @@ public class SiteHealthCollectorTests
Assert.Equal(0, report.SequenceNumber);
}
// HealthMonitoring-009 regression: the remaining collector setters had no
// "reflected in report" coverage. The following tests verify each setter's
// value reaches CollectReport output.
/// <summary>
/// Cluster nodes handed to the collector must appear in the collected report,
/// with the same count and with the first node's hostname intact.
/// </summary>
[Fact]
public void SetClusterNodes_ReflectedInReport()
{
    // Arrange: one active and one standby node.
    var activeNode = new ScadaLink.Commons.Messages.Health.NodeStatus("node-a", true, "Active");
    var standbyNode = new ScadaLink.Commons.Messages.Health.NodeStatus("node-b", true, "Standby");
    var clusterNodes = new List<ScadaLink.Commons.Messages.Health.NodeStatus>
    {
        activeNode,
        standbyNode
    };

    // Act
    _collector.SetClusterNodes(clusterNodes);
    var snapshot = _collector.CollectReport("site-1");

    // Assert
    Assert.NotNull(snapshot.ClusterNodes);
    var reportedNodes = snapshot.ClusterNodes!;
    Assert.Equal(2, reportedNodes.Count);
    Assert.Equal("node-a", reportedNodes[0].Hostname);
}
/// <summary>
/// Deployed, enabled and disabled instance counts set on the collector must
/// each show up in the matching field of the collected report.
/// </summary>
[Fact]
public void SetInstanceCounts_ReflectedInReport()
{
    // Arrange
    _collector.SetInstanceCounts(deployed: 10, enabled: 7, disabled: 3);

    // Act
    var snapshot = _collector.CollectReport("site-1");

    // Assert
    Assert.Equal(10, snapshot.DeployedInstanceCount);
    Assert.Equal(7, snapshot.EnabledInstanceCount);
    Assert.Equal(3, snapshot.DisabledInstanceCount);
}
/// <summary>
/// The parked message count set on the collector must be carried into the
/// collected report unchanged.
/// </summary>
[Fact]
public void SetParkedMessageCount_ReflectedInReport()
{
    // Arrange
    _collector.SetParkedMessageCount(42);

    // Act
    var snapshot = _collector.CollectReport("site-1");

    // Assert
    Assert.Equal(42, snapshot.ParkedMessageCount);
}
/// <summary>
/// The node hostname set on the collector must be the hostname reported by
/// the collected report.
/// </summary>
[Fact]
public void SetNodeHostname_ReflectedInReport()
{
    // Arrange
    const string hostname = "site-host-1";
    _collector.SetNodeHostname(hostname);

    // Act
    var snapshot = _collector.CollectReport("site-1");

    // Assert
    Assert.Equal(hostname, snapshot.NodeHostname);
}
/// <summary>
/// Toggling the active-node flag must change both the NodeRole string in the
/// collected report ("Active" / "Standby") and the collector's IsActiveNode
/// property.
/// </summary>
[Fact]
public void SetActiveNode_ReflectedInNodeRole()
{
    // Active: the report says "Active" and the flag is set.
    _collector.SetActiveNode(true);
    var reportWhileActive = _collector.CollectReport("site-1");
    Assert.Equal("Active", reportWhileActive.NodeRole);
    Assert.True(_collector.IsActiveNode);

    // Standby: the report says "Standby" and the flag is cleared.
    _collector.SetActiveNode(false);
    var reportWhileStandby = _collector.CollectReport("site-1");
    Assert.Equal("Standby", reportWhileStandby.NodeRole);
    Assert.False(_collector.IsActiveNode);
}
/// <summary>
/// Good/bad/uncertain tag quality counts recorded for a data connection must
/// be retrievable from the collected report under that connection's name.
/// </summary>
[Fact]
public void UpdateTagQuality_ReflectedInReport()
{
    // Arrange
    const string connectionName = "opc-1";
    _collector.UpdateTagQuality(connectionName, good: 80, bad: 15, uncertain: 5);

    // Act
    var snapshot = _collector.CollectReport("site-1");

    // Assert
    Assert.NotNull(snapshot.DataConnectionTagQuality);
    var reportedQuality = snapshot.DataConnectionTagQuality![connectionName];
    Assert.Equal(80, reportedQuality.Good);
    Assert.Equal(15, reportedQuality.Bad);
    Assert.Equal(5, reportedQuality.Uncertain);
}
/// <summary>
/// The endpoint recorded for a data connection must be retrievable from the
/// collected report under that connection's name.
/// </summary>
[Fact]
public void UpdateConnectionEndpoint_ReflectedInReport()
{
    // Arrange
    const string connectionName = "opc-1";
    const string endpoint = "opc.tcp://plc-1:4840";
    _collector.UpdateConnectionEndpoint(connectionName, endpoint);

    // Act
    var snapshot = _collector.CollectReport("site-1");

    // Assert
    Assert.NotNull(snapshot.DataConnectionEndpoints);
    Assert.Equal(endpoint, snapshot.DataConnectionEndpoints![connectionName]);
}
/// <summary>
/// Store-and-forward buffer depths set on the collector must appear in the
/// collected report under the same keys with the same values.
/// </summary>
[Fact]
public void SetStoreAndForwardDepths_ReflectedInReport()
{
    // Arrange
    var expectedDepths = new Dictionary<string, int>
    {
        ["ExternalSystem"] = 5,
        ["Notification"] = 2
    };
    _collector.SetStoreAndForwardDepths(expectedDepths);

    // Act
    var snapshot = _collector.CollectReport("site-1");

    // Assert: every configured buffer is reported with its depth.
    foreach (var (bufferName, depth) in expectedDepths)
    {
        Assert.Equal(depth, snapshot.StoreAndForwardBufferDepths[bufferName]);
    }
}
[Fact]
public async Task ThreadSafety_ConcurrentIncrements()
{