using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Messages.Health;

namespace ScadaLink.HealthMonitoring.Tests;

// NOTE(review): several declarations in this file (IReadOnlyList, List,
// IReadOnlyDictionary, Dictionary, Func, NullLogger) carry no generic type
// arguments in this copy. They look like they were lost in extraction rather
// than omitted on purpose — confirm against the original before building.

/// <summary>
/// HealthMonitoring-009 regression: the central self-report loop had no test
/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
/// self-report generation for siteId="central", and monotonic sequence
/// assignment.
/// </summary>
public class CentralHealthReportLoopTests
{
    /// <summary>
    /// Test double for <see cref="IClusterNodeProvider"/> whose primary flag
    /// and node list are plain settable properties.
    /// </summary>
    private sealed class FakeClusterNodeProvider : IClusterNodeProvider
    {
        /// <summary>Value the loop under test reads to decide whether this node is the primary.</summary>
        public bool SelfIsPrimary { get; set; }

        /// <summary>Backing list returned by <see cref="GetClusterNodes"/>; empty by default.</summary>
        public IReadOnlyList Nodes { get; set; } = [];

        /// <summary>Returns <see cref="Nodes"/> unchanged.</summary>
        public IReadOnlyList GetClusterNodes() => Nodes;
    }

    /// <summary>
    /// Test double for <see cref="ICentralHealthAggregator"/> that records
    /// every report passed to <see cref="ProcessReport"/> and stubs out the
    /// remaining members.
    /// </summary>
    private sealed class RecordingAggregator : ICentralHealthAggregator
    {
        /// <summary>Reports received so far, in call order.</summary>
        public List Processed { get; } = [];

        /// <summary>Appends <paramref name="report"/> to <see cref="Processed"/>.</summary>
        public void ProcessReport(SiteHealthReport report) => Processed.Add(report);

        /// <summary>No-op; heartbeats are not recorded by this fake.</summary>
        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }

        /// <summary>Always returns a new, empty dictionary.</summary>
        public IReadOnlyDictionary GetAllSiteStates() => new Dictionary();

        /// <summary>Always returns <c>null</c>.</summary>
        public SiteHealthState? GetSiteState(string siteId) => null;
    }

    /// <summary>
    /// HealthMonitoring-022 de-flake: <see cref="CentralHealthReportLoop"/>'s
    /// internal cadence is a real <c>PeriodicTimer</c>, so the loop is
    /// timing-sensitive. We can't drive a virtual clock (PeriodicTimer doesn't
    /// consume <c>TimeProvider</c>) without refactoring the production
    /// loop, so we keep wall-clock waits but use a *generous* budget: a
    /// poll-until-condition wait of up to 5 s by default, instead of a
    /// fixed <c>Task.Delay</c> that fails fast on a slow CI runner. The
    /// loop's ReportInterval is set to 50 ms in each test, so under
    /// normal conditions the condition is met almost immediately; under heavy
    /// CI load the poll loop tolerates the slow tick instead of asserting on a
    /// timed-out empty list.
    /// </summary>
    /// <param name="loop">Loop to start, poll against, and stop.</param>
    /// <param name="condition">Predicate polled every 25 ms; polling ends as soon as it returns <c>true</c>.</param>
    /// <param name="maxWait">Upper bound on the polling phase; 5 seconds when <c>null</c>.</param>
    /// <returns>A task that completes once the loop has been stopped, or once an <see cref="OperationCanceledException"/> has been swallowed.</returns>
    private static async Task RunLoopUntil(
        CentralHealthReportLoop loop,
        Func condition,
        TimeSpan? maxWait = null)
    {
        var deadline = maxWait ?? TimeSpan.FromSeconds(5);

        // The token handed to StartAsync is cancelled one second after the
        // polling deadline; the poll delay and StopAsync below deliberately
        // use CancellationToken.None so they are never cut short by it.
        using var cts = new CancellationTokenSource(deadline + TimeSpan.FromSeconds(1));
        try
        {
            await loop.StartAsync(cts.Token);
            var sw = System.Diagnostics.Stopwatch.StartNew();

            // Poll until the condition holds or the deadline passes. The
            // caller's own assertions report a timeout as a test failure.
            while (sw.Elapsed < deadline && !condition())
            {
                await Task.Delay(25, CancellationToken.None);
            }
            await loop.StopAsync(CancellationToken.None);
        }
        // Cancellation surfacing from start/stop is not itself a failure here;
        // callers assert on what the aggregator/collector observed.
        catch (OperationCanceledException) { }
    }

    /// <summary>
    /// Used by tests that need the loop to run for a bounded period without
    /// waiting on a specific condition (e.g. asserting no reports were
    /// produced). The wait is generous (1 s default) — see
    /// <see cref="RunLoopUntil"/> for the rationale.
    /// </summary>
    /// <param name="loop">Loop to start, leave running, and stop.</param>
    /// <param name="runForMs">Requested run time in milliseconds; values below 1000 are raised to 1000.</param>
    /// <returns>A task that completes once the loop has been stopped, or once an <see cref="OperationCanceledException"/> has been swallowed.</returns>
    private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
    {
        // Enforce the 1 s floor regardless of what the caller asked for.
        var totalMs = Math.Max(runForMs, 1000);

        // As in RunLoopUntil, only StartAsync observes this token; it is
        // cancelled one second after the run window ends.
        using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(totalMs + 1000));
        try
        {
            await loop.StartAsync(cts.Token);
            await Task.Delay(totalMs, CancellationToken.None);
            await loop.StopAsync(CancellationToken.None);
        }
        catch (OperationCanceledException) { }
    }

    /// <summary>
    /// With SelfIsPrimary set, the loop must hand at least one report to the
    /// aggregator, and every report must carry
    /// <c>CentralHealthReportLoop.CentralSiteId</c> as its site id.
    /// </summary>
    [Fact]
    public async Task GeneratesCentralReports_WhenSelfIsPrimary()
    {
        var collector = new SiteHealthCollector();
        var aggregator = new RecordingAggregator();
        var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
        var options = Options.Create(new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        });

        var loop = new CentralHealthReportLoop(
            collector, aggregator, clusterNodes, options,
            NullLogger.Instance);

        // HealthMonitoring-022: wait up to 5 s for at least one report to fire
        // rather than fixed-budget Task.Delay; tolerates slow CI runners.
        await RunLoopUntil(loop, () => aggregator.Processed.Count >= 1);

        Assert.NotEmpty(aggregator.Processed);
        Assert.All(aggregator.Processed, r =>
            Assert.Equal(CentralHealthReportLoop.CentralSiteId, r.SiteId));
    }

    /// <summary>
    /// With SelfIsPrimary cleared, the loop must not hand any report to the
    /// aggregator during the run window.
    /// </summary>
    [Fact]
    public async Task GeneratesNoReports_WhenNotPrimary()
    {
        var collector = new SiteHealthCollector();
        var aggregator = new RecordingAggregator();
        var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
        var options = Options.Create(new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        });

        var loop = new CentralHealthReportLoop(
            collector, aggregator, clusterNodes, options,
            NullLogger.Instance);

        // The 250 ms request is raised to the helper's 1 s floor.
        await RunLoopBriefly(loop, 250);

        Assert.Empty(aggregator.Processed);
    }

    /// <summary>
    /// Consecutive reports produced by the loop must have strictly increasing
    /// sequence numbers.
    /// </summary>
    [Fact]
    public async Task AssignsMonotonicSequenceNumbers()
    {
        var collector = new SiteHealthCollector();
        var aggregator = new RecordingAggregator();
        var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
        var options = Options.Create(new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        });

        var loop = new CentralHealthReportLoop(
            collector, aggregator, clusterNodes, options,
            NullLogger.Instance);

        // HealthMonitoring-022: wait up to 5 s for at least 2 reports rather
        // than a fixed 300 ms window that could miss the second tick on a
        // slow CI runner; the assertion below proves the sequence is monotonic.
        await RunLoopUntil(loop, () => aggregator.Processed.Count >= 2);

        Assert.True(aggregator.Processed.Count >= 2,
            $"Expected at least 2 reports, got {aggregator.Processed.Count}");

        // Compare each report with its predecessor.
        for (int i = 1; i < aggregator.Processed.Count; i++)
        {
            Assert.True(
                aggregator.Processed[i].SequenceNumber > aggregator.Processed[i - 1].SequenceNumber,
                $"Sequence numbers not strictly increasing at index {i}");
        }
    }

    /// <summary>
    /// HealthMonitoring-006 regression: the central loop's sequence-number seed
    /// must be derived from the injected <c>TimeProvider</c> (Unix-ms),
    /// not from DateTimeOffset.UtcNow read at field initialization, so the
    /// seeding strategy is deterministically testable.
    /// </summary>
    [Fact]
    public void SequenceNumberSeed_UsesInjectedTimeProvider()
    {
        var fixedInstant = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
        var timeProvider = new TestTimeProvider(fixedInstant);

        var loop = new CentralHealthReportLoop(
            new SiteHealthCollector(),
            new RecordingAggregator(),
            new FakeClusterNodeProvider { SelfIsPrimary = true },
            Options.Create(new HealthMonitoringOptions()),
            NullLogger.Instance,
            timeProvider);

        // The loop is never started, so this reads the seed as constructed.
        Assert.Equal(fixedInstant.ToUnixTimeMilliseconds(), loop.CurrentSequenceNumber);
    }

    /// <summary>
    /// HealthMonitoring-018 regression: when
    /// <c>ICentralHealthAggregator.ProcessReport</c> throws, the per-interval
    /// counters that <c>CollectReport</c> just drained must be
    /// restored back into the shared collector so they roll forward into the
    /// next interval rather than being silently lost. Same shape as the
    /// HealthMonitoring-017 fix.
    /// </summary>
    [Fact]
    public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
    {
        var collector = new SiteHealthCollector();

        // Pre-populate every per-interval counter so the restore path on each
        // field is exercised. The loop's first iteration will read-and-reset
        // these via CollectReport, then ProcessReport will throw, and the
        // restore must put them back.
        collector.IncrementScriptError();
        collector.IncrementScriptError();
        collector.IncrementAlarmError();
        collector.IncrementDeadLetter();
        collector.IncrementDeadLetter();
        collector.IncrementDeadLetter();
        collector.IncrementSiteAuditWriteFailures();
        collector.IncrementAuditRedactionFailure();
        collector.IncrementAuditRedactionFailure();

        var aggregator = new FailingThenSucceedingAggregator();
        var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
        var options = Options.Create(new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        });

        var loop = new CentralHealthReportLoop(
            collector, aggregator, clusterNodes, options,
            NullLogger.Instance);

        // HealthMonitoring-022: the first ProcessReport call throws (counters
        // get restored), the second succeeds. Wait up to 5 s for that second
        // (successful) call rather than a fixed 450 ms budget.
        await RunLoopUntil(loop, () => aggregator.Processed.Count >= 1);

        // First call threw, later succeeded — the first successful report
        // must carry the previously-failed interval's accumulated counts.
        Assert.NotEmpty(aggregator.Processed);
        var firstSuccess = aggregator.Processed[0];
        Assert.Equal(2, firstSuccess.ScriptErrorCount);
        Assert.Equal(1, firstSuccess.AlarmEvaluationErrorCount);
        Assert.Equal(3, firstSuccess.DeadLetterCount);
        Assert.Equal(1, firstSuccess.SiteAuditWriteFailures);
        Assert.Equal(2, firstSuccess.AuditRedactionFailure);
    }

    /// <summary>
    /// <see cref="ICentralHealthAggregator"/> whose first ProcessReport
    /// call throws (only the first), then subsequent calls succeed. Used by
    /// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>
    /// to verify the HealthMonitoring-018 restore-on-failure path.
    /// </summary>
    private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
    {
        // Number of ProcessReport calls seen so far, including the failing one.
        private int _callCount;

        /// <summary>Reports accepted by the second and later calls, in call order.</summary>
        public List Processed { get; } = [];

        /// <summary>
        /// Throws <see cref="InvalidOperationException"/> on the first call;
        /// every later call appends <paramref name="report"/> to <see cref="Processed"/>.
        /// </summary>
        public void ProcessReport(SiteHealthReport report)
        {
            // Atomic increment so the "first call" decision is made once.
            var n = Interlocked.Increment(ref _callCount);
            if (n == 1)
            {
                throw new InvalidOperationException("aggregator temporarily unavailable");
            }
            Processed.Add(report);
        }

        /// <summary>No-op; heartbeats are not recorded by this fake.</summary>
        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }

        /// <summary>Always returns a new, empty dictionary.</summary>
        public IReadOnlyDictionary GetAllSiteStates() => new Dictionary();

        /// <summary>Always returns <c>null</c>.</summary>
        public SiteHealthState? GetSiteState(string siteId) => null;
    }

    /// <summary>
    /// With SelfIsPrimary cleared, the collector's IsActiveNode must read
    /// <c>false</c> after the loop has run.
    /// </summary>
    [Fact]
    public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
    {
        // The loop must still report the node's role to the collector when it is
        // the standby, so the standby's own node card shows the correct role.
        var collector = new SiteHealthCollector();
        var aggregator = new RecordingAggregator();
        var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
        var options = Options.Create(new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        });

        var loop = new CentralHealthReportLoop(
            collector, aggregator, clusterNodes, options,
            NullLogger.Instance);

        // The 150 ms request is raised to the helper's 1 s floor.
        await RunLoopBriefly(loop, 150);

        // NOTE(review): if SiteHealthCollector.IsActiveNode already defaults to
        // false, this assertion passes even when the loop never writes the
        // flag — confirm the default, or seed it to true before running.
        Assert.False(collector.IsActiveNode);
    }
}