ScadaBridge/tests/ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests/CentralHealthReportLoopTests.cs

using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;

namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;

/// <summary>
/// HealthMonitoring-009 regression: the central self-report loop had no test
/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
/// self-report generation for siteId="central", and monotonic sequence
/// assignment.
/// </summary>
public class CentralHealthReportLoopTests
{
    /// <summary>
    /// Hand-rolled <see cref="IClusterNodeProvider"/> stub. Tests assign
    /// <see cref="SelfIsPrimary"/> and <see cref="Nodes"/> directly, and
    /// <see cref="GetClusterNodes"/> hands back whatever <see cref="Nodes"/>
    /// currently holds.
    /// </summary>
    private sealed class FakeClusterNodeProvider : IClusterNodeProvider
    {
        /// <summary>
        /// Node list returned by <see cref="GetClusterNodes"/>; empty unless a
        /// test assigns one.
        /// </summary>
        public IReadOnlyList<NodeStatus> Nodes { get; set; } = [];

        /// <summary>Primary/standby flag chosen by each test.</summary>
        public bool SelfIsPrimary { get; set; }

        /// <summary>Returns the current <see cref="Nodes"/> value unchanged.</summary>
        public IReadOnlyList<NodeStatus> GetClusterNodes()
        {
            return Nodes;
        }
    }

    /// <summary>
    /// <see cref="ICentralHealthAggregator"/> test double that records every
    /// report passed to <see cref="ProcessReport"/> and stubs out the rest of
    /// the interface.
    /// </summary>
    /// <remarks>
    /// The tests poll <see cref="Processed"/> while the loop under test is
    /// still running, so <see cref="ProcessReport"/> may execute on a different
    /// thread than the reader. <see cref="List{T}"/> does not support a
    /// concurrent reader and writer, so every access goes through
    /// <c>_gate</c> and readers receive a point-in-time snapshot.
    /// </remarks>
    private sealed class RecordingAggregator : ICentralHealthAggregator
    {
        private readonly object _gate = new();
        private readonly List<SiteHealthReport> _processed = [];

        /// <summary>
        /// Snapshot of the reports recorded so far, in arrival order. Each read
        /// returns a fresh copy, so later <see cref="ProcessReport"/> calls never
        /// mutate a collection the caller is enumerating.
        /// </summary>
        public IReadOnlyList<SiteHealthReport> Processed
        {
            get
            {
                lock (_gate)
                {
                    return _processed.ToArray();
                }
            }
        }

        /// <summary>Appends <paramref name="report"/> to the recorded list.</summary>
        public void ProcessReport(SiteHealthReport report)
        {
            lock (_gate)
            {
                _processed.Add(report);
            }
        }

        /// <summary>No-op: heartbeats are not recorded by this double.</summary>
        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }

        /// <summary>Always returns a new, empty dictionary.</summary>
        public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
            new Dictionary<string, SiteHealthState>();

        /// <summary>Always returns <c>null</c>.</summary>
        public SiteHealthState? GetSiteState(string siteId) => null;
    }

    /// <summary>
    /// HealthMonitoring-022 de-flake helper: starts <paramref name="loop"/>,
    /// polls <paramref name="condition"/> every 25 ms until it returns
    /// <c>true</c> or the poll budget runs out, then stops the loop.
    /// </summary>
    /// <remarks>
    /// <para>
    /// The poll budget is <paramref name="maxWait"/>, or 5 s when omitted. The
    /// token handed to <c>StartAsync</c> is cancelled one second after that
    /// budget as a hard backstop. An <see cref="OperationCanceledException"/>
    /// raised anywhere in the start/poll/stop sequence is swallowed; callers
    /// are expected to assert on the observed state afterwards.
    /// </para>
    /// <para>
    /// Rationale carried over from HealthMonitoring-022: the loop's cadence is
    /// driven by a real <see cref="PeriodicTimer"/>, so these tests wait on the
    /// wall clock. Each test sets <c>ReportInterval</c> to 50 ms, so the
    /// condition is normally met quickly, while a slow CI runner gets the full
    /// budget instead of failing on a short fixed delay.
    /// </para>
    /// <para>
    /// NOTE(review): <see cref="PeriodicTimer"/> has a constructor that accepts
    /// a <see cref="TimeProvider"/> on .NET 8 and later; if the production loop
    /// adopted it, these waits could presumably be replaced by a virtual clock
    /// — confirm against the loop implementation.
    /// </para>
    /// </remarks>
    /// <param name="loop">The loop instance to start and stop.</param>
    /// <param name="condition">Predicate polled while the loop runs.</param>
    /// <param name="maxWait">Optional poll budget; defaults to 5 s.</param>
    private static async Task RunLoopUntil(
        CentralHealthReportLoop loop,
        Func<bool> condition,
        TimeSpan? maxWait = null)
    {
        var pollBudget = maxWait ?? TimeSpan.FromSeconds(5);
        using var hardStop = new CancellationTokenSource(pollBudget + TimeSpan.FromSeconds(1));

        try
        {
            await loop.StartAsync(hardStop.Token);

            var elapsed = System.Diagnostics.Stopwatch.StartNew();
            while (true)
            {
                // Budget is checked first so an exhausted budget never
                // triggers one more evaluation of the condition.
                if (elapsed.Elapsed >= pollBudget || condition())
                {
                    break;
                }

                await Task.Delay(25, CancellationToken.None);
            }

            await loop.StopAsync(CancellationToken.None);
        }
        catch (OperationCanceledException)
        {
            // Cancellation here only means the backstop fired; the caller's
            // assertions decide whether the test passes.
        }
    }

    /// <summary>
    /// Starts <paramref name="loop"/>, lets it run for a fixed wall-clock
    /// window, then stops it. Intended for tests that assert something did
    /// <i>not</i> happen (e.g. no reports were produced), where there is no
    /// condition to poll for.
    /// </summary>
    /// <remarks>
    /// <paramref name="runForMs"/> is treated as a request with a floor: any
    /// value below 1000 ms is raised to 1000 ms so the window stays generous
    /// on slow CI runners (same motivation as <see cref="RunLoopUntil"/>).
    /// The token handed to <c>StartAsync</c> is cancelled one second after the
    /// window, and an <see cref="OperationCanceledException"/> from the
    /// sequence is swallowed.
    /// </remarks>
    /// <param name="loop">The loop instance to start and stop.</param>
    /// <param name="runForMs">Requested run time in milliseconds (minimum 1000).</param>
    private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
    {
        var windowMs = runForMs < 1000 ? 1000 : runForMs;
        var backstop = TimeSpan.FromMilliseconds(windowMs + 1000);
        using var hardStop = new CancellationTokenSource(backstop);

        try
        {
            await loop.StartAsync(hardStop.Token);
            await Task.Delay(windowMs, CancellationToken.None);
            await loop.StopAsync(CancellationToken.None);
        }
        catch (OperationCanceledException)
        {
            // Backstop cancellation is not a failure; the caller asserts on state.
        }
    }

    /// <summary>
    /// With <c>SelfIsPrimary</c> set, the loop must deliver at least one report
    /// to the aggregator, and every delivered report must carry
    /// <see cref="CentralHealthReportLoop.CentralSiteId"/> as its site id.
    /// </summary>
    [Fact]
    public async Task GeneratesCentralReports_WhenSelfIsPrimary()
    {
        var recorder = new RecordingAggregator();
        var primary = new FakeClusterNodeProvider { SelfIsPrimary = true };
        var fastInterval = new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        };
        var sut = new CentralHealthReportLoop(
            new SiteHealthCollector(),
            recorder,
            primary,
            Options.Create(fastInterval),
            NullLogger<CentralHealthReportLoop>.Instance);

        // HealthMonitoring-022: poll (up to the helper's 5 s default) for the
        // first report instead of sleeping a fixed amount; slow CI runners
        // simply take longer to satisfy the condition.
        await RunLoopUntil(sut, () => recorder.Processed.Count > 0);

        var delivered = recorder.Processed;
        Assert.NotEmpty(delivered);
        Assert.All(
            delivered,
            report => Assert.Equal(CentralHealthReportLoop.CentralSiteId, report.SiteId));
    }

    /// <summary>
    /// With <c>SelfIsPrimary</c> cleared, the loop must not hand any report to
    /// the aggregator for the whole time it runs.
    /// </summary>
    [Fact]
    public async Task GeneratesNoReports_WhenNotPrimary()
    {
        var recorder = new RecordingAggregator();
        var standby = new FakeClusterNodeProvider { SelfIsPrimary = false };
        var fastInterval = new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        };
        var sut = new CentralHealthReportLoop(
            new SiteHealthCollector(),
            recorder,
            standby,
            Options.Create(fastInterval),
            NullLogger<CentralHealthReportLoop>.Instance);

        // The helper raises the requested 250 ms to its 1 s minimum, giving
        // the 50 ms interval many chances to (wrongly) emit a report.
        await RunLoopBriefly(sut, 250);

        Assert.Empty(recorder.Processed);
    }

    /// <summary>
    /// Successive reports produced by the loop must carry strictly increasing
    /// <c>SequenceNumber</c> values.
    /// </summary>
    [Fact]
    public async Task AssignsMonotonicSequenceNumbers()
    {
        var recorder = new RecordingAggregator();
        var primary = new FakeClusterNodeProvider { SelfIsPrimary = true };
        var fastInterval = new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        };
        var sut = new CentralHealthReportLoop(
            new SiteHealthCollector(),
            recorder,
            primary,
            Options.Create(fastInterval),
            NullLogger<CentralHealthReportLoop>.Instance);

        // HealthMonitoring-022: poll until two reports exist (helper default:
        // up to 5 s) rather than relying on a fixed 300 ms window that a slow
        // CI runner could exhaust before the second tick.
        await RunLoopUntil(sut, () => recorder.Processed.Count >= 2);

        var reports = recorder.Processed;
        Assert.True(reports.Count >= 2,
            $"Expected at least 2 reports, got {reports.Count}");

        // Compare every report with its predecessor.
        var index = 1;
        while (index < reports.Count)
        {
            var earlier = reports[index - 1];
            var later = reports[index];
            Assert.True(
                earlier.SequenceNumber < later.SequenceNumber,
                $"Sequence numbers not strictly increasing at index {index}");
            index++;
        }
    }

    /// <summary>
    /// HealthMonitoring-006 regression: a freshly constructed loop must expose
    /// a <c>CurrentSequenceNumber</c> equal to the Unix-millisecond value of
    /// the instant supplied by the injected <see cref="TimeProvider"/>, rather
    /// than a value read from <c>DateTimeOffset.UtcNow</c> at field
    /// initialization. Pinning the clock makes the seed deterministic.
    /// </summary>
    [Fact]
    public void SequenceNumberSeed_UsesInjectedTimeProvider()
    {
        var seedInstant = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
        var expectedSeed = seedInstant.ToUnixTimeMilliseconds();
        var pinnedClock = new TestTimeProvider(seedInstant);

        var sut = new CentralHealthReportLoop(
            new SiteHealthCollector(),
            new RecordingAggregator(),
            new FakeClusterNodeProvider { SelfIsPrimary = true },
            Options.Create(new HealthMonitoringOptions()),
            NullLogger<CentralHealthReportLoop>.Instance,
            pinnedClock);

        Assert.Equal(expectedSeed, sut.CurrentSequenceNumber);
    }

    /// <summary>
    /// HealthMonitoring-018 regression: if
    /// <see cref="ICentralHealthAggregator.ProcessReport"/> throws, the
    /// per-interval counters that <see cref="SiteHealthCollector.CollectReport"/>
    /// drained for that report must be put back into the shared collector, so
    /// the next report carries them instead of silently dropping them. This
    /// mirrors the HealthMonitoring-017 fix in <see cref="HealthReportSender"/>.
    /// </summary>
    [Fact]
    public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
    {
        // Seed each per-interval counter with a distinct, non-zero count so
        // the restore path is checked field by field. Expected flow: the
        // first iteration drains these via CollectReport, ProcessReport
        // throws, and the loop restores them for the next iteration.
        var collector = new SiteHealthCollector();
        for (var n = 0; n < 2; n++)
        {
            collector.IncrementScriptError();
        }
        collector.IncrementAlarmError();
        for (var n = 0; n < 3; n++)
        {
            collector.IncrementDeadLetter();
        }
        collector.IncrementSiteAuditWriteFailures();
        for (var n = 0; n < 2; n++)
        {
            collector.IncrementAuditRedactionFailure();
        }

        var flakyAggregator = new FailingThenSucceedingAggregator();
        var primary = new FakeClusterNodeProvider { SelfIsPrimary = true };
        var fastInterval = new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        };
        var sut = new CentralHealthReportLoop(
            collector,
            flakyAggregator,
            primary,
            Options.Create(fastInterval),
            NullLogger<CentralHealthReportLoop>.Instance);

        // HealthMonitoring-022: the aggregator rejects its first call and
        // records from the second onwards, so one recorded report means the
        // retry happened. Poll for it (helper default: up to 5 s) rather than
        // using a fixed 450 ms budget.
        await RunLoopUntil(sut, () => flakyAggregator.Processed.Count > 0);

        // The first report that got through must include everything counted
        // during the interval whose delivery failed.
        var recorded = flakyAggregator.Processed;
        Assert.NotEmpty(recorded);
        var recovered = recorded[0];
        Assert.Equal(2, recovered.ScriptErrorCount);
        Assert.Equal(1, recovered.AlarmEvaluationErrorCount);
        Assert.Equal(3, recovered.DeadLetterCount);
        Assert.Equal(1, recovered.SiteAuditWriteFailures);
        Assert.Equal(2, recovered.AuditRedactionFailure);
    }

    /// <summary>
    /// <see cref="ICentralHealthAggregator"/> whose first <c>ProcessReport</c>
    /// call throws <see cref="InvalidOperationException"/> and whose later
    /// calls record the report. Used by
    /// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>
    /// to verify the HealthMonitoring-018 restore-on-failure path.
    /// </summary>
    /// <remarks>
    /// The test polls <see cref="Processed"/> while the loop is still running,
    /// so <see cref="ProcessReport"/> may execute on a different thread than
    /// the reader. <see cref="List{T}"/> does not support a concurrent reader
    /// and writer, so the list is guarded by <c>_gate</c> and readers receive
    /// a point-in-time snapshot.
    /// </remarks>
    private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
    {
        private readonly object _gate = new();
        private readonly List<SiteHealthReport> _processed = [];
        private int _callCount;

        /// <summary>
        /// Snapshot of the reports accepted so far (the rejected first call is
        /// never included), in arrival order. Each read returns a fresh copy.
        /// </summary>
        public IReadOnlyList<SiteHealthReport> Processed
        {
            get
            {
                lock (_gate)
                {
                    return _processed.ToArray();
                }
            }
        }

        /// <summary>
        /// Throws on the first invocation only; every later invocation records
        /// <paramref name="report"/>.
        /// </summary>
        /// <exception cref="InvalidOperationException">On the first call.</exception>
        public void ProcessReport(SiteHealthReport report)
        {
            // Atomic increment so exactly one caller ever observes 1, even if
            // calls were to overlap.
            if (Interlocked.Increment(ref _callCount) == 1)
            {
                throw new InvalidOperationException("aggregator temporarily unavailable");
            }

            lock (_gate)
            {
                _processed.Add(report);
            }
        }

        /// <summary>No-op: heartbeats are not recorded by this double.</summary>
        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }

        /// <summary>Always returns a new, empty dictionary.</summary>
        public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
            new Dictionary<string, SiteHealthState>();

        /// <summary>Always returns <c>null</c>.</summary>
        public SiteHealthState? GetSiteState(string siteId) => null;
    }

    /// <summary>
    /// A standby node must still have its role pushed to the collector, so the
    /// standby's own node card shows the correct role even though it emits no
    /// reports.
    /// </summary>
    [Fact]
    public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
    {
        var collector = new SiteHealthCollector();
        var standby = new FakeClusterNodeProvider { SelfIsPrimary = false };
        var fastInterval = new HealthMonitoringOptions
        {
            ReportInterval = TimeSpan.FromMilliseconds(50)
        };
        var sut = new CentralHealthReportLoop(
            collector,
            new RecordingAggregator(),
            standby,
            Options.Create(fastInterval),
            NullLogger<CentralHealthReportLoop>.Instance);

        // The helper raises the requested 150 ms to its 1 s minimum.
        await RunLoopBriefly(sut, 150);

        // NOTE(review): if a fresh SiteHealthCollector already reports
        // IsActiveNode == false, this assertion also passes when the loop never
        // touches the flag — confirm the default, or seed the collector as
        // active before running the loop.
        Assert.False(collector.IsActiveNode);
    }
}