refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,57 @@
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// Bundle C (M5-T7) regression coverage. The Audit Log payload filter
+/// (<c>DefaultAuditPayloadFilter</c>) increments
+/// <c>IAuditRedactionFailureCounter</c> every time a header/body/SQL-param
+/// redactor stage throws and the filter has to over-redact the field with
+/// the <c>&lt;redacted: redactor error&gt;</c> marker. Bundle C bridges that
+/// counter into the Site Health Monitoring report payload as
+/// <c>AuditRedactionFailure</c> so a misconfigured / catastrophic regex
+/// surfaces on /monitoring/health rather than disappearing into a NoOp sink.
+/// Mirrors the Bundle G <c>SiteAuditWriteFailures</c> metric shape — same
+/// per-interval increment-and-reset semantics, same defaults-to-zero
+/// contract.
+/// </summary>
+public class AuditRedactionFailureMetricTests
+{
+    private readonly SiteHealthCollector _collector = new();
+
+    /// <summary>
+    /// Three increments inside one interval surface as a count of 3 on the
+    /// next collected report.
+    /// </summary>
+    [Fact]
+    public void Increment_Three_Times_Counter_Reports_3()
+    {
+        for (var hit = 0; hit < 3; hit++)
+        {
+            _collector.IncrementAuditRedactionFailure();
+        }
+
+        var snapshot = _collector.CollectReport("site-1");
+
+        Assert.Equal(3, snapshot.AuditRedactionFailure);
+    }
+
+    /// <summary>
+    /// A collector that was never incremented reports zero for the metric.
+    /// </summary>
+    [Fact]
+    public void Report_Payload_Includes_AuditRedactionFailure_AsZeroByDefault()
+    {
+        var snapshot = _collector.CollectReport("site-1");
+
+        Assert.Equal(0, snapshot.AuditRedactionFailure);
+    }
+
+    /// <summary>
+    /// AuditRedactionFailure is an interval count, not a running total:
+    /// collecting a report drains it, matching ScriptErrorCount /
+    /// AlarmEvaluationErrorCount / DeadLetterCount / SiteAuditWriteFailures.
+    /// </summary>
+    [Fact]
+    public void CollectReport_Resets_AuditRedactionFailure()
+    {
+        for (var hit = 0; hit < 2; hit++)
+        {
+            _collector.IncrementAuditRedactionFailure();
+        }
+
+        // First collection sees both increments ...
+        var drained = _collector.CollectReport("site-1");
+        Assert.Equal(2, drained.AuditRedactionFailure);
+
+        // ... and the one right after starts from zero again.
+        var afterReset = _collector.CollectReport("site-1");
+        Assert.Equal(0, afterReset.AuditRedactionFailure);
+    }
+}
@@ -0,0 +1,426 @@
+using Microsoft.Extensions.Logging.Abstractions;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// A simple fake TimeProvider for testing that allows advancing time manually.
+/// </summary>
+internal sealed class TestTimeProvider : TimeProvider
+{
+    // The fake "now"; it only moves when Advance is called.
+    private DateTimeOffset _current;
+
+    /// <summary>Creates a provider whose clock starts at <paramref name="startTime"/>.</summary>
+    public TestTimeProvider(DateTimeOffset startTime) => _current = startTime;
+
+    /// <summary>Returns the current fake instant.</summary>
+    public override DateTimeOffset GetUtcNow()
+    {
+        return _current;
+    }
+
+    /// <summary>Shifts the fake clock by <paramref name="duration"/>.</summary>
+    public void Advance(TimeSpan duration)
+    {
+        _current = _current + duration;
+    }
+}
+
+public class CentralHealthAggregatorTests
+{
+    private readonly TestTimeProvider _timeProvider;
+    private readonly CentralHealthAggregator _aggregator;
+
+    /// <summary>
+    /// Builds a fresh aggregator per test with a 60 s site offline timeout and
+    /// a manually-advanced clock.
+    /// </summary>
+    public CentralHealthAggregatorTests()
+    {
+        var monitoringOptions = new HealthMonitoringOptions
+        {
+            OfflineTimeout = TimeSpan.FromSeconds(60)
+        };
+
+        _timeProvider = new TestTimeProvider(DateTimeOffset.UtcNow);
+        _aggregator = new CentralHealthAggregator(
+            Options.Create(monitoringOptions),
+            NullLogger<CentralHealthAggregator>.Instance,
+            _timeProvider);
+    }
+
+    /// <summary>
+    /// Builds a minimal report for <paramref name="siteId"/> carrying sequence
+    /// number <paramref name="seq"/>: empty dictionaries, all counters zero,
+    /// and a wall-clock <c>ReportTimestamp</c>.
+    /// </summary>
+    private static SiteHealthReport MakeReport(string siteId, long seq)
+    {
+        var stampedAt = DateTimeOffset.UtcNow;
+        var connections = new Dictionary<string, ConnectionHealth>();
+        var tagCounts = new Dictionary<string, TagResolutionStatus>();
+        var bufferDepths = new Dictionary<string, int>();
+
+        return new SiteHealthReport(
+            SiteId: siteId,
+            SequenceNumber: seq,
+            ReportTimestamp: stampedAt,
+            DataConnectionStatuses: connections,
+            TagResolutionCounts: tagCounts,
+            ScriptErrorCount: 0,
+            AlarmEvaluationErrorCount: 0,
+            StoreAndForwardBufferDepths: bufferDepths,
+            DeadLetterCount: 0,
+            DeployedInstanceCount: 0,
+            EnabledInstanceCount: 0,
+            DisabledInstanceCount: 0);
+    }
+
+    /// <summary>The first report for a site creates an online state entry.</summary>
+    [Fact]
+    public void ProcessReport_StoresState_ForNewSite()
+    {
+        const string siteId = "site-1";
+
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+
+        var snapshot = _aggregator.GetSiteState(siteId);
+        Assert.NotNull(snapshot);
+        Assert.True(snapshot.IsOnline);
+        Assert.Equal(1, snapshot.LastSequenceNumber);
+    }
+
+    /// <summary>A report with a higher sequence number replaces the stored one.</summary>
+    [Fact]
+    public void ProcessReport_UpdatesState_WhenSequenceIncreases()
+    {
+        foreach (var seq in new long[] { 1, 2 })
+        {
+            _aggregator.ProcessReport(MakeReport("site-1", seq));
+        }
+
+        var snapshot = _aggregator.GetSiteState("site-1")!;
+        Assert.Equal(2, snapshot.LastSequenceNumber);
+    }
+
+    /// <summary>A report with a lower sequence number than the stored one is ignored.</summary>
+    [Fact]
+    public void ProcessReport_RejectsStaleReport_WhenSequenceNotGreater()
+    {
+        foreach (var seq in new long[] { 5, 3 })
+        {
+            _aggregator.ProcessReport(MakeReport("site-1", seq));
+        }
+
+        var snapshot = _aggregator.GetSiteState("site-1")!;
+        Assert.Equal(5, snapshot.LastSequenceNumber);
+    }
+
+    /// <summary>Replaying the same sequence number leaves the stored sequence unchanged.</summary>
+    [Fact]
+    public void ProcessReport_RejectsEqualSequence()
+    {
+        for (var attempt = 0; attempt < 2; attempt++)
+        {
+            _aggregator.ProcessReport(MakeReport("site-1", 5));
+        }
+
+        var snapshot = _aggregator.GetSiteState("site-1")!;
+        Assert.Equal(5, snapshot.LastSequenceNumber);
+    }
+
+    /// <summary>A site with no signal for longer than the 60 s timeout is marked offline.</summary>
+    [Fact]
+    public void OfflineDetection_SiteGoesOffline_WhenNoReportWithinTimeout()
+    {
+        const string siteId = "site-1";
+        var justPastTimeout = TimeSpan.FromSeconds(61);
+
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+        Assert.True(_aggregator.GetSiteState(siteId)!.IsOnline);
+
+        // Move the fake clock beyond the offline timeout, then run the sweep.
+        _timeProvider.Advance(justPastTimeout);
+        _aggregator.CheckForOfflineSites();
+
+        Assert.False(_aggregator.GetSiteState(siteId)!.IsOnline);
+    }
+
+    /// <summary>An offline site flips back to online as soon as a newer report arrives.</summary>
+    [Fact]
+    public void OnlineRecovery_SiteComesBackOnline_WhenReportReceived()
+    {
+        const string siteId = "site-1";
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+
+        // Let the site time out.
+        _timeProvider.Advance(TimeSpan.FromSeconds(61));
+        _aggregator.CheckForOfflineSites();
+        var whileSilent = _aggregator.GetSiteState(siteId)!;
+        Assert.False(whileSilent.IsOnline);
+
+        // A fresh report (higher sequence) brings it back.
+        _aggregator.ProcessReport(MakeReport(siteId, 2));
+        var afterReport = _aggregator.GetSiteState(siteId)!;
+        Assert.True(afterReport.IsOnline);
+    }
+
+    /// <summary>A sweep run only 30 s after the last report leaves the site online.</summary>
+    [Fact]
+    public void OfflineDetection_SiteRemainsOnline_WhenReportWithinTimeout()
+    {
+        var halfTheTimeout = TimeSpan.FromSeconds(30);
+        _aggregator.ProcessReport(MakeReport("site-1", 1));
+
+        _timeProvider.Advance(halfTheTimeout);
+        _aggregator.CheckForOfflineSites();
+
+        var snapshot = _aggregator.GetSiteState("site-1")!;
+        Assert.True(snapshot.IsOnline);
+    }
+
+    /// <summary>Every site that has reported appears in the full state map.</summary>
+    [Fact]
+    public void GetAllSiteStates_ReturnsAllKnownSites()
+    {
+        foreach (var siteId in new[] { "site-1", "site-2" })
+        {
+            _aggregator.ProcessReport(MakeReport(siteId, 1));
+        }
+
+        var allStates = _aggregator.GetAllSiteStates();
+
+        Assert.Equal(2, allStates.Count);
+        Assert.Contains("site-1", allStates.Keys);
+        Assert.Contains("site-2", allStates.Keys);
+    }
+
+    /// <summary>Looking up a site that never reported yields <c>null</c>.</summary>
+    [Fact]
+    public void GetSiteState_ReturnsNull_ForUnknownSite()
+    {
+        // Nothing has been recorded for this id in this test.
+        Assert.Null(_aggregator.GetSiteState("nonexistent"));
+    }
+
+    /// <summary>The accepted report itself is retained on the site state.</summary>
+    [Fact]
+    public void ProcessReport_StoresLatestReport()
+    {
+        var baseline = MakeReport("site-1", 1);
+        var withErrors = baseline with { ScriptErrorCount = 42 };
+
+        _aggregator.ProcessReport(withErrors);
+
+        var stored = _aggregator.GetSiteState("site-1")!.LatestReport!;
+        Assert.Equal(42, stored.ScriptErrorCount);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-002 regression: SiteHealthState is mutated from multiple
+    /// threads (ProcessReport, MarkHeartbeat, CheckForOfflineSites). With a mutable
+    /// class and unsynchronized field writes, a snapshot read could observe a torn
+    /// or half-applied state. The state must be immutable and every transition an
+    /// atomic reference swap, so a snapshot is always internally consistent and the
+    /// monotonic sequence-number guard is never subverted by a lost update.
+    /// <para>
+    /// A writer (sequence 1..N), a heartbeater and a reader run concurrently.
+    /// The reader checks on every snapshot that (a) the stored report matches
+    /// the state's sequence number and (b) the sequence number never moves
+    /// backwards between two successive reads.
+    /// </para>
+    /// </summary>
+    [Fact]
+    public async Task ProcessReport_ConcurrentUpdates_NeverLoseSequenceOrTearState()
+    {
+        const int iterations = 5_000;
+        // SiteHealthState must be a record so handing the reference to UI callers
+        // (and reading it concurrently) is safe. "<Clone>$" is the method the C#
+        // compiler synthesizes for record classes, so its presence identifies one.
+        Assert.True(typeof(SiteHealthState).GetMethod("<Clone>$") is not null,
+            "SiteHealthState must be an immutable record for safe concurrent reads.");
+
+        _aggregator.ProcessReport(MakeReport("site-1", 0));
+
+        var writer = Task.Run(() =>
+        {
+            for (long seq = 1; seq <= iterations; seq++)
+            {
+                _aggregator.ProcessReport(MakeReport("site-1", seq));
+            }
+        });
+
+        var heartbeater = Task.Run(() =>
+        {
+            for (int i = 0; i < iterations; i++)
+            {
+                _aggregator.MarkHeartbeat("site-1", _timeProvider.GetUtcNow());
+            }
+        });
+
+        var reader = Task.Run(() =>
+        {
+            // Highest sequence this (single) reader has seen so far.
+            long highestSeen = 0;
+            for (int i = 0; i < iterations; i++)
+            {
+                var state = _aggregator.GetSiteState("site-1");
+                if (state is null)
+                {
+                    continue;
+                }
+
+                // A consistent snapshot: the stored report's sequence number must
+                // always match the state's LastSequenceNumber (no half-applied update).
+                Assert.Equal(state.LastSequenceNumber, state.LatestReport!.SequenceNumber);
+
+                // Monotonic guard: a later read must never show an older sequence.
+                // A regression here means a heartbeat (or stale report) overwrote a
+                // newer state — i.e. a lost update.
+                Assert.True(
+                    state.LastSequenceNumber >= highestSeen,
+                    $"Sequence regressed from {highestSeen} to {state.LastSequenceNumber}");
+                highestSeen = state.LastSequenceNumber;
+            }
+        });
+
+        await Task.WhenAll(writer, heartbeater, reader);
+
+        // The final state must reflect the highest sequence — no lost update.
+        var final = _aggregator.GetSiteState("site-1");
+        Assert.Equal(iterations, final!.LastSequenceNumber);
+        Assert.Equal(iterations, final.LatestReport!.SequenceNumber);
+        Assert.True(final.IsOnline);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-007 regression: when a heartbeat arrives for a site the
+    /// aggregator has no state for (e.g. right after a central restart/failover
+    /// emptied its in-memory state), the site must be registered as online
+    /// instead of the heartbeat being dropped. Otherwise reachable sites would
+    /// show as "unknown" for up to a full report interval during failover.
+    /// </summary>
+    [Fact]
+    public void MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport()
+    {
+        const string siteId = "site-new";
+        var heartbeatAt = _timeProvider.GetUtcNow();
+
+        _aggregator.MarkHeartbeat(siteId, heartbeatAt);
+
+        var snapshot = _aggregator.GetSiteState(siteId);
+        Assert.NotNull(snapshot);
+        Assert.True(snapshot.IsOnline);
+        Assert.Null(snapshot.LatestReport);
+        Assert.Equal(heartbeatAt, snapshot.LastHeartbeatAt);
+    }
+
+    /// <summary>
+    /// Regression test for HealthMonitoring-015. A site registered only via a
+    /// heartbeat has never had a full report processed, so
+    /// <see cref="SiteHealthState.LastReportReceivedAt"/> must be <c>null</c>
+    /// rather than the <c>DateTimeOffset.MinValue</c> (year-0001) sentinel, which
+    /// the UI would render as a ~2000-year-stale timestamp. "No report yet" is
+    /// an explicit nullable state, consistent with
+    /// <see cref="SiteHealthState.LatestReport"/>.
+    /// </summary>
+    [Fact]
+    public void MarkHeartbeat_RegistersUnknownSite_WithNullLastReportReceivedAt()
+    {
+        const string siteId = "site-new";
+
+        _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
+
+        var snapshot = _aggregator.GetSiteState(siteId);
+        Assert.NotNull(snapshot);
+        Assert.Null(snapshot.LastReportReceivedAt);
+    }
+
+    /// <summary>
+    /// Regression test for HealthMonitoring-015. After a full report is
+    /// processed for a site that was first registered by heartbeat,
+    /// <see cref="SiteHealthState.LastReportReceivedAt"/> holds a real
+    /// (non-null) instant — the fake clock's time when the report was processed.
+    /// </summary>
+    [Fact]
+    public void ProcessReport_SetsLastReportReceivedAt_ForHeartbeatRegisteredSite()
+    {
+        const string siteId = "site-new";
+        _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
+
+        // Five seconds later the first full report arrives.
+        _timeProvider.Advance(TimeSpan.FromSeconds(5));
+        var expectedReceivedAt = _timeProvider.GetUtcNow();
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+
+        var snapshot = _aggregator.GetSiteState(siteId);
+        Assert.NotNull(snapshot);
+        Assert.Equal(expectedReceivedAt, snapshot.LastReportReceivedAt);
+    }
+
+    /// <summary>
+    /// Heartbeats alone keep a site online: 90 s pass without a report (past
+    /// the 60 s timeout), but a heartbeat lands every 45 s.
+    /// </summary>
+    [Fact]
+    public void MarkHeartbeat_KeepsSiteOnline_BetweenReports()
+    {
+        var heartbeatGap = TimeSpan.FromSeconds(45);
+        _aggregator.ProcessReport(MakeReport("site-1", 1));
+
+        // Two gaps of 45 s each, every one closed by a heartbeat.
+        for (var beat = 0; beat < 2; beat++)
+        {
+            _timeProvider.Advance(heartbeatGap);
+            _aggregator.MarkHeartbeat("site-1", _timeProvider.GetUtcNow());
+        }
+
+        _aggregator.CheckForOfflineSites();
+
+        var snapshot = _aggregator.GetSiteState("site-1")!;
+        Assert.True(snapshot.IsOnline);
+    }
+
+    /// <summary>A heartbeat for a site already marked offline flips it back online.</summary>
+    [Fact]
+    public void MarkHeartbeat_BringsOfflineSiteBackOnline()
+    {
+        const string siteId = "site-1";
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+
+        // Time the site out first.
+        _timeProvider.Advance(TimeSpan.FromSeconds(61));
+        _aggregator.CheckForOfflineSites();
+        var whileSilent = _aggregator.GetSiteState(siteId)!;
+        Assert.False(whileSilent.IsOnline);
+
+        _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
+
+        var afterHeartbeat = _aggregator.GetSiteState(siteId)!;
+        Assert.True(afterHeartbeat.IsOnline);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-020 regression: going from offline to online must be
+    /// backed by a fresh LastHeartbeatAt. MarkHeartbeat used to record
+    /// <c>max(receivedAt, existing.LastHeartbeatAt)</c>, so an out-of-order
+    /// heartbeat with an older timestamp brought the site online on a stale
+    /// heartbeat, and the next CheckForOfflineSites tick flapped it straight
+    /// back to offline.
+    /// </summary>
+    [Fact]
+    public void MarkHeartbeat_OfflineToOnline_StampsFreshLastHeartbeatAt()
+    {
+        const string siteId = "site-1";
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+
+        _timeProvider.Advance(TimeSpan.FromSeconds(61));
+        _aggregator.CheckForOfflineSites();
+        Assert.False(_aggregator.GetSiteState(siteId)!.IsOnline);
+
+        // Deliver a heartbeat stamped 120 s in the past — older than the
+        // existing LastHeartbeatAt (e.g. clock skew on the originating node).
+        var nowAfter = _timeProvider.GetUtcNow();
+        var staleStamp = nowAfter - TimeSpan.FromSeconds(120);
+        _aggregator.MarkHeartbeat(siteId, staleStamp);
+
+        var revived = _aggregator.GetSiteState(siteId)!;
+        Assert.True(revived.IsOnline);
+        // What got recorded must be ~"now", not the stale receivedAt.
+        var heartbeatAge = nowAfter - revived.LastHeartbeatAt;
+        Assert.InRange(heartbeatAge.TotalSeconds, 0, 5);
+
+        // No flap: the very next sweep must leave the site online.
+        _aggregator.CheckForOfflineSites();
+        Assert.True(_aggregator.GetSiteState(siteId)!.IsOnline);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-005 regression: the synthetic "central" site has no
+    /// heartbeat source; only the 30s CentralHealthReportLoop self-report bumps
+    /// its LastHeartbeatAt. One skipped or late self-report (leader GC pause,
+    /// brief stall, mid-failover) would leave it silent for &gt;60s and flap it
+    /// offline while the central cluster is healthy. The "central" entry must
+    /// therefore get a longer offline grace than real sites.
+    /// </summary>
+    [Fact]
+    public void OfflineDetection_CentralSite_HasLongerGraceThanRealSites()
+    {
+        const string realSite = "site-1";
+        _aggregator.ProcessReport(MakeReport(CentralHealthReportLoop.CentralSiteId, 1));
+        _aggregator.ProcessReport(MakeReport(realSite, 1));
+
+        // 75 s of silence: past the 60 s site timeout, so the real site is
+        // offline, but central — which only self-reports every 30 s — must
+        // tolerate one missed self-report.
+        _timeProvider.Advance(TimeSpan.FromSeconds(75));
+        _aggregator.CheckForOfflineSites();
+
+        var realSiteState = _aggregator.GetSiteState(realSite)!;
+        Assert.False(realSiteState.IsOnline);
+
+        var centralState = _aggregator.GetSiteState(CentralHealthReportLoop.CentralSiteId)!;
+        Assert.True(
+            centralState.IsOnline,
+            "central must survive a single missed self-report");
+    }
+
+    /// <summary>The extra grace for "central" is finite: ten silent minutes mark it offline.</summary>
+    [Fact]
+    public void OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss()
+    {
+        var centralId = CentralHealthReportLoop.CentralSiteId;
+        _aggregator.ProcessReport(MakeReport(centralId, 1));
+
+        // Far longer than the central grace window — a genuine total loss.
+        _timeProvider.Advance(TimeSpan.FromMinutes(10));
+        _aggregator.CheckForOfflineSites();
+
+        var centralState = _aggregator.GetSiteState(centralId)!;
+        Assert.False(centralState.IsOnline);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-013 regression: the offline-check cadence is derived
+    /// from the *shorter* of <see cref="HealthMonitoringOptions.OfflineTimeout"/>
+    /// and <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/>. If an
+    /// operator sets <c>CentralOfflineTimeout</c> below <c>OfflineTimeout</c>,
+    /// central offline detection must stay timely rather than lag by up to a
+    /// full <c>OfflineTimeout / 2</c>.
+    /// </summary>
+    [Fact]
+    public void CheckInterval_IsHalfTheShorterTimeout()
+    {
+        // OfflineTimeout (60s) is the shorter of the two → 30s cadence.
+        var siteTimeoutShorter = new HealthMonitoringOptions
+        {
+            OfflineTimeout = TimeSpan.FromSeconds(60),
+            CentralOfflineTimeout = TimeSpan.FromMinutes(3)
+        };
+        Assert.Equal(
+            TimeSpan.FromSeconds(30),
+            CentralHealthAggregator.ComputeCheckInterval(siteTimeoutShorter));
+
+        // CentralOfflineTimeout (20s) is the shorter one → cadence adapts to 10s.
+        var centralTimeoutShorter = new HealthMonitoringOptions
+        {
+            OfflineTimeout = TimeSpan.FromSeconds(60),
+            CentralOfflineTimeout = TimeSpan.FromSeconds(20)
+        };
+        Assert.Equal(
+            TimeSpan.FromSeconds(10),
+            CentralHealthAggregator.ComputeCheckInterval(centralTimeoutShorter));
+    }
+
+    /// <summary>
+    /// After a site restart resets its sequence to a low value, reports are
+    /// rejected until the sequence number passes the previously stored maximum.
+    /// </summary>
+    [Fact]
+    public void SequenceNumberReset_RejectedUntilExceedsPrevMax()
+    {
+        const string siteId = "site-1";
+
+        // The site reports seq 10, restarts, and starts again at seq 1.
+        // By design the sequence resets on singleton restart, so the
+        // aggregator rejecting 1 < 10 is the expected behavior.
+        _aggregator.ProcessReport(MakeReport(siteId, 10));
+        _aggregator.ProcessReport(MakeReport(siteId, 1));
+
+        var afterReset = _aggregator.GetSiteState(siteId)!;
+        Assert.Equal(10, afterReset.LastSequenceNumber);
+
+        // Reports are accepted again once the sequence passes the old max.
+        // SiteHealthState is an immutable snapshot, so fetch it again to see
+        // the new state.
+        _aggregator.ProcessReport(MakeReport(siteId, 11));
+        var afterCatchUp = _aggregator.GetSiteState(siteId)!;
+        Assert.Equal(11, afterCatchUp.LastSequenceNumber);
+    }
+}
@@ -0,0 +1,282 @@
+using Microsoft.Extensions.Logging.Abstractions;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// HealthMonitoring-009 regression: the central self-report loop had no test
+/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
+/// self-report generation for siteId="central", and monotonic sequence
+/// assignment.
+/// </summary>
+public class CentralHealthReportLoopTests
+{
+    /// <summary>
+    /// Settable stand-in for <see cref="IClusterNodeProvider"/>: tests choose
+    /// the primary flag and the node list directly.
+    /// </summary>
+    private sealed class FakeClusterNodeProvider : IClusterNodeProvider
+    {
+        public bool SelfIsPrimary { get; set; }
+
+        public IReadOnlyList<NodeStatus> Nodes { get; set; } = [];
+
+        /// <summary>Returns whatever <see cref="Nodes"/> currently holds.</summary>
+        public IReadOnlyList<NodeStatus> GetClusterNodes()
+        {
+            return Nodes;
+        }
+    }
+
+    /// <summary>
+    /// <see cref="ICentralHealthAggregator"/> double that appends every
+    /// processed report to <see cref="Processed"/>; heartbeats are ignored and
+    /// the state queries return empty / <c>null</c>.
+    /// </summary>
+    private sealed class RecordingAggregator : ICentralHealthAggregator
+    {
+        // NOTE(review): a plain List with no synchronization. Tests poll Count
+        // while the loop may be appending — presumably from another thread;
+        // confirm this is acceptable for the polling helpers.
+        public List<SiteHealthReport> Processed { get; } = [];
+
+        public void ProcessReport(SiteHealthReport report)
+        {
+            Processed.Add(report);
+        }
+
+        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
+        {
+        }
+
+        public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
+        {
+            return new Dictionary<string, SiteHealthState>();
+        }
+
+        public SiteHealthState? GetSiteState(string siteId)
+        {
+            return null;
+        }
+    }
+
+    /// <summary>
+    /// HealthMonitoring-022 de-flake helper. Starts <paramref name="loop"/>,
+    /// polls <paramref name="condition"/> every 25 ms until it returns
+    /// <c>true</c> or <paramref name="maxWait"/> (5 s when omitted) has
+    /// elapsed, then stops the loop.
+    /// <para>
+    /// <see cref="CentralHealthReportLoop"/>'s cadence is a real
+    /// <see cref="PeriodicTimer"/>, so the loop is timing-sensitive, and a
+    /// virtual clock cannot drive it (PeriodicTimer doesn't consume
+    /// <see cref="TimeProvider"/>) without refactoring the production loop.
+    /// The tests therefore keep wall-clock waits but with a generous budget
+    /// and a poll-until-condition wait instead of a fixed <c>Task.Delay</c>
+    /// that fails fast on a slow CI runner. Each test sets
+    /// <c>ReportInterval</c> to 50 ms, so the condition is normally met almost
+    /// immediately; under heavy CI load the poll tolerates the slow tick
+    /// instead of asserting on a timed-out empty list.
+    /// </para>
+    /// </summary>
+    /// <param name="loop">The loop to start and stop.</param>
+    /// <param name="condition">Polled predicate; polling ends once it is true.</param>
+    /// <param name="maxWait">Upper bound on the polling time; defaults to 5 s.</param>
+    private static async Task RunLoopUntil(
+        CentralHealthReportLoop loop,
+        Func<bool> condition,
+        TimeSpan? maxWait = null)
+    {
+        var budget = maxWait ?? TimeSpan.FromSeconds(5);
+        // The start token outlives the polling budget by one second.
+        using var cts = new CancellationTokenSource(budget + TimeSpan.FromSeconds(1));
+        try
+        {
+            await loop.StartAsync(cts.Token);
+
+            var elapsed = System.Diagnostics.Stopwatch.StartNew();
+            while (true)
+            {
+                // Budget is checked first, so an exhausted budget ends the
+                // wait without evaluating the condition again.
+                if (elapsed.Elapsed >= budget || condition())
+                {
+                    break;
+                }
+
+                await Task.Delay(25, CancellationToken.None);
+            }
+
+            await loop.StopAsync(CancellationToken.None);
+        }
+        catch (OperationCanceledException)
+        {
+            // Cancellation during start/stop is not a test failure.
+        }
+    }
+
+    /// <summary>
+    /// For tests that need the loop to run for a bounded period without
+    /// waiting on a condition (e.g. asserting <i>no</i> reports were
+    /// produced). The run time is <paramref name="runForMs"/> with a floor of
+    /// 1 s, so smaller requested values still run for a full second; see
+    /// <see cref="RunLoopUntil"/> for why the wait is generous.
+    /// </summary>
+    /// <param name="loop">The loop to start and stop.</param>
+    /// <param name="runForMs">Requested run time in milliseconds (minimum 1000 is applied).</param>
+    private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
+    {
+        var effectiveMs = Math.Max(runForMs, 1000);
+        // The start token outlives the run window by one second.
+        var startTokenLifetime = TimeSpan.FromMilliseconds(effectiveMs + 1000);
+        using var cts = new CancellationTokenSource(startTokenLifetime);
+        try
+        {
+            await loop.StartAsync(cts.Token);
+            await Task.Delay(effectiveMs, CancellationToken.None);
+            await loop.StopAsync(CancellationToken.None);
+        }
+        catch (OperationCanceledException)
+        {
+            // Cancellation during start/stop is not a test failure.
+        }
+    }
+
+    /// <summary>On the primary node the loop emits reports, all tagged with the central site id.</summary>
+    [Fact]
+    public async Task GeneratesCentralReports_WhenSelfIsPrimary()
+    {
+        var recorder = new RecordingAggregator();
+        var fastOptions = Options.Create(new HealthMonitoringOptions
+        {
+            ReportInterval = TimeSpan.FromMilliseconds(50)
+        });
+        var loop = new CentralHealthReportLoop(
+            new SiteHealthCollector(),
+            recorder,
+            new FakeClusterNodeProvider { SelfIsPrimary = true },
+            fastOptions,
+            NullLogger<CentralHealthReportLoop>.Instance);
+
+        // HealthMonitoring-022: poll (up to 5 s) for the first report instead
+        // of a fixed-budget Task.Delay, so slow CI runners are tolerated.
+        await RunLoopUntil(loop, () => recorder.Processed.Count >= 1);
+
+        Assert.NotEmpty(recorder.Processed);
+        Assert.All(
+            recorder.Processed,
+            report => Assert.Equal(CentralHealthReportLoop.CentralSiteId, report.SiteId));
+    }
+
+    /// <summary>On a non-primary node the loop runs but produces no reports.</summary>
+    [Fact]
+    public async Task GeneratesNoReports_WhenNotPrimary()
+    {
+        var recorder = new RecordingAggregator();
+        var fastOptions = Options.Create(new HealthMonitoringOptions
+        {
+            ReportInterval = TimeSpan.FromMilliseconds(50)
+        });
+        var loop = new CentralHealthReportLoop(
+            new SiteHealthCollector(),
+            recorder,
+            new FakeClusterNodeProvider { SelfIsPrimary = false },
+            fastOptions,
+            NullLogger<CentralHealthReportLoop>.Instance);
+
+        await RunLoopBriefly(loop, 250);
+
+        Assert.Empty(recorder.Processed);
+    }
+
+    /// <summary>Successive central reports carry strictly increasing sequence numbers.</summary>
+    [Fact]
+    public async Task AssignsMonotonicSequenceNumbers()
+    {
+        var recorder = new RecordingAggregator();
+        var fastOptions = Options.Create(new HealthMonitoringOptions
+        {
+            ReportInterval = TimeSpan.FromMilliseconds(50)
+        });
+        var loop = new CentralHealthReportLoop(
+            new SiteHealthCollector(),
+            recorder,
+            new FakeClusterNodeProvider { SelfIsPrimary = true },
+            fastOptions,
+            NullLogger<CentralHealthReportLoop>.Instance);
+
+        // HealthMonitoring-022: poll (up to 5 s) until two reports exist; a
+        // fixed 300 ms window could miss the second tick on a slow CI runner.
+        await RunLoopUntil(loop, () => recorder.Processed.Count >= 2);
+
+        var reports = recorder.Processed;
+        Assert.True(reports.Count >= 2,
+            $"Expected at least 2 reports, got {reports.Count}");
+
+        // Compare each report with its predecessor.
+        for (int i = 1; i < reports.Count; i++)
+        {
+            var previous = reports[i - 1].SequenceNumber;
+            var current = reports[i].SequenceNumber;
+            Assert.True(
+                current > previous,
+                $"Sequence numbers not strictly increasing at index {i}");
+        }
+    }
+
+    /// <summary>
+    /// HealthMonitoring-006 regression: the loop seeds its sequence number
+    /// from the injected <see cref="TimeProvider"/> (as Unix milliseconds),
+    /// not from <c>DateTimeOffset.UtcNow</c> read at field initialization, so
+    /// the seeding strategy can be tested deterministically.
+    /// </summary>
+    [Fact]
+    public void SequenceNumberSeed_UsesInjectedTimeProvider()
+    {
+        var fixedInstant = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
+        var expectedSeed = fixedInstant.ToUnixTimeMilliseconds();
+
+        var loop = new CentralHealthReportLoop(
+            new SiteHealthCollector(),
+            new RecordingAggregator(),
+            new FakeClusterNodeProvider { SelfIsPrimary = true },
+            Options.Create(new HealthMonitoringOptions()),
+            NullLogger<CentralHealthReportLoop>.Instance,
+            new TestTimeProvider(fixedInstant));
+
+        Assert.Equal(expectedSeed, loop.CurrentSequenceNumber);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-018 regression: if
+    /// <see cref="ICentralHealthAggregator.ProcessReport"/> throws, the
+    /// per-interval counters just drained by
+    /// <see cref="SiteHealthCollector.CollectReport"/> must be restored into
+    /// the shared collector so they roll forward into the next interval
+    /// instead of being silently lost. Same shape as the HealthMonitoring-017
+    /// fix in <see cref="HealthReportSender"/>.
+    /// </summary>
+    [Fact]
+    public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
+    {
+        static void Repeat(int times, Action bump)
+        {
+            for (var i = 0; i < times; i++)
+            {
+                bump();
+            }
+        }
+
+        var collector = new SiteHealthCollector();
+        // Seed every per-interval counter so the restore path of each field
+        // is exercised: the first iteration drains them via CollectReport,
+        // ProcessReport throws, and the restore has to put them back.
+        Repeat(2, () => collector.IncrementScriptError());
+        Repeat(1, () => collector.IncrementAlarmError());
+        Repeat(3, () => collector.IncrementDeadLetter());
+        Repeat(1, () => collector.IncrementSiteAuditWriteFailures());
+        Repeat(2, () => collector.IncrementAuditRedactionFailure());
+
+        var flakyAggregator = new FailingThenSucceedingAggregator();
+        var fastOptions = Options.Create(new HealthMonitoringOptions
+        {
+            ReportInterval = TimeSpan.FromMilliseconds(50)
+        });
+        var loop = new CentralHealthReportLoop(
+            collector,
+            flakyAggregator,
+            new FakeClusterNodeProvider { SelfIsPrimary = true },
+            fastOptions,
+            NullLogger<CentralHealthReportLoop>.Instance);
+
+        // HealthMonitoring-022: call #1 throws (counters are restored), call
+        // #2 succeeds. Poll up to 5 s for that successful call rather than
+        // relying on a fixed 450 ms budget.
+        await RunLoopUntil(loop, () => flakyAggregator.Processed.Count >= 1);
+
+        // The first report that got through must carry the counts accumulated
+        // during the interval whose delivery failed.
+        Assert.NotEmpty(flakyAggregator.Processed);
+        var delivered = flakyAggregator.Processed[0];
+        Assert.Equal(2, delivered.ScriptErrorCount);
+        Assert.Equal(1, delivered.AlarmEvaluationErrorCount);
+        Assert.Equal(3, delivered.DeadLetterCount);
+        Assert.Equal(1, delivered.SiteAuditWriteFailures);
+        Assert.Equal(2, delivered.AuditRedactionFailure);
+    }
+
+    /// <summary>
+    /// <see cref="ICentralHealthAggregator"/> double whose very first
+    /// <c>ProcessReport</c> call throws; every later call records the report.
+    /// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>
+    /// uses it to exercise the HealthMonitoring-018 restore-on-failure path.
+    /// </summary>
+    private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
+    {
+        // Number of ProcessReport calls so far, including the failing one.
+        private int _callCount;
+
+        public List<SiteHealthReport> Processed { get; } = [];
+
+        public void ProcessReport(SiteHealthReport report)
+        {
+            var isFirstCall = Interlocked.Increment(ref _callCount) == 1;
+            if (isFirstCall)
+            {
+                throw new InvalidOperationException("aggregator temporarily unavailable");
+            }
+
+            Processed.Add(report);
+        }
+
+        public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
+        {
+        }
+
+        public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
+        {
+            return new Dictionary<string, SiteHealthState>();
+        }
+
+        public SiteHealthState? GetSiteState(string siteId)
+        {
+            return null;
+        }
+    }
+
+    /// <summary>
+    /// A standby node must still push its role into the collector, so the
+    /// standby's own node card shows the correct role.
+    /// </summary>
+    [Fact]
+    public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
+    {
+        var collector = new SiteHealthCollector();
+        var fastOptions = Options.Create(new HealthMonitoringOptions
+        {
+            ReportInterval = TimeSpan.FromMilliseconds(50)
+        });
+        var loop = new CentralHealthReportLoop(
+            collector,
+            new RecordingAggregator(),
+            new FakeClusterNodeProvider { SelfIsPrimary = false },
+            fastOptions,
+            NullLogger<CentralHealthReportLoop>.Instance);
+
+        await RunLoopBriefly(loop, 150);
+
+        // NOTE(review): if a fresh collector's IsActiveNode already starts out
+        // false, this cannot tell "loop wrote false" apart from "loop never
+        // wrote anything" — confirm the collector's initial value.
+        Assert.False(collector.IsActiveNode);
+    }
+}
@@ -0,0 +1,73 @@
+using Microsoft.Extensions.Options;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// HealthMonitoring-014 regression: <see cref="HealthMonitoringOptions"/> intervals
+/// are fed straight into <c>new PeriodicTimer(...)</c>, which throws
+/// <see cref="ArgumentOutOfRangeException"/> for a zero/negative period. A
+/// misconfigured <c>appsettings.json</c> must be rejected by an
+/// <see cref="IValidateOptions{TOptions}"/> with a clear, key-naming message
+/// rather than crashing the hosted service with an opaque exception.
+/// </summary>
+public class HealthMonitoringOptionsValidatorTests
+{
+    /// <summary>
+    /// Runs a fresh <see cref="HealthMonitoringOptionsValidator"/> over
+    /// <paramref name="options"/> under the default options name.
+    /// </summary>
+    private static ValidateOptionsResult Validate(HealthMonitoringOptions options)
+    {
+        var validator = new HealthMonitoringOptionsValidator();
+        return validator.Validate(Options.DefaultName, options);
+    }
+
+    /// <summary>
+    /// Asserts that validation of <paramref name="options"/> fails and that the
+    /// failure message mentions <paramref name="optionName"/>.
+    /// </summary>
+    private static void AssertRejected(HealthMonitoringOptions options, string optionName)
+    {
+        var outcome = Validate(options);
+
+        Assert.True(outcome.Failed);
+        Assert.Contains(optionName, outcome.FailureMessage);
+    }
+
+    /// <summary>An options instance with no overrides must validate successfully.</summary>
+    [Fact]
+    public void DefaultOptions_AreValid()
+    {
+        var outcome = Validate(new HealthMonitoringOptions());
+
+        Assert.True(outcome.Succeeded, outcome.FailureMessage);
+    }
+
+    [Fact]
+    public void ZeroReportInterval_IsRejected()
+    {
+        var misconfigured = new HealthMonitoringOptions { ReportInterval = TimeSpan.Zero };
+
+        AssertRejected(misconfigured, "ReportInterval");
+    }
+
+    [Fact]
+    public void NegativeReportInterval_IsRejected()
+    {
+        var misconfigured = new HealthMonitoringOptions { ReportInterval = TimeSpan.FromSeconds(-1) };
+
+        AssertRejected(misconfigured, "ReportInterval");
+    }
+
+    [Fact]
+    public void ZeroOfflineTimeout_IsRejected()
+    {
+        var misconfigured = new HealthMonitoringOptions { OfflineTimeout = TimeSpan.Zero };
+
+        AssertRejected(misconfigured, "OfflineTimeout");
+    }
+
+    [Fact]
+    public void ZeroCentralOfflineTimeout_IsRejected()
+    {
+        var misconfigured = new HealthMonitoringOptions { CentralOfflineTimeout = TimeSpan.Zero };
+
+        AssertRejected(misconfigured, "CentralOfflineTimeout");
+    }
+
+    /// <summary>
+    /// A central offline timeout (30 s) below the site offline timeout (60 s)
+    /// must be rejected with a message naming <c>CentralOfflineTimeout</c>.
+    /// </summary>
+    [Fact]
+    public void CentralOfflineTimeout_ShorterThanOfflineTimeout_IsRejected()
+    {
+        var misconfigured = new HealthMonitoringOptions
+        {
+            OfflineTimeout = TimeSpan.FromSeconds(60),
+            CentralOfflineTimeout = TimeSpan.FromSeconds(30)
+        };
+
+        AssertRejected(misconfigured, "CentralOfflineTimeout");
+    }
+}
@@ -0,0 +1,423 @@
+using Microsoft.Data.Sqlite;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+using ZB.MOM.WW.ScadaBridge.StoreAndForward;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+public class HealthReportSenderTests
+{
+    /// <summary>Transport double that records every report passed to <c>Send</c>.</summary>
+    private class FakeTransport : IHealthReportTransport
+    {
+        public List<SiteHealthReport> SentReports { get; } = [];
+
+        public void Send(SiteHealthReport report)
+        {
+            SentReports.Add(report);
+        }
+    }
+
+    /// <summary>
+    /// Site identity double whose <c>SiteId</c> is <c>test-site</c> unless a
+    /// test overrides it.
+    /// </summary>
+    private class FakeSiteIdentityProvider : ISiteIdentityProvider
+    {
+        public string SiteId { get; set; } = "test-site";
+    }
+
+    /// <summary>
+    /// Logger double that keeps every emitted entry in memory, letting tests
+    /// assert that non-fatal failures are surfaced (HealthMonitoring-010)
+    /// instead of being silently swallowed.
+    /// </summary>
+    private sealed class CapturingLogger<T> : ILogger<T>
+    {
+        public sealed record Entry(LogLevel Level, string Message, Exception? Exception);
+
+        public List<Entry> Entries { get; } = [];
+
+        public IDisposable BeginScope<TState>(TState state) where TState : notnull
+        {
+            return NullScope.Instance;
+        }
+
+        public bool IsEnabled(LogLevel logLevel)
+        {
+            return true;
+        }
+
+        public void Log<TState>(
+            LogLevel logLevel, EventId eventId, TState state, Exception? exception,
+            Func<TState, Exception?, string> formatter)
+        {
+            // Entries is also locked by the reading test, so appends are guarded
+            // by the same monitor.
+            lock (Entries)
+            {
+                var entry = new Entry(logLevel, formatter(state, exception), exception);
+                Entries.Add(entry);
+            }
+        }
+
+        // Scope object handed out by BeginScope; disposing it does nothing.
+        private sealed class NullScope : IDisposable
+        {
+            public static readonly NullScope Instance = new();
+
+            public void Dispose()
+            {
+            }
+        }
+    }
+
+    /// <summary>An <see cref="IClusterNodeProvider"/> whose query always throws.</summary>
+    private sealed class ThrowingClusterNodeProvider : IClusterNodeProvider
+    {
+        public bool SelfIsPrimary => true;
+
+        public IReadOnlyList<NodeStatus> GetClusterNodes()
+        {
+            throw new InvalidOperationException("cluster query failed");
+        }
+    }
+
+    /// <summary>Builds options whose only override is a 50 ms report interval.</summary>
+    private static IOptions<HealthMonitoringOptions> FastReportOptions()
+    {
+        return Options.Create(new HealthMonitoringOptions
+        {
+            ReportInterval = TimeSpan.FromMilliseconds(50)
+        });
+    }
+
+    /// <summary>
+    /// Starts <paramref name="sender"/> with a token that cancels after
+    /// <paramref name="cancelAfterMs"/>, waits <paramref name="runForMs"/>, then
+    /// stops it. An <see cref="OperationCanceledException"/> raised anywhere in
+    /// that sequence is ignored.
+    /// </summary>
+    private static async Task RunSenderAsync(HealthReportSender sender, int cancelAfterMs, int runForMs)
+    {
+        using var cancellation = new CancellationTokenSource(TimeSpan.FromMilliseconds(cancelAfterMs));
+        try
+        {
+            await sender.StartAsync(cancellation.Token);
+            await Task.Delay(runForMs, CancellationToken.None);
+            await sender.StopAsync(CancellationToken.None);
+        }
+        catch (OperationCanceledException)
+        {
+            // Deliberately ignored: cancellation is an expected way for the run to end.
+        }
+    }
+
+    /// <summary>
+    /// Every emitted report carries the configured site id and a sequence
+    /// number strictly greater than its predecessor's.
+    /// </summary>
+    [Fact]
+    public async Task SendsReportsWithMonotonicSequenceNumbers()
+    {
+        var transport = new FakeTransport();
+        var collector = new SiteHealthCollector();
+        collector.SetActiveNode(true);
+
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            FastReportOptions(),
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider { SiteId = "site-A" });
+
+        await RunSenderAsync(sender, cancelAfterMs: 300, runForMs: 280);
+
+        // Should have sent several reports
+        var reports = transport.SentReports;
+        Assert.True(reports.Count >= 2,
+            $"Expected at least 2 reports, got {reports.Count}");
+
+        // Verify strictly-monotonic sequence numbers and matching site id
+        for (var index = 0; index < reports.Count; index++)
+        {
+            var current = reports[index];
+            if (index > 0)
+            {
+                var previous = reports[index - 1];
+                Assert.True(
+                    current.SequenceNumber > previous.SequenceNumber,
+                    $"Sequence numbers not strictly increasing at index {index}");
+            }
+
+            Assert.Equal("site-A", current.SiteId);
+        }
+    }
+
+    [Fact]
+    public async Task FirstReportSequenceExceedsStartupUnixMs()
+    {
+        // Reports are seeded with Unix-ms at construction so a freshly-active
+        // node always sorts after the prior active. Verify the first emitted
+        // sequence is at least the startup epoch.
+        var transport = new FakeTransport();
+        var collector = new SiteHealthCollector();
+        collector.SetActiveNode(true);
+        var options = FastReportOptions();
+
+        var startupEpochMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            options,
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider());
+
+        await RunSenderAsync(sender, cancelAfterMs: 150, runForMs: 120);
+
+        Assert.True(transport.SentReports.Count >= 1);
+        var firstSequence = transport.SentReports[0].SequenceNumber;
+        Assert.True(
+            firstSequence >= startupEpochMs,
+            $"First sequence {firstSequence} should be >= startup epoch {startupEpochMs}");
+    }
+
+    /// <summary>
+    /// Each emitted report's timestamp falls inside the window of the run and
+    /// has a zero UTC offset.
+    /// </summary>
+    [Fact]
+    public async Task ReportsIncludeUtcTimestamp()
+    {
+        var transport = new FakeTransport();
+        var collector = new SiteHealthCollector();
+        collector.SetActiveNode(true);
+
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            FastReportOptions(),
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider());
+
+        var windowStart = DateTimeOffset.UtcNow;
+        await RunSenderAsync(sender, cancelAfterMs: 150, runForMs: 120);
+        var windowEnd = DateTimeOffset.UtcNow;
+
+        Assert.True(transport.SentReports.Count >= 1);
+        foreach (var sent in transport.SentReports)
+        {
+            Assert.InRange(sent.ReportTimestamp, windowStart, windowEnd);
+            Assert.Equal(TimeSpan.Zero, sent.ReportTimestamp.Offset);
+        }
+    }
+
+    /// <summary>
+    /// HealthMonitoring-001 regression: the documented "store-and-forward buffer
+    /// depth" metric (pending messages by category) must actually be populated in
+    /// the emitted report. Previously SetStoreAndForwardDepths had no callers, so
+    /// StoreAndForwardBufferDepths was always empty. The sender must query the S&amp;F
+    /// engine's per-category depth API and include it alongside the parked count.
+    /// </summary>
+    [Fact]
+    public async Task ReportsIncludeStoreAndForwardBufferDepthsFromStorage()
+    {
+        var dbName = $"HealthSfDepth_{Guid.NewGuid():N}";
+        var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
+        // Keep one connection alive so the in-memory DB persists for the test.
+        using var keepAlive = new SqliteConnection(connStr);
+        keepAlive.Open();
+
+        var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
+        await storage.InitializeAsync();
+
+        // Two pending ExternalSystem messages and one pending Notification message.
+        await storage.EnqueueAsync(MakePendingMessage("m1", StoreAndForwardCategory.ExternalSystem));
+        await storage.EnqueueAsync(MakePendingMessage("m2", StoreAndForwardCategory.ExternalSystem));
+        await storage.EnqueueAsync(MakePendingMessage("m3", StoreAndForwardCategory.Notification));
+
+        var transport = new FakeTransport();
+        var collector = new SiteHealthCollector();
+        collector.SetActiveNode(true);
+
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            FastReportOptions(),
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider(),
+            sfStorage: storage);
+
+        await RunSenderAsync(sender, cancelAfterMs: 300, runForMs: 250);
+
+        Assert.True(transport.SentReports.Count >= 1);
+        // Inspect the most recent report.
+        var latest = transport.SentReports[^1];
+        var depths = latest.StoreAndForwardBufferDepths;
+        Assert.Equal(2, depths[nameof(StoreAndForwardCategory.ExternalSystem)]);
+        Assert.Equal(1, depths[nameof(StoreAndForwardCategory.Notification)]);
+        Assert.False(depths.ContainsKey(nameof(StoreAndForwardCategory.CachedDbWrite)));
+    }
+
+    /// <summary>
+    /// Builds a pending store-and-forward message with the given id and
+    /// category; every other field is a fixed placeholder.
+    /// </summary>
+    private static StoreAndForwardMessage MakePendingMessage(string id, StoreAndForwardCategory category)
+    {
+        return new StoreAndForwardMessage
+        {
+            Id = id,
+            Category = category,
+            Target = "target",
+            PayloadJson = "{}",
+            RetryCount = 0,
+            MaxRetries = 50,
+            RetryIntervalMs = 30_000,
+            CreatedAt = DateTimeOffset.UtcNow,
+            Status = StoreAndForwardMessageStatus.Pending
+        };
+    }
+
+    /// <summary>
+    /// Right after construction, the sender's sequence number lies between the
+    /// Unix-ms readings taken immediately before and after the constructor call.
+    /// </summary>
+    [Fact]
+    public void InitialSequenceNumberSeededWithUnixMs()
+    {
+        var transport = new FakeTransport();
+        var collector = new SiteHealthCollector();
+        var options = Options.Create(new HealthMonitoringOptions());
+
+        var lowerBoundMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            options,
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider());
+        var upperBoundMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
+
+        Assert.InRange(sender.CurrentSequenceNumber, lowerBoundMs, upperBoundMs);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-010 regression: a failure refreshing cluster nodes is
+    /// non-fatal (the report still ships) but must no longer be swallowed by a
+    /// bare <c>catch {}</c> — it must be logged as a warning with the exception so
+    /// persistent degradation is diagnosable.
+    /// </summary>
+    [Fact]
+    public async Task ClusterNodeRefreshFailure_IsLoggedNotSwallowed()
+    {
+        var transport = new FakeTransport();
+        var collector = new SiteHealthCollector();
+        collector.SetActiveNode(true);
+        var logger = new CapturingLogger<HealthReportSender>();
+
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            FastReportOptions(),
+            logger,
+            new FakeSiteIdentityProvider(),
+            clusterNodeProvider: new ThrowingClusterNodeProvider());
+
+        await RunSenderAsync(sender, cancelAfterMs: 300, runForMs: 250);
+
+        // The report loop continues despite the failure...
+        Assert.NotEmpty(transport.SentReports);
+
+        // ...but the failure is surfaced as a warning carrying the exception.
+        CapturingLogger<HealthReportSender>.Entry[] warnings;
+        lock (logger.Entries)
+        {
+            warnings = logger.Entries
+                .Where(entry => entry.Level == LogLevel.Warning)
+                .Where(entry => entry.Exception is InvalidOperationException)
+                .ToArray();
+        }
+
+        Assert.NotEmpty(warnings);
+        Assert.Contains(
+            warnings,
+            warning => warning.Message.Contains("cluster nodes", StringComparison.OrdinalIgnoreCase));
+    }
+
+    /// <summary>
+    /// HealthMonitoring-017 regression: when the transport's <c>Send</c> throws,
+    /// the per-interval counters that <see cref="SiteHealthCollector.CollectReport"/>
+    /// just drained via <c>Interlocked.Exchange</c> must be restored back into the
+    /// collector so they roll forward into the next interval rather than being
+    /// silently lost. Before the fix, a transport failure left the counts in the
+    /// un-sent report only, and the next successful report shipped with the
+    /// counters at zero.
+    /// </summary>
+    [Fact]
+    public async Task SendFailure_PreservesIntervalCountersForNextReport()
+    {
+        // Invokes the given action the requested number of times.
+        static void Repeat(int times, Action action)
+        {
+            for (var round = 0; round < times; round++)
+            {
+                action();
+            }
+        }
+
+        var transport = new FailingThenSucceedingTransport();
+        var collector = new SiteHealthCollector();
+        collector.SetActiveNode(true);
+
+        // Pre-populate every per-interval counter so the restore path on each
+        // field is exercised — script error, alarm error, dead letter, site
+        // audit write failure, audit redaction failure.
+        Repeat(2, () => collector.IncrementScriptError());
+        Repeat(1, () => collector.IncrementAlarmError());
+        Repeat(3, () => collector.IncrementDeadLetter());
+        Repeat(1, () => collector.IncrementSiteAuditWriteFailures());
+        Repeat(2, () => collector.IncrementAuditRedactionFailure());
+
+        var sender = new HealthReportSender(
+            collector,
+            transport,
+            FastReportOptions(),
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider());
+
+        await RunSenderAsync(sender, cancelAfterMs: 500, runForMs: 450);
+
+        // The first interval's Send threw, then later intervals succeeded. The
+        // first successful report must include the previously-failed interval's
+        // accumulated counts.
+        Assert.NotEmpty(transport.SentReports);
+        var recovered = transport.SentReports[0];
+        Assert.Equal(2, recovered.ScriptErrorCount);
+        Assert.Equal(1, recovered.AlarmEvaluationErrorCount);
+        Assert.Equal(3, recovered.DeadLetterCount);
+        Assert.Equal(1, recovered.SiteAuditWriteFailures);
+        Assert.Equal(2, recovered.AuditRedactionFailure);
+    }
+
+    /// <summary>
+    /// <see cref="IHealthReportTransport"/> that throws on the first
+    /// <c>Send</c> call (and only the first), then succeeds. Used by
+    /// <see cref="SendFailure_PreservesIntervalCountersForNextReport"/> to
+    /// verify the HealthMonitoring-017 restore-on-failure path.
+    /// </summary>
+    private sealed class FailingThenSucceedingTransport : IHealthReportTransport
+    {
+        // Count of Send invocations seen so far; bumped atomically.
+        private int _callCount;
+
+        public List<SiteHealthReport> SentReports { get; } = [];
+
+        public void Send(SiteHealthReport report)
+        {
+            if (Interlocked.Increment(ref _callCount) == 1)
+            {
+                throw new InvalidOperationException("transport temporarily unavailable");
+            }
+
+            SentReports.Add(report);
+        }
+    }
+
+    /// <summary>
+    /// HealthMonitoring-006 regression: the sequence-number seed must be derived
+    /// from the injected <see cref="TimeProvider"/> so the Unix-ms seeding strategy
+    /// is deterministically testable and the clock dependency is explicit, rather
+    /// than reading <c>DateTimeOffset.UtcNow</c> directly at field initialization.
+    /// </summary>
+    [Fact]
+    public void SequenceNumberSeed_UsesInjectedTimeProvider()
+    {
+        var frozenNow = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
+        var clock = new TestTimeProvider(frozenNow);
+
+        var sender = new HealthReportSender(
+            new SiteHealthCollector(),
+            new FakeTransport(),
+            Options.Create(new HealthMonitoringOptions()),
+            NullLogger<HealthReportSender>.Instance,
+            new FakeSiteIdentityProvider(),
+            timeProvider: clock);
+
+        var expectedSeed = frozenNow.ToUnixTimeMilliseconds();
+        Assert.Equal(expectedSeed, sender.CurrentSequenceNumber);
+    }
+}
@@ -0,0 +1,73 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// Bundle E (M6-T6) regression coverage. The site-side audit-log SQLite writer
+/// exposes a backlog snapshot (<c>SiteAuditBacklogSnapshot</c>) via the
+/// <c>ISiteAuditQueue.GetBacklogStatsAsync</c> surface. A periodic
+/// <c>SiteAuditBacklogReporter</c> hosted service polls that snapshot and
+/// pushes it into the collector via <see cref="ISiteHealthCollector.UpdateSiteAuditBacklog"/>
+/// so the next <see cref="ISiteHealthCollector.CollectReport"/> includes it in
+/// the report payload as <c>SiteAuditBacklog</c>. Unlike the
+/// SiteAuditWriteFailures / AuditRedactionFailure interval counters, the
+/// backlog snapshot is not reset on collect — the field carries forward
+/// whatever the most recent refresh pushed in.
+/// </summary>
+public class SiteAuditBacklogMetricTests
+{
+    // Site identifier passed to every CollectReport call in this suite.
+    private const string SiteId = "site-1";
+
+    private readonly SiteHealthCollector _collector = new();
+
+    /// <summary>A pushed snapshot appears unchanged on the next collected report.</summary>
+    [Fact]
+    public void Update_Then_CollectReport_IncludesBacklog()
+    {
+        var oldestPending = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc);
+        var pushed = new SiteAuditBacklogSnapshot(
+            PendingCount: 42,
+            OldestPendingUtc: oldestPending,
+            OnDiskBytes: 1234567);
+        _collector.UpdateSiteAuditBacklog(pushed);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(pushed, collected.SiteAuditBacklog);
+    }
+
+    [Fact]
+    public void Report_Payload_Includes_SiteAuditBacklog_AsNullByDefault()
+    {
+        // No refresh has been pushed yet — the report carries null so the
+        // central UI can distinguish "no data yet" from "queue empty".
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Null(collected.SiteAuditBacklog);
+    }
+
+    [Fact]
+    public void CollectReport_DoesNotReset_SiteAuditBacklog()
+    {
+        // Backlog snapshot is a point-in-time reading, not a per-interval
+        // counter — successive CollectReport calls before the next
+        // SiteAuditBacklogReporter tick MUST keep returning the same snapshot
+        // so a slow refresh cadence doesn't blank the central dashboard.
+        var pushed = new SiteAuditBacklogSnapshot(
+            PendingCount: 7,
+            OldestPendingUtc: null,
+            OnDiskBytes: 8192);
+        _collector.UpdateSiteAuditBacklog(pushed);
+
+        var firstCollect = _collector.CollectReport(SiteId);
+        var secondCollect = _collector.CollectReport(SiteId);
+
+        Assert.Equal(pushed, firstCollect.SiteAuditBacklog);
+        Assert.Equal(pushed, secondCollect.SiteAuditBacklog);
+    }
+
+    /// <summary>Passing null to the update method throws <see cref="ArgumentNullException"/>.</summary>
+    [Fact]
+    public void Update_With_Null_Throws_ArgumentNullException()
+    {
+        void PushNull()
+        {
+            _collector.UpdateSiteAuditBacklog(null!);
+        }
+
+        Assert.Throws<ArgumentNullException>(PushNull);
+    }
+}
@@ -0,0 +1,52 @@
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+/// <summary>
+/// Bundle G (M2-T11) regression coverage. The site-side Audit Log writer chain
+/// (FallbackAuditWriter) increments <see cref="IAuditWriteFailureCounter"/>
+/// every time the primary SQLite writer throws. Bundle G bridges that counter
+/// into the Site Health Monitoring report payload as <c>SiteAuditWriteFailures</c>
+/// so a sustained audit-write outage surfaces on /monitoring/health rather than
+/// disappearing into a NoOp sink.
+/// </summary>
+public class SiteAuditWriteFailuresMetricTests
+{
+    // Site identifier passed to every CollectReport call in this suite.
+    private const string SiteId = "site-1";
+
+    private readonly SiteHealthCollector _collector = new();
+
+    /// <summary>Bumps the audit-write-failure counter <paramref name="times"/> times.</summary>
+    private void RecordFailures(int times)
+    {
+        for (var round = 0; round < times; round++)
+        {
+            _collector.IncrementSiteAuditWriteFailures();
+        }
+    }
+
+    /// <summary>Three increments show up as a count of 3 on the next report.</summary>
+    [Fact]
+    public void Increment_Three_Times_Counter_Reports_3()
+    {
+        RecordFailures(3);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(3, collected.SiteAuditWriteFailures);
+    }
+
+    /// <summary>With no increments recorded, the reported count is zero.</summary>
+    [Fact]
+    public void Report_Payload_Includes_SiteAuditWriteFailures_AsZeroByDefault()
+    {
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(0, collected.SiteAuditWriteFailures);
+    }
+
+    /// <summary>
+    /// Mirrors the existing per-interval reset semantics for ScriptErrorCount /
+    /// AlarmEvaluationErrorCount / DeadLetterCount — SiteAuditWriteFailures is an
+    /// interval count, not a running total.
+    /// </summary>
+    [Fact]
+    public void CollectReport_Resets_SiteAuditWriteFailures()
+    {
+        RecordFailures(2);
+
+        var firstCollect = _collector.CollectReport(SiteId);
+        Assert.Equal(2, firstCollect.SiteAuditWriteFailures);
+
+        var secondCollect = _collector.CollectReport(SiteId);
+        Assert.Equal(0, secondCollect.SiteAuditWriteFailures);
+    }
+}
@@ -0,0 +1,282 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
+
+public class SiteHealthCollectorTests
+{
+    // Site identifier passed to every CollectReport call in this suite.
+    private const string SiteId = "site-1";
+
+    private readonly SiteHealthCollector _collector = new();
+
+    /// <summary>A collector with nothing recorded reports the site id and zeroed error counters.</summary>
+    [Fact]
+    public void CollectReport_ReturnsZeroCounters_WhenNoErrorsRecorded()
+    {
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal("site-1", collected.SiteId);
+        Assert.Equal(0, collected.ScriptErrorCount);
+        Assert.Equal(0, collected.AlarmEvaluationErrorCount);
+        Assert.Equal(0, collected.DeadLetterCount);
+    }
+
+    [Fact]
+    public void IncrementScriptError_AccumulatesBetweenReports()
+    {
+        for (var round = 0; round < 3; round++)
+        {
+            _collector.IncrementScriptError();
+        }
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(3, collected.ScriptErrorCount);
+    }
+
+    [Fact]
+    public void IncrementAlarmError_AccumulatesBetweenReports()
+    {
+        for (var round = 0; round < 2; round++)
+        {
+            _collector.IncrementAlarmError();
+        }
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(2, collected.AlarmEvaluationErrorCount);
+    }
+
+    [Fact]
+    public void IncrementDeadLetter_AccumulatesBetweenReports()
+    {
+        _collector.IncrementDeadLetter();
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(1, collected.DeadLetterCount);
+    }
+
+    /// <summary>The three error counters read 1 on the first collect and 0 on the second.</summary>
+    [Fact]
+    public void CollectReport_ResetsCounters_AfterCollection()
+    {
+        _collector.IncrementScriptError();
+        _collector.IncrementAlarmError();
+        _collector.IncrementDeadLetter();
+
+        var firstCollect = _collector.CollectReport(SiteId);
+        Assert.Equal(1, firstCollect.ScriptErrorCount);
+        Assert.Equal(1, firstCollect.AlarmEvaluationErrorCount);
+        Assert.Equal(1, firstCollect.DeadLetterCount);
+
+        var secondCollect = _collector.CollectReport(SiteId);
+        Assert.Equal(0, secondCollect.ScriptErrorCount);
+        Assert.Equal(0, secondCollect.AlarmEvaluationErrorCount);
+        Assert.Equal(0, secondCollect.DeadLetterCount);
+    }
+
+    [Fact]
+    public void UpdateConnectionHealth_ReflectedInReport()
+    {
+        _collector.UpdateConnectionHealth("opc-1", ConnectionHealth.Connected);
+        _collector.UpdateConnectionHealth("opc-2", ConnectionHealth.Disconnected);
+
+        var statuses = _collector.CollectReport(SiteId).DataConnectionStatuses;
+
+        Assert.Equal(2, statuses.Count);
+        Assert.Equal(ConnectionHealth.Connected, statuses["opc-1"]);
+        Assert.Equal(ConnectionHealth.Disconnected, statuses["opc-2"]);
+    }
+
+    /// <summary>Connection health recorded once is still present on a second collect.</summary>
+    [Fact]
+    public void ConnectionHealth_NotResetAfterCollect()
+    {
+        _collector.UpdateConnectionHealth("opc-1", ConnectionHealth.Connected);
+
+        _collector.CollectReport(SiteId);
+        var secondCollect = _collector.CollectReport(SiteId);
+
+        Assert.Single(secondCollect.DataConnectionStatuses);
+        Assert.Equal(ConnectionHealth.Connected, secondCollect.DataConnectionStatuses["opc-1"]);
+    }
+
+    /// <summary>Removing a connection clears both its health status and its tag-resolution entry.</summary>
+    [Fact]
+    public void RemoveConnection_RemovesFromReport()
+    {
+        _collector.UpdateConnectionHealth("opc-1", ConnectionHealth.Connected);
+        _collector.UpdateTagResolution("opc-1", 10, 8);
+        _collector.RemoveConnection("opc-1");
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Empty(collected.DataConnectionStatuses);
+        Assert.Empty(collected.TagResolutionCounts);
+    }
+
+    [Fact]
+    public void UpdateTagResolution_ReflectedInReport()
+    {
+        _collector.UpdateTagResolution("opc-1", 50, 45);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Single(collected.TagResolutionCounts);
+        var resolution = collected.TagResolutionCounts["opc-1"];
+        Assert.Equal(50, resolution.TotalSubscribed);
+        Assert.Equal(45, resolution.SuccessfullyResolved);
+    }
+
+    [Fact]
+    public void StoreAndForwardBufferDepths_DefaultsToEmpty_WhenSetterNotCalled()
+    {
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Empty(collected.StoreAndForwardBufferDepths);
+    }
+
+    /// <summary>The report timestamp falls between clock readings taken just before and after the collect.</summary>
+    [Fact]
+    public void CollectReport_IncludesUtcTimestamp()
+    {
+        var windowStart = DateTimeOffset.UtcNow;
+        var collected = _collector.CollectReport(SiteId);
+        var windowEnd = DateTimeOffset.UtcNow;
+
+        Assert.InRange(collected.ReportTimestamp, windowStart, windowEnd);
+    }
+
+    /// <summary>
+    /// HealthMonitoring-016 regression: <see cref="SiteHealthCollector.CollectReport"/>
+    /// must stamp <c>ReportTimestamp</c> from an injected <see cref="TimeProvider"/>
+    /// (consistent with the rest of the module), not directly from
+    /// <c>DateTimeOffset.UtcNow</c>, so the report timestamp is deterministically
+    /// testable against a known instant.
+    /// </summary>
+    [Fact]
+    public void CollectReport_StampsTimestamp_FromInjectedTimeProvider()
+    {
+        var frozenNow = new DateTimeOffset(2026, 5, 17, 9, 30, 0, TimeSpan.Zero);
+        var clock = new TestTimeProvider(frozenNow);
+        var collector = new SiteHealthCollector(clock);
+
+        var collected = collector.CollectReport(SiteId);
+
+        Assert.Equal(frozenNow, collected.ReportTimestamp);
+    }
+
+    [Fact]
+    public void CollectReport_SequenceNumberIsZero_CallerAssignsIt()
+    {
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(0, collected.SequenceNumber);
+    }
+
+    // HealthMonitoring-009 regression: the remaining collector setters had no
+    // "reflected in report" coverage. The following tests verify each setter's
+    // value reaches CollectReport output.
+
+    [Fact]
+    public void SetClusterNodes_ReflectedInReport()
+    {
+        List<ZB.MOM.WW.ScadaBridge.Commons.Messages.Health.NodeStatus> members =
+        [
+            new("node-a", true, "Active"),
+            new("node-b", true, "Standby")
+        ];
+        _collector.SetClusterNodes(members);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.NotNull(collected.ClusterNodes);
+        Assert.Equal(2, collected.ClusterNodes!.Count);
+        Assert.Equal("node-a", collected.ClusterNodes[0].Hostname);
+    }
+
+    [Fact]
+    public void SetInstanceCounts_ReflectedInReport()
+    {
+        _collector.SetInstanceCounts(deployed: 10, enabled: 7, disabled: 3);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(10, collected.DeployedInstanceCount);
+        Assert.Equal(7, collected.EnabledInstanceCount);
+        Assert.Equal(3, collected.DisabledInstanceCount);
+    }
+
+    [Fact]
+    public void SetParkedMessageCount_ReflectedInReport()
+    {
+        _collector.SetParkedMessageCount(42);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal(42, collected.ParkedMessageCount);
+    }
+
+    [Fact]
+    public void SetNodeHostname_ReflectedInReport()
+    {
+        _collector.SetNodeHostname("site-host-1");
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.Equal("site-host-1", collected.NodeHostname);
+    }
+
+    /// <summary>
+    /// Setting the flag to true yields role <c>Active</c>; setting it to false
+    /// yields <c>Standby</c>. <c>IsActiveNode</c> tracks the flag both times.
+    /// </summary>
+    [Fact]
+    public void SetActiveNode_ReflectedInNodeRole()
+    {
+        _collector.SetActiveNode(true);
+        var roleWhenActive = _collector.CollectReport(SiteId).NodeRole;
+        Assert.Equal("Active", roleWhenActive);
+        Assert.True(_collector.IsActiveNode);
+
+        _collector.SetActiveNode(false);
+        var roleWhenStandby = _collector.CollectReport(SiteId).NodeRole;
+        Assert.Equal("Standby", roleWhenStandby);
+        Assert.False(_collector.IsActiveNode);
+    }
+
+    [Fact]
+    public void UpdateTagQuality_ReflectedInReport()
+    {
+        _collector.UpdateTagQuality("opc-1", good: 80, bad: 15, uncertain: 5);
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.NotNull(collected.DataConnectionTagQuality);
+        var tagQuality = collected.DataConnectionTagQuality!["opc-1"];
+        Assert.Equal(80, tagQuality.Good);
+        Assert.Equal(15, tagQuality.Bad);
+        Assert.Equal(5, tagQuality.Uncertain);
+    }
+
+    [Fact]
+    public void UpdateConnectionEndpoint_ReflectedInReport()
+    {
+        _collector.UpdateConnectionEndpoint("opc-1", "opc.tcp://plc-1:4840");
+
+        var collected = _collector.CollectReport(SiteId);
+
+        Assert.NotNull(collected.DataConnectionEndpoints);
+        Assert.Equal("opc.tcp://plc-1:4840", collected.DataConnectionEndpoints!["opc-1"]);
+    }
+
+    [Fact]
+    public void SetStoreAndForwardDepths_ReflectedInReport()
+    {
+        var pendingByCategory = new Dictionary<string, int>
+        {
+            ["ExternalSystem"] = 5,
+            ["Notification"] = 2
+        };
+        _collector.SetStoreAndForwardDepths(pendingByCategory);
+
+        var depths = _collector.CollectReport(SiteId).StoreAndForwardBufferDepths;
+
+        Assert.Equal(5, depths["ExternalSystem"]);
+        Assert.Equal(2, depths["Notification"]);
+    }
+
+    /// <summary>
+    /// Three tasks each bump a different counter 10,000 times in parallel; the
+    /// collected report must show the full count for every counter.
+    /// </summary>
+    [Fact]
+    public async Task ThreadSafety_ConcurrentIncrements()
+    {
+        const int iterations = 10_000;
+
+        // Runs the given increment `iterations` times on a thread-pool task.
+        Task Hammer(Action increment) => Task.Run(() =>
+        {
+            for (var pass = 0; pass < iterations; pass++)
+            {
+                increment();
+            }
+        });
+
+        await Task.WhenAll(
+            Hammer(() => _collector.IncrementScriptError()),
+            Hammer(() => _collector.IncrementAlarmError()),
+            Hammer(() => _collector.IncrementDeadLetter()));
+
+        var collected = _collector.CollectReport(SiteId);
+        Assert.Equal(iterations, collected.ScriptErrorCount);
+        Assert.Equal(iterations, collected.AlarmEvaluationErrorCount);
+        Assert.Equal(iterations, collected.DeadLetterCount);
+    }
+}
@@ -0,0 +1,29 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <!-- Build settings: targets net10.0 with implicit usings and nullable
+       annotations enabled, treats every warning as an error, and is not packable. -->
+  <PropertyGroup>
+    <TargetFramework>net10.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
+    <IsPackable>false</IsPackable>
+  </PropertyGroup>
+
+  <!-- Package dependencies. No Version attributes appear here; presumably the
+       versions are supplied centrally elsewhere in the repository (TODO confirm). -->
+  <ItemGroup>
+    <PackageReference Include="coverlet.collector" />
+    <PackageReference Include="Microsoft.Data.Sqlite" />
+    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
+    <PackageReference Include="Microsoft.Extensions.Options" />
+    <PackageReference Include="Microsoft.NET.Test.Sdk" />
+    <PackageReference Include="xunit" />
+    <PackageReference Include="xunit.runner.visualstudio" />
+  </ItemGroup>
+
+  <!-- Project-wide using for the Xunit namespace, so source files in this
+       project do not need their own using directive for it. -->
+  <ItemGroup>
+    <Using Include="Xunit" />
+  </ItemGroup>
+
+  <!-- Reference to the HealthMonitoring project under src/. -->
+  <ItemGroup>
+    <ProjectReference Include="../../src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ZB.MOM.WW.ScadaBridge.HealthMonitoring.csproj" />
+  </ItemGroup>
+
+</Project>