refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj files,
namespaces, and the ScadaLinkDbContext → ScadaBridgeDbContext class updated.
ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated.
SQL roles/logins, LDAP domains, CLI command name, and CLI config dir
(~/.scadalink → ~/.scadabridge) also renamed.

Build green; 5 Host.Tests fail awaiting SQL login rename in next commit.
Pre-existing StaleTagMonitor timing flakes unchanged.

Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
Joseph Doherty
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,57 @@
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// Regression coverage for Bundle C (M5-T7). Whenever a header/body/SQL-param
/// redactor stage throws, the Audit Log payload filter
/// (<c>DefaultAuditPayloadFilter</c>) over-redacts the field with the
/// <c>&lt;redacted: redactor error&gt;</c> marker and bumps
/// <c>IAuditRedactionFailureCounter</c>. Bundle C carries that counter into
/// the Site Health Monitoring report payload as <c>AuditRedactionFailure</c>,
/// so a misconfigured / catastrophic regex shows up on /monitoring/health
/// instead of vanishing into a NoOp sink. The metric follows the Bundle G
/// <c>SiteAuditWriteFailures</c> shape: incremented per interval, reset on
/// collection, zero by default.
/// </summary>
public class AuditRedactionFailureMetricTests
{
    private const string SiteId = "site-1";

    private readonly SiteHealthCollector _collector = new();

    [Fact]
    public void Increment_Three_Times_Counter_Reports_3()
    {
        // Arrange / Act: three failures recorded within one interval.
        for (var recorded = 0; recorded < 3; recorded++)
        {
            _collector.IncrementAuditRedactionFailure();
        }

        // Assert
        Assert.Equal(3, _collector.CollectReport(SiteId).AuditRedactionFailure);
    }

    [Fact]
    public void Report_Payload_Includes_AuditRedactionFailure_AsZeroByDefault()
    {
        // Nothing was incremented, so the payload must carry a zero.
        Assert.Equal(0, _collector.CollectReport(SiteId).AuditRedactionFailure);
    }

    /// <summary>
    /// AuditRedactionFailure is an interval count rather than a running total:
    /// collecting a report resets it, matching ScriptErrorCount /
    /// AlarmEvaluationErrorCount / DeadLetterCount / SiteAuditWriteFailures.
    /// </summary>
    [Fact]
    public void CollectReport_Resets_AuditRedactionFailure()
    {
        _collector.IncrementAuditRedactionFailure();
        _collector.IncrementAuditRedactionFailure();

        // First collection drains the two recorded failures...
        var drained = _collector.CollectReport(SiteId);
        Assert.Equal(2, drained.AuditRedactionFailure);

        // ...so the next interval starts again from zero.
        var afterReset = _collector.CollectReport(SiteId);
        Assert.Equal(0, afterReset.AuditRedactionFailure);
    }
}
@@ -0,0 +1,426 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// Minimal manually-driven <see cref="TimeProvider"/> for tests: the reported
/// UTC time only changes when <see cref="Advance"/> is called.
/// </summary>
internal sealed class TestTimeProvider : TimeProvider
{
    private DateTimeOffset _current;

    /// <summary>Creates a clock that initially reports <paramref name="startTime"/>.</summary>
    public TestTimeProvider(DateTimeOffset startTime) => _current = startTime;

    /// <summary>Returns the instant the fake clock is currently set to.</summary>
    public override DateTimeOffset GetUtcNow()
    {
        return _current;
    }

    /// <summary>Moves the fake clock by <paramref name="duration"/>.</summary>
    public void Advance(TimeSpan duration)
    {
        _current = _current + duration;
    }
}
public class CentralHealthAggregatorTests
{
private readonly TestTimeProvider _timeProvider;
private readonly CentralHealthAggregator _aggregator;

/// <summary>
/// Wires the aggregator under test to a manually-advanced clock (started at
/// the current wall-clock instant) and a 60 s site offline timeout.
/// </summary>
public CentralHealthAggregatorTests()
{
    _timeProvider = new TestTimeProvider(DateTimeOffset.UtcNow);

    var monitoringOptions = new HealthMonitoringOptions
    {
        OfflineTimeout = TimeSpan.FromSeconds(60)
    };

    _aggregator = new CentralHealthAggregator(
        Options.Create(monitoringOptions),
        NullLogger<CentralHealthAggregator>.Instance,
        _timeProvider);
}
/// <summary>
/// Builds a report for <paramref name="siteId"/> with sequence number
/// <paramref name="seq"/>, empty status dictionaries, all counters at zero,
/// and a wall-clock <c>ReportTimestamp</c>.
/// </summary>
private static SiteHealthReport MakeReport(string siteId, long seq)
{
    return new SiteHealthReport(
        SiteId: siteId,
        SequenceNumber: seq,
        ReportTimestamp: DateTimeOffset.UtcNow,
        DataConnectionStatuses: new Dictionary<string, ConnectionHealth>(),
        TagResolutionCounts: new Dictionary<string, TagResolutionStatus>(),
        ScriptErrorCount: 0,
        AlarmEvaluationErrorCount: 0,
        StoreAndForwardBufferDepths: new Dictionary<string, int>(),
        DeadLetterCount: 0,
        DeployedInstanceCount: 0,
        EnabledInstanceCount: 0,
        DisabledInstanceCount: 0);
}
[Fact]
public void ProcessReport_StoresState_ForNewSite()
{
    const string siteId = "site-1";

    // Act: first report ever seen for this site.
    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // Assert: the site is now tracked, online, at the reported sequence.
    var stored = _aggregator.GetSiteState(siteId);
    Assert.NotNull(stored);
    Assert.True(stored.IsOnline);
    Assert.Equal(1, stored.LastSequenceNumber);
}
[Fact]
public void ProcessReport_UpdatesState_WhenSequenceIncreases()
{
    const string siteId = "site-1";

    // Two reports in ascending sequence order.
    _aggregator.ProcessReport(MakeReport(siteId, 1));
    _aggregator.ProcessReport(MakeReport(siteId, 2));

    // The later sequence number wins.
    var latest = _aggregator.GetSiteState(siteId);
    Assert.Equal(2, latest!.LastSequenceNumber);
}
[Fact]
public void ProcessReport_RejectsStaleReport_WhenSequenceNotGreater()
{
    const string siteId = "site-1";

    // A report with a lower sequence arrives after a higher one.
    _aggregator.ProcessReport(MakeReport(siteId, 5));
    _aggregator.ProcessReport(MakeReport(siteId, 3));

    // The stale report must not roll the stored sequence back.
    var current = _aggregator.GetSiteState(siteId);
    Assert.Equal(5, current!.LastSequenceNumber);
}
[Fact]
public void ProcessReport_RejectsEqualSequence()
{
    const string siteId = "site-1";

    // The same sequence number is delivered twice.
    _aggregator.ProcessReport(MakeReport(siteId, 5));
    _aggregator.ProcessReport(MakeReport(siteId, 5));

    // The stored sequence is unchanged after the duplicate.
    var current = _aggregator.GetSiteState(siteId);
    Assert.Equal(5, current!.LastSequenceNumber);
}
[Fact]
public void OfflineDetection_SiteGoesOffline_WhenNoReportWithinTimeout()
{
    const string siteId = "site-1";

    _aggregator.ProcessReport(MakeReport(siteId, 1));
    Assert.True(_aggregator.GetSiteState(siteId)!.IsOnline);

    // One second beyond the 60 s offline timeout configured in the constructor.
    var silence = TimeSpan.FromSeconds(61);
    _timeProvider.Advance(silence);
    _aggregator.CheckForOfflineSites();

    Assert.False(_aggregator.GetSiteState(siteId)!.IsOnline);
}
[Fact]
public void OnlineRecovery_SiteComesBackOnline_WhenReportReceived()
{
    const string siteId = "site-1";
    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // Silence past the offline timeout marks the site offline.
    _timeProvider.Advance(TimeSpan.FromSeconds(61));
    _aggregator.CheckForOfflineSites();
    var whileSilent = _aggregator.GetSiteState(siteId);
    Assert.False(whileSilent!.IsOnline);

    // A fresh report with a higher sequence brings it back online.
    _aggregator.ProcessReport(MakeReport(siteId, 2));
    var afterReport = _aggregator.GetSiteState(siteId);
    Assert.True(afterReport!.IsOnline);
}
[Fact]
public void OfflineDetection_SiteRemainsOnline_WhenReportWithinTimeout()
{
    const string siteId = "site-1";
    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // 30 s is still inside the 60 s offline timeout.
    _timeProvider.Advance(TimeSpan.FromSeconds(30));
    _aggregator.CheckForOfflineSites();

    var current = _aggregator.GetSiteState(siteId);
    Assert.True(current!.IsOnline);
}
[Fact]
public void GetAllSiteStates_ReturnsAllKnownSites()
{
    // Two distinct sites each send one report.
    _aggregator.ProcessReport(MakeReport("site-1", 1));
    _aggregator.ProcessReport(MakeReport("site-2", 1));

    var snapshot = _aggregator.GetAllSiteStates();

    // Both sites, and only those, are present in the snapshot.
    Assert.Equal(2, snapshot.Count);
    Assert.Contains("site-1", snapshot.Keys);
    Assert.Contains("site-2", snapshot.Keys);
}
[Fact]
public void GetSiteState_ReturnsNull_ForUnknownSite()
{
    // No report or heartbeat was ever recorded for this id.
    Assert.Null(_aggregator.GetSiteState("nonexistent"));
}
[Fact]
public void ProcessReport_StoresLatestReport()
{
    // A report with a distinctive counter value so it can be recognised later.
    var baseline = MakeReport("site-1", 1);
    var tagged = baseline with { ScriptErrorCount = 42 };

    _aggregator.ProcessReport(tagged);

    var stored = _aggregator.GetSiteState("site-1");
    Assert.Equal(42, stored!.LatestReport!.ScriptErrorCount);
}
/// <summary>
/// HealthMonitoring-002 regression: SiteHealthState is mutated from multiple
/// threads (ProcessReport, MarkHeartbeat, CheckForOfflineSites). With a mutable
/// class and unsynchronized field writes, a snapshot read could observe a torn
/// or half-applied state. The state must be immutable and every transition an
/// atomic reference swap, so a snapshot is always internally consistent and the
/// monotonic sequence-number guard is never subverted by a lost update.
/// </summary>
[Fact]
public async Task ProcessReport_ConcurrentUpdates_NeverLoseSequenceOrTearState()
{
    const int iterations = 5_000;

    // "<Clone>$" is the compiler-generated copy method of a C# record class,
    // so its presence shows SiteHealthState is declared as a record.
    // NOTE(review): this proves "record", not "immutable" — a record may
    // still expose settable properties; confirm against the type itself.
    Assert.True(typeof(SiteHealthState).GetMethod("<Clone>$") != null,
        "SiteHealthState must be an immutable record for safe concurrent reads.");

    // Seed the site so the reader below always finds a state with a report.
    _aggregator.ProcessReport(MakeReport("site-1", 0));

    // Writer: strictly increasing sequence numbers 1..iterations.
    var writer = Task.Run(() =>
    {
        for (long seq = 1; seq <= iterations; seq++)
        {
            _aggregator.ProcessReport(MakeReport("site-1", seq));
        }
    });

    // Heartbeater: interleaves heartbeat-only state transitions with the writer.
    var heartbeater = Task.Run(() =>
    {
        for (int i = 0; i < iterations; i++)
        {
            _aggregator.MarkHeartbeat("site-1", _timeProvider.GetUtcNow());
        }
    });

    // Reader: every snapshot it observes must be internally consistent.
    var reader = Task.Run(() =>
    {
        for (int i = 0; i < iterations; i++)
        {
            var state = _aggregator.GetSiteState("site-1");
            if (state == null)
            {
                continue;
            }

            // A consistent snapshot: the stored report's sequence number must
            // always match the state's LastSequenceNumber (no half-applied update).
            Assert.Equal(state.LastSequenceNumber, state.LatestReport!.SequenceNumber);
        }
    });

    // Assertion failures thrown inside any task surface through WhenAll.
    await Task.WhenAll(writer, heartbeater, reader);

    // The final state must reflect the highest sequence — no lost update.
    var final = _aggregator.GetSiteState("site-1");
    Assert.Equal(iterations, final!.LastSequenceNumber);
    Assert.Equal(iterations, final.LatestReport!.SequenceNumber);
    Assert.True(final.IsOnline);
}
/// <summary>
/// HealthMonitoring-007 regression: when a heartbeat arrives for a site that
/// has not sent a full report yet (e.g. right after a central restart/failover,
/// while the aggregator's in-memory state is empty), the site must be
/// registered as online instead of the heartbeat being silently dropped.
/// Otherwise reachable sites would show as "unknown" for up to a full report
/// interval during the failover window.
/// </summary>
[Fact]
public void MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport()
{
    const string siteId = "site-new";
    var heartbeatAt = _timeProvider.GetUtcNow();

    _aggregator.MarkHeartbeat(siteId, heartbeatAt);

    // Registered and online, but with no report attached yet.
    var registered = _aggregator.GetSiteState(siteId);
    Assert.NotNull(registered);
    Assert.True(registered.IsOnline);
    Assert.Null(registered.LatestReport);
    Assert.Equal(heartbeatAt, registered.LastHeartbeatAt);
}
/// <summary>
/// Regression test for HealthMonitoring-015. A site registered purely by a
/// heartbeat has never had a full report processed, so
/// <see cref="SiteHealthState.LastReportReceivedAt"/> must be <c>null</c>
/// rather than the <c>DateTimeOffset.MinValue</c> (year-0001) sentinel, which
/// the UI would otherwise render as a ~2000-year-stale timestamp. "No report
/// yet" is an explicit nullable state, in line with
/// <see cref="SiteHealthState.LatestReport"/>.
/// </summary>
[Fact]
public void MarkHeartbeat_RegistersUnknownSite_WithNullLastReportReceivedAt()
{
    const string siteId = "site-new";

    _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());

    var registered = _aggregator.GetSiteState(siteId);
    Assert.NotNull(registered);
    Assert.Null(registered.LastReportReceivedAt);
}
/// <summary>
/// Regression test for HealthMonitoring-015. After a full report is processed
/// for a site that was first registered by heartbeat,
/// <see cref="SiteHealthState.LastReportReceivedAt"/> holds a real (non-null)
/// instant.
/// </summary>
[Fact]
public void ProcessReport_SetsLastReportReceivedAt_ForHeartbeatRegisteredSite()
{
    const string siteId = "site-new";

    // Heartbeat first, then a report five (fake-clock) seconds later.
    _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
    _timeProvider.Advance(TimeSpan.FromSeconds(5));
    var expectedReceivedAt = _timeProvider.GetUtcNow();

    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // The receive time equals the fake clock's value when the report was processed.
    var afterReport = _aggregator.GetSiteState(siteId);
    Assert.NotNull(afterReport);
    Assert.Equal(expectedReceivedAt, afterReport.LastReportReceivedAt);
}
[Fact]
public void MarkHeartbeat_KeepsSiteOnline_BetweenReports()
{
    const string siteId = "site-1";
    var heartbeatGap = TimeSpan.FromSeconds(45);

    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // 90 s pass in total — more than the 60 s offline timeout — but a
    // heartbeat lands after each 45 s gap.
    _timeProvider.Advance(heartbeatGap);
    _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
    _timeProvider.Advance(heartbeatGap);
    _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());

    _aggregator.CheckForOfflineSites();

    var current = _aggregator.GetSiteState(siteId);
    Assert.True(current!.IsOnline);
}
[Fact]
public void MarkHeartbeat_BringsOfflineSiteBackOnline()
{
    const string siteId = "site-1";
    _aggregator.ProcessReport(MakeReport(siteId, 1));

    // Let the site time out.
    _timeProvider.Advance(TimeSpan.FromSeconds(61));
    _aggregator.CheckForOfflineSites();
    var timedOut = _aggregator.GetSiteState(siteId);
    Assert.False(timedOut!.IsOnline);

    // A heartbeat alone (no report) is enough to recover it.
    _aggregator.MarkHeartbeat(siteId, _timeProvider.GetUtcNow());
    var recovered = _aggregator.GetSiteState(siteId);
    Assert.True(recovered!.IsOnline);
}
/// <summary>
/// HealthMonitoring-020 regression: a transition from offline to online has to
/// come with a fresh LastHeartbeatAt. MarkHeartbeat used to take
/// <c>max(receivedAt, existing.LastHeartbeatAt)</c>; an out-of-order heartbeat
/// carrying an older timestamp would therefore bring the site online with a
/// stale heartbeat, and CheckForOfflineSites would flap it straight back to
/// offline on the next tick.
/// </summary>
[Fact]
public void MarkHeartbeat_OfflineToOnline_StampsFreshLastHeartbeatAt()
{
    const string siteId = "site-1";

    // Arrange: the site has timed out.
    _aggregator.ProcessReport(MakeReport(siteId, 1));
    _timeProvider.Advance(TimeSpan.FromSeconds(61));
    _aggregator.CheckForOfflineSites();
    Assert.False(_aggregator.GetSiteState(siteId)!.IsOnline);

    // Act: an out-of-order heartbeat arrives whose timestamp is older than
    // the existing LastHeartbeatAt (e.g. clock skew on the originating node).
    var nowAfter = _timeProvider.GetUtcNow();
    var stale = nowAfter - TimeSpan.FromSeconds(120);
    _aggregator.MarkHeartbeat(siteId, stale);

    // Assert: online, and stamped with ~"now" rather than the stale receivedAt.
    var state = _aggregator.GetSiteState(siteId)!;
    Assert.True(state.IsOnline);
    Assert.InRange((nowAfter - state.LastHeartbeatAt).TotalSeconds, 0, 5);

    // The very next offline check must not flap the site back to offline.
    _aggregator.CheckForOfflineSites();
    Assert.True(_aggregator.GetSiteState(siteId)!.IsOnline);
}
/// <summary>
/// HealthMonitoring-005 regression: nothing heartbeats the synthetic "central"
/// site — only the 30s CentralHealthReportLoop self-report bumps its
/// LastHeartbeatAt. One skipped/late self-report (leader GC pause, brief stall,
/// mid-failover) would leave it without a signal for &gt;60s and flap it
/// offline while the central cluster is in fact healthy. The "central"
/// keyspace entry therefore needs a longer offline grace than real sites.
/// </summary>
[Fact]
public void OfflineDetection_CentralSite_HasLongerGraceThanRealSites()
{
    const string realSiteId = "site-1";
    var centralId = CentralHealthReportLoop.CentralSiteId;

    _aggregator.ProcessReport(MakeReport(centralId, 1));
    _aggregator.ProcessReport(MakeReport(realSiteId, 1));

    // 75 s of silence: one missed central self-report (~30s) on top of the
    // normal 60s site timeout. A real site is already offline by now, but
    // central only self-reports every 30s, so 60s is barely two reports.
    _timeProvider.Advance(TimeSpan.FromSeconds(75));
    _aggregator.CheckForOfflineSites();

    var realSite = _aggregator.GetSiteState(realSiteId);
    Assert.False(realSite!.IsOnline);

    var central = _aggregator.GetSiteState(centralId);
    Assert.True(
        central!.IsOnline,
        "central must survive a single missed self-report");
}
[Fact]
public void OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss()
{
    var centralId = CentralHealthReportLoop.CentralSiteId;
    _aggregator.ProcessReport(MakeReport(centralId, 1));

    // Ten minutes of silence: far outside even the longer central grace
    // window, i.e. a genuine total loss.
    _timeProvider.Advance(TimeSpan.FromMinutes(10));
    _aggregator.CheckForOfflineSites();

    var central = _aggregator.GetSiteState(centralId);
    Assert.False(central!.IsOnline);
}
/// <summary>
/// HealthMonitoring-013 regression: the cadence of the offline check has to be
/// derived from whichever of <see cref="HealthMonitoringOptions.OfflineTimeout"/>
/// and <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> is
/// *shorter*. If an operator sets <c>CentralOfflineTimeout</c> below
/// <c>OfflineTimeout</c>, central offline detection then stays timely instead
/// of lagging by up to a full <c>OfflineTimeout / 2</c>.
/// </summary>
[Fact]
public void CheckInterval_IsHalfTheShorterTimeout()
{
    // Default shape: the 60s OfflineTimeout is the shorter one → 30s cadence.
    var siteTimeoutShorter = new HealthMonitoringOptions
    {
        OfflineTimeout = TimeSpan.FromSeconds(60),
        CentralOfflineTimeout = TimeSpan.FromMinutes(3)
    };
    var cadenceFromSiteTimeout = CentralHealthAggregator.ComputeCheckInterval(siteTimeoutShorter);
    Assert.Equal(TimeSpan.FromSeconds(30), cadenceFromSiteTimeout);

    // Operator override: the 20s CentralOfflineTimeout is now the shorter
    // one, so the cadence must adapt to 10s.
    var centralTimeoutShorter = new HealthMonitoringOptions
    {
        OfflineTimeout = TimeSpan.FromSeconds(60),
        CentralOfflineTimeout = TimeSpan.FromSeconds(20)
    };
    var cadenceFromCentralTimeout = CentralHealthAggregator.ComputeCheckInterval(centralTimeoutShorter);
    Assert.Equal(TimeSpan.FromSeconds(10), cadenceFromCentralTimeout);
}
[Fact]
public void SequenceNumberReset_RejectedUntilExceedsPrevMax()
{
    const string siteId = "site-1";

    // The site reports seq 10, restarts, and starts again at seq 1. By
    // design the sequence resets on singleton restart, and the aggregator
    // rejects 1 < 10 — that is the expected behavior.
    _aggregator.ProcessReport(MakeReport(siteId, 10));
    _aggregator.ProcessReport(MakeReport(siteId, 1));
    var afterReset = _aggregator.GetSiteState(siteId);
    Assert.Equal(10, afterReset!.LastSequenceNumber);

    // As soon as a report exceeds the old maximum it is accepted again.
    // SiteHealthState is an immutable snapshot, so the state is re-fetched
    // to observe the update.
    _aggregator.ProcessReport(MakeReport(siteId, 11));
    var afterCatchUp = _aggregator.GetSiteState(siteId);
    Assert.Equal(11, afterCatchUp!.LastSequenceNumber);
}
}
@@ -0,0 +1,282 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// HealthMonitoring-009 regression: the central self-report loop had no test
/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
/// self-report generation for siteId="central", and monotonic sequence
/// assignment.
/// </summary>
public class CentralHealthReportLoopTests
{
/// <summary>
/// Settable <see cref="IClusterNodeProvider"/> stub: tests choose the primary
/// flag and the node list it reports.
/// </summary>
private sealed class FakeClusterNodeProvider : IClusterNodeProvider
{
    public bool SelfIsPrimary { get; set; }

    public IReadOnlyList<NodeStatus> Nodes { get; set; } = [];

    public IReadOnlyList<NodeStatus> GetClusterNodes()
    {
        return Nodes;
    }
}
/// <summary>
/// <see cref="ICentralHealthAggregator"/> test double that records every
/// processed report in <see cref="Processed"/>; heartbeats are ignored and the
/// state queries return nothing.
/// </summary>
private sealed class RecordingAggregator : ICentralHealthAggregator
{
    // NOTE(review): a plain List with no synchronization. Tests poll
    // Processed.Count while the loop is running — presumably from a different
    // thread than the one calling ProcessReport; confirm this is acceptable.
    public List<SiteHealthReport> Processed { get; } = [];

    public void ProcessReport(SiteHealthReport report)
    {
        Processed.Add(report);
    }

    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
    {
        // Intentionally a no-op.
    }

    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
    {
        return new Dictionary<string, SiteHealthState>();
    }

    public SiteHealthState? GetSiteState(string siteId)
    {
        return null;
    }
}
/// <summary>
/// HealthMonitoring-022 de-flake. <see cref="CentralHealthReportLoop"/> ticks
/// on a real <see cref="PeriodicTimer"/>, which makes it timing-sensitive, and
/// a virtual clock cannot drive it without refactoring the production loop
/// (PeriodicTimer doesn't consume <see cref="TimeProvider"/>). So the wait
/// stays on the wall clock but is generous: the loop is started, then
/// <paramref name="condition"/> is polled every 25 ms until it holds or
/// <paramref name="maxWait"/> (5 s when omitted) has elapsed, after which the
/// loop is stopped. The token handed to <c>StartAsync</c> is cancelled one
/// second after that budget. With the 50 ms <c>ReportInterval</c> the tests
/// configure, the condition is normally met almost immediately; under heavy
/// CI load the poll tolerates slow ticks instead of asserting on an empty
/// list the way a fixed <see cref="Task.Delay"/> would.
/// </summary>
private static async Task RunLoopUntil(
    CentralHealthReportLoop loop,
    Func<bool> condition,
    TimeSpan? maxWait = null)
{
    TimeSpan pollBudget = maxWait.HasValue ? maxWait.Value : TimeSpan.FromSeconds(5);
    using var hardStop = new CancellationTokenSource(pollBudget + TimeSpan.FromSeconds(1));

    try
    {
        await loop.StartAsync(hardStop.Token);

        var elapsed = System.Diagnostics.Stopwatch.StartNew();
        while (true)
        {
            // Budget is checked first so the condition is not evaluated
            // again once time has run out.
            if (elapsed.Elapsed >= pollBudget || condition())
            {
                break;
            }

            await Task.Delay(25, CancellationToken.None);
        }

        await loop.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is swallowed on purpose; callers assert on what the
        // aggregator recorded.
    }
}
/// <summary>
/// For tests that let the loop run for a bounded period without waiting on a
/// specific condition (e.g. asserting that <i>no</i> reports were produced).
/// The loop runs for <paramref name="runForMs"/> milliseconds but never less
/// than 1 s — see <see cref="RunLoopUntil"/> for why the wait is generous. The
/// token handed to <c>StartAsync</c> is cancelled one second after that.
/// </summary>
private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
{
    // Enforce the 1 s floor regardless of what the caller asked for.
    var runMs = runForMs < 1000 ? 1000 : runForMs;
    var hardStopAfter = TimeSpan.FromMilliseconds(runMs + 1000);

    using var hardStop = new CancellationTokenSource(hardStopAfter);
    try
    {
        await loop.StartAsync(hardStop.Token);
        await Task.Delay(runMs, CancellationToken.None);
        await loop.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is swallowed on purpose; callers assert afterwards.
    }
}
[Fact]
public async Task GeneratesCentralReports_WhenSelfIsPrimary()
{
    // Arrange: this node is the primary, ticking every 50 ms.
    var recorder = new RecordingAggregator();
    var monitoringOptions = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        recorder,
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        Options.Create(monitoringOptions),
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act — HealthMonitoring-022: poll for up to 5 s until at least one
    // report has fired instead of a fixed-budget Task.Delay; this tolerates
    // slow CI runners.
    await RunLoopUntil(loop, () => recorder.Processed.Count >= 1);

    // Assert: reports were produced and all are attributed to the central site.
    Assert.NotEmpty(recorder.Processed);
    Assert.All(
        recorder.Processed,
        report => Assert.Equal(CentralHealthReportLoop.CentralSiteId, report.SiteId));
}
[Fact]
public async Task GeneratesNoReports_WhenNotPrimary()
{
    // Arrange: this node is NOT the primary.
    var recorder = new RecordingAggregator();
    var monitoringOptions = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        recorder,
        new FakeClusterNodeProvider { SelfIsPrimary = false },
        Options.Create(monitoringOptions),
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act: let the loop run for a bounded period.
    await RunLoopBriefly(loop, 250);

    // Assert: nothing was handed to the aggregator.
    Assert.Empty(recorder.Processed);
}
[Fact]
public async Task AssignsMonotonicSequenceNumbers()
{
    var recorder = new RecordingAggregator();
    var monitoringOptions = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        recorder,
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        Options.Create(monitoringOptions),
        NullLogger<CentralHealthReportLoop>.Instance);

    // HealthMonitoring-022: poll for up to 5 s until two reports exist. A
    // fixed 300 ms window could miss the second tick on a slow CI runner.
    await RunLoopUntil(loop, () => recorder.Processed.Count >= 2);

    var reports = recorder.Processed;
    Assert.True(reports.Count >= 2,
        $"Expected at least 2 reports, got {reports.Count}");

    // Each report must carry a strictly larger sequence than its predecessor.
    for (int i = 1; i < reports.Count; i++)
    {
        var previousSeq = reports[i - 1].SequenceNumber;
        var currentSeq = reports[i].SequenceNumber;
        Assert.True(
            currentSeq > previousSeq,
            $"Sequence numbers not strictly increasing at index {i}");
    }
}
/// <summary>
/// HealthMonitoring-006 regression: the seed of the central loop's sequence
/// number has to come from the injected <see cref="TimeProvider"/> (Unix-ms)
/// rather than from <c>DateTimeOffset.UtcNow</c> read at field initialization,
/// which makes the seeding strategy deterministically testable.
/// </summary>
[Fact]
public void SequenceNumberSeed_UsesInjectedTimeProvider()
{
    // Arrange: a clock pinned to a known instant.
    var fixedInstant = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
    var pinnedClock = new TestTimeProvider(fixedInstant);
    var expectedSeed = fixedInstant.ToUnixTimeMilliseconds();

    // Act: construct the loop without ever starting it.
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        new RecordingAggregator(),
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        Options.Create(new HealthMonitoringOptions()),
        NullLogger<CentralHealthReportLoop>.Instance,
        pinnedClock);

    // Assert: the seed is the pinned instant in Unix milliseconds.
    Assert.Equal(expectedSeed, loop.CurrentSequenceNumber);
}
/// <summary>
/// HealthMonitoring-018 regression: if
/// <see cref="ICentralHealthAggregator.ProcessReport"/> throws, the
/// per-interval counters just drained by
/// <see cref="SiteHealthCollector.CollectReport"/> have to be put back into
/// the shared collector, so they carry over into the next interval instead of
/// being silently lost. This has the same shape as the HealthMonitoring-017
/// fix in <see cref="HealthReportSender"/>.
/// </summary>
[Fact]
public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
{
    static void Times(int count, Action increment)
    {
        for (var done = 0; done < count; done++)
        {
            increment();
        }
    }

    // Every per-interval counter gets a non-zero value so the restore path
    // of each field is exercised. The loop's first iteration reads and
    // resets them via CollectReport, ProcessReport then throws, and the
    // restore has to put them back.
    var collector = new SiteHealthCollector();
    Times(2, () => collector.IncrementScriptError());
    Times(1, () => collector.IncrementAlarmError());
    Times(3, () => collector.IncrementDeadLetter());
    Times(1, () => collector.IncrementSiteAuditWriteFailures());
    Times(2, () => collector.IncrementAuditRedactionFailure());

    var aggregator = new FailingThenSucceedingAggregator();
    var monitoringOptions = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var loop = new CentralHealthReportLoop(
        collector,
        aggregator,
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        Options.Create(monitoringOptions),
        NullLogger<CentralHealthReportLoop>.Instance);

    // HealthMonitoring-022: the first ProcessReport call throws (counters are
    // restored) and the second succeeds. Only successful calls are recorded
    // in Processed, so poll for up to 5 s for that first recorded entry
    // rather than relying on a fixed 450 ms budget.
    await RunLoopUntil(loop, () => aggregator.Processed.Count >= 1);

    // The first successful report must carry the counts accumulated during
    // the interval whose delivery failed.
    Assert.NotEmpty(aggregator.Processed);
    var carried = aggregator.Processed[0];
    Assert.Equal(2, carried.ScriptErrorCount);
    Assert.Equal(1, carried.AlarmEvaluationErrorCount);
    Assert.Equal(3, carried.DeadLetterCount);
    Assert.Equal(1, carried.SiteAuditWriteFailures);
    Assert.Equal(2, carried.AuditRedactionFailure);
}
/// <summary>
/// <see cref="ICentralHealthAggregator"/> test double that throws on its very
/// first <c>ProcessReport</c> call and records every later one. It backs
/// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>,
/// which checks the HealthMonitoring-018 restore-on-failure path.
/// </summary>
private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
{
    private int _calls;

    /// <summary>Reports from successful calls only; the failed first call is not recorded.</summary>
    public List<SiteHealthReport> Processed { get; } = [];

    public void ProcessReport(SiteHealthReport report)
    {
        // Interlocked keeps "first call" unambiguous even if calls overlap.
        var isFirstCall = Interlocked.Increment(ref _calls) == 1;
        if (isFirstCall)
        {
            throw new InvalidOperationException("aggregator temporarily unavailable");
        }

        Processed.Add(report);
    }

    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
    {
        // Intentionally a no-op.
    }

    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
    {
        return new Dictionary<string, SiteHealthState>();
    }

    public SiteHealthState? GetSiteState(string siteId)
    {
        return null;
    }
}
[Fact]
public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
{
    // The loop must still report the node's role to the collector when it is
    // the standby, so the standby's own node card shows the correct role.
    var collector = new SiteHealthCollector();

    // Start from the opposite value. Without this, the final assertion would
    // also pass when the loop never touches the flag at all (assuming a
    // freshly constructed collector already reports "not active" —
    // NOTE(review): confirm the default). Pre-setting it proves the loop
    // actively wrote the standby role.
    collector.SetActiveNode(true);

    var aggregator = new RecordingAggregator();
    var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
    var options = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var loop = new CentralHealthReportLoop(
        collector, aggregator, clusterNodes, options,
        NullLogger<CentralHealthReportLoop>.Instance);

    await RunLoopBriefly(loop, 150);

    Assert.False(collector.IsActiveNode);
}
}
@@ -0,0 +1,73 @@
using Microsoft.Extensions.Options;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// HealthMonitoring-014 regression: the intervals on
/// <see cref="HealthMonitoringOptions"/> go straight into
/// <c>new PeriodicTimer(...)</c>, and a zero/negative period makes that throw
/// <see cref="ArgumentOutOfRangeException"/>. A misconfigured
/// <c>appsettings.json</c> therefore has to be rejected by an
/// <see cref="IValidateOptions{TOptions}"/> with a clear message naming the
/// offending key, instead of crashing the hosted service with an opaque
/// exception.
/// </summary>
public class HealthMonitoringOptionsValidatorTests
{
    /// <summary>Runs a fresh validator over <paramref name="options"/> under the default options name.</summary>
    private static ValidateOptionsResult Validate(HealthMonitoringOptions options)
    {
        var validator = new HealthMonitoringOptionsValidator();
        return validator.Validate(Options.DefaultName, options);
    }

    /// <summary>
    /// Asserts that validation fails and that the failure message mentions
    /// <paramref name="expectedKey"/>.
    /// </summary>
    private static void AssertRejected(HealthMonitoringOptions options, string expectedKey)
    {
        var outcome = Validate(options);
        Assert.True(outcome.Failed);
        Assert.Contains(expectedKey, outcome.FailureMessage);
    }

    [Fact]
    public void DefaultOptions_AreValid()
    {
        var outcome = Validate(new HealthMonitoringOptions());

        Assert.True(outcome.Succeeded, outcome.FailureMessage);
    }

    [Fact]
    public void ZeroReportInterval_IsRejected()
    {
        var options = new HealthMonitoringOptions { ReportInterval = TimeSpan.Zero };

        AssertRejected(options, "ReportInterval");
    }

    [Fact]
    public void NegativeReportInterval_IsRejected()
    {
        var options = new HealthMonitoringOptions { ReportInterval = TimeSpan.FromSeconds(-1) };

        AssertRejected(options, "ReportInterval");
    }

    [Fact]
    public void ZeroOfflineTimeout_IsRejected()
    {
        var options = new HealthMonitoringOptions { OfflineTimeout = TimeSpan.Zero };

        AssertRejected(options, "OfflineTimeout");
    }

    [Fact]
    public void ZeroCentralOfflineTimeout_IsRejected()
    {
        var options = new HealthMonitoringOptions { CentralOfflineTimeout = TimeSpan.Zero };

        AssertRejected(options, "CentralOfflineTimeout");
    }

    [Fact]
    public void CentralOfflineTimeout_ShorterThanOfflineTimeout_IsRejected()
    {
        // The central timeout (30 s) is below the site timeout (60 s).
        var options = new HealthMonitoringOptions
        {
            OfflineTimeout = TimeSpan.FromSeconds(60),
            CentralOfflineTimeout = TimeSpan.FromSeconds(30)
        };

        AssertRejected(options, "CentralOfflineTimeout");
    }
}
@@ -0,0 +1,423 @@
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
public class HealthReportSenderTests
{
/// <summary>
/// <see cref="IHealthReportTransport"/> test double that keeps every sent
/// report in <see cref="SentReports"/>, in send order.
/// </summary>
private class FakeTransport : IHealthReportTransport
{
    public List<SiteHealthReport> SentReports { get; } = new();

    public void Send(SiteHealthReport report)
    {
        SentReports.Add(report);
    }
}
/// <summary>
/// <see cref="ISiteIdentityProvider"/> stub with a settable site id that
/// defaults to "test-site".
/// </summary>
private class FakeSiteIdentityProvider : ISiteIdentityProvider
{
    private string _siteId = "test-site";

    public string SiteId
    {
        get => _siteId;
        set => _siteId = value;
    }
}
/// <summary>
/// Logger that records everything written to it, letting tests assert that
/// non-fatal failures are surfaced (HealthMonitoring-010) instead of being
/// silently swallowed.
/// </summary>
private sealed class CapturingLogger<T> : ILogger<T>
{
    /// <summary>One captured log call: level, formatted message, and the exception if any.</summary>
    public sealed record Entry(LogLevel Level, string Message, Exception? Exception);

    public List<Entry> Entries { get; } = [];

    public IDisposable BeginScope<TState>(TState state) where TState : notnull
    {
        // Scopes are not tracked; a shared no-op disposable is enough.
        return NullScope.Instance;
    }

    public bool IsEnabled(LogLevel logLevel)
    {
        // Every level is captured.
        return true;
    }

    public void Log<TState>(
        LogLevel logLevel,
        EventId eventId,
        TState state,
        Exception? exception,
        Func<TState, Exception?, string> formatter)
    {
        // Entries doubles as the lock object, serializing concurrent Log calls.
        lock (Entries)
        {
            var captured = new Entry(logLevel, formatter(state, exception), exception);
            Entries.Add(captured);
        }
    }

    private sealed class NullScope : IDisposable
    {
        public static readonly NullScope Instance = new();

        public void Dispose()
        {
            // Nothing to release.
        }
    }
}
/// <summary>
/// An <see cref="IClusterNodeProvider"/> that reports itself as primary but
/// throws from every <see cref="GetClusterNodes"/> call.
/// </summary>
private sealed class ThrowingClusterNodeProvider : IClusterNodeProvider
{
    public bool SelfIsPrimary
    {
        get { return true; }
    }

    public IReadOnlyList<NodeStatus> GetClusterNodes()
    {
        throw new InvalidOperationException("cluster query failed");
    }
}
[Fact]
public async Task SendsReportsWithMonotonicSequenceNumbers()
{
    var transport = new FakeTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);
    var options = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var sender = new HealthReportSender(
        collector,
        transport,
        options,
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider { SiteId = "site-A" });

    // De-flake, same approach as the HealthMonitoring-022 change in
    // CentralHealthReportLoopTests: a fixed 280 ms window under a 300 ms
    // cancellation cap can miss the second 50 ms tick on a loaded CI runner.
    // Poll for up to 5 s until two reports have been sent; the token handed
    // to StartAsync is only cancelled one second after that budget.
    var maxWait = TimeSpan.FromSeconds(5);
    using var cts = new CancellationTokenSource(maxWait + TimeSpan.FromSeconds(1));
    try
    {
        await sender.StartAsync(cts.Token);
        var sw = System.Diagnostics.Stopwatch.StartNew();
        while (sw.Elapsed < maxWait && transport.SentReports.Count < 2)
        {
            await Task.Delay(25, CancellationToken.None);
        }
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException) { }

    // Should have sent several reports
    Assert.True(transport.SentReports.Count >= 2,
        $"Expected at least 2 reports, got {transport.SentReports.Count}");

    // Verify strictly-monotonic sequence numbers and matching site id
    for (int i = 0; i < transport.SentReports.Count; i++)
    {
        if (i > 0)
        {
            Assert.True(
                transport.SentReports[i].SequenceNumber > transport.SentReports[i - 1].SequenceNumber,
                $"Sequence numbers not strictly increasing at index {i}");
        }

        Assert.Equal("site-A", transport.SentReports[i].SiteId);
    }
}
/// <summary>
/// The sequence counter is seeded with Unix-ms at construction so a
/// freshly-active node always sorts after the prior active one. This test
/// checks that the first emitted sequence number is at least the Unix-ms
/// instant captured just before the sender was constructed.
/// </summary>
[Fact]
public async Task FirstReportSequenceExceedsStartupUnixMs()
{
    var transport = new FakeTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);

    var settings = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var options = Options.Create(settings);

    // Captured before the constructor runs so it is a valid lower bound for the seed.
    long startupEpochMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
    var sender = new HealthReportSender(
        collector,
        transport,
        options,
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider());

    using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(150));
    try
    {
        await sender.StartAsync(cts.Token);
        await Task.Delay(120, CancellationToken.None);
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation during the run window is expected and not a failure.
    }

    Assert.NotEmpty(transport.SentReports);

    var firstSequence = transport.SentReports[0].SequenceNumber;
    Assert.True(
        firstSequence >= startupEpochMs,
        $"First sequence {firstSequence} should be >= startup epoch {startupEpochMs}");
}
/// <summary>
/// Every emitted report must carry a timestamp that falls inside the test's
/// run window and has a zero (UTC) offset.
/// </summary>
[Fact]
public async Task ReportsIncludeUtcTimestamp()
{
    var transport = new FakeTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);

    var settings = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var sender = new HealthReportSender(
        collector,
        transport,
        Options.Create(settings),
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider());

    var windowStart = DateTimeOffset.UtcNow;
    using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(150));
    try
    {
        await sender.StartAsync(cts.Token);
        await Task.Delay(120, CancellationToken.None);
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation during the run window is expected and not a failure.
    }
    var windowEnd = DateTimeOffset.UtcNow;

    var reports = transport.SentReports;
    Assert.NotEmpty(reports);
    for (var index = 0; index < reports.Count; index++)
    {
        var stamp = reports[index].ReportTimestamp;
        Assert.InRange(stamp, windowStart, windowEnd);
        Assert.Equal(TimeSpan.Zero, stamp.Offset);
    }
}
/// <summary>
/// HealthMonitoring-001 regression: the documented "store-and-forward buffer
/// depth" metric (pending messages by category) must actually be populated in
/// the emitted report. Previously SetStoreAndForwardDepths had no callers, so
/// StoreAndForwardBufferDepths was always empty. The sender must query the S&amp;F
/// engine's per-category depth API and include it alongside the parked count.
/// </summary>
[Fact]
public async Task ReportsIncludeStoreAndForwardBufferDepthsFromStorage()
{
    var connStr = $"Data Source=HealthSfDepth_{Guid.NewGuid():N};Mode=Memory;Cache=Shared";

    // A shared-cache in-memory database lives only while a connection is open,
    // so one is held for the duration of the test.
    using var keepAlive = new SqliteConnection(connStr);
    keepAlive.Open();

    var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
    await storage.InitializeAsync();

    // Seed two pending ExternalSystem messages and one pending Notification message.
    var seeds = new[]
    {
        (Id: "m1", Category: StoreAndForwardCategory.ExternalSystem),
        (Id: "m2", Category: StoreAndForwardCategory.ExternalSystem),
        (Id: "m3", Category: StoreAndForwardCategory.Notification),
    };
    foreach (var seed in seeds)
    {
        await storage.EnqueueAsync(MakePendingMessage(seed.Id, seed.Category));
    }

    var transport = new FakeTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);

    var settings = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var sender = new HealthReportSender(
        collector,
        transport,
        Options.Create(settings),
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider(),
        sfStorage: storage);

    using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(300));
    try
    {
        await sender.StartAsync(cts.Token);
        await Task.Delay(250, CancellationToken.None);
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation during the run window is expected and not a failure.
    }

    Assert.NotEmpty(transport.SentReports);

    // Inspect the most recent report.
    var latest = transport.SentReports[transport.SentReports.Count - 1];
    var depths = latest.StoreAndForwardBufferDepths;

    const string externalKey = nameof(StoreAndForwardCategory.ExternalSystem);
    const string notificationKey = nameof(StoreAndForwardCategory.Notification);
    const string cachedWriteKey = nameof(StoreAndForwardCategory.CachedDbWrite);

    Assert.Equal(2, depths[externalKey]);
    Assert.Equal(1, depths[notificationKey]);
    Assert.False(depths.ContainsKey(cachedWriteKey));
}
/// <summary>
/// Builds a <see cref="StoreAndForwardMessage"/> in the Pending state with the
/// given id and category; all other fields use fixed test values.
/// </summary>
/// <param name="id">Message id to assign.</param>
/// <param name="category">Store-and-forward category to assign.</param>
/// <returns>A new pending message with zero retries so far.</returns>
private static StoreAndForwardMessage MakePendingMessage(string id, StoreAndForwardCategory category)
{
    var message = new StoreAndForwardMessage
    {
        Id = id,
        Category = category,
        Target = "target",
        PayloadJson = "{}",
        RetryCount = 0,
        MaxRetries = 50,
        RetryIntervalMs = 30_000,
        CreatedAt = DateTimeOffset.UtcNow,
        Status = StoreAndForwardMessageStatus.Pending
    };
    return message;
}
/// <summary>
/// Immediately after construction, before any report is sent, the sender's
/// current sequence number must lie between the Unix-ms instants sampled just
/// before and just after the constructor call.
/// </summary>
[Fact]
public void InitialSequenceNumberSeededWithUnixMs()
{
    var transport = new FakeTransport();
    var collector = new SiteHealthCollector();
    var defaults = new HealthMonitoringOptions();
    var options = Options.Create(defaults);

    long lowerBoundMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
    var sender = new HealthReportSender(
        collector,
        transport,
        options,
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider());
    long upperBoundMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();

    Assert.InRange(sender.CurrentSequenceNumber, lowerBoundMs, upperBoundMs);
}
/// <summary>
/// HealthMonitoring-010 regression: a failure refreshing cluster nodes is
/// non-fatal (the report still ships) but must no longer be swallowed by a
/// bare <c>catch {}</c> — it must be logged as a warning with the exception so
/// persistent degradation is diagnosable.
/// </summary>
[Fact]
public async Task ClusterNodeRefreshFailure_IsLoggedNotSwallowed()
{
    var transport = new FakeTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);
    var logger = new CapturingLogger<HealthReportSender>();

    var settings = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var failingProvider = new ThrowingClusterNodeProvider();
    var sender = new HealthReportSender(
        collector,
        transport,
        Options.Create(settings),
        logger,
        new FakeSiteIdentityProvider(),
        clusterNodeProvider: failingProvider);

    using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(300));
    try
    {
        await sender.StartAsync(cts.Token);
        await Task.Delay(250, CancellationToken.None);
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation during the run window is expected and not a failure.
    }

    // Reports keep flowing even though the cluster query throws each time...
    Assert.NotEmpty(transport.SentReports);

    // ...and the failure shows up as a warning that carries the exception.
    // Copy the entries under the logger's lock, then filter outside it.
    CapturingLogger<HealthReportSender>.Entry[] snapshot;
    lock (logger.Entries)
    {
        snapshot = logger.Entries.ToArray();
    }

    var warnings = Array.FindAll(
        snapshot,
        entry => entry.Level == LogLevel.Warning && entry.Exception is InvalidOperationException);

    Assert.NotEmpty(warnings);
    Assert.Contains(
        warnings,
        warning => warning.Message.Contains("cluster nodes", StringComparison.OrdinalIgnoreCase));
}
/// <summary>
/// HealthMonitoring-017 regression: when the transport's <c>Send</c> throws,
/// the per-interval counters that <see cref="SiteHealthCollector.CollectReport"/>
/// just drained via <c>Interlocked.Exchange</c> must be restored back into the
/// collector so they roll forward into the next interval rather than being
/// silently lost. Before the fix, a transport failure left the counts in the
/// un-sent report only, and the next successful report shipped with the
/// counters at zero.
/// </summary>
[Fact]
public async Task SendFailure_PreservesIntervalCountersForNextReport()
{
    const int scriptErrors = 2;
    const int alarmErrors = 1;
    const int deadLetters = 3;
    const int auditWriteFailures = 1;
    const int redactionFailures = 2;

    var transport = new FailingThenSucceedingTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);

    // Give every per-interval counter a non-zero value so the restore path is
    // exercised for each field: script error, alarm error, dead letter, site
    // audit write failure and audit redaction failure.
    for (var i = 0; i < scriptErrors; i++)
    {
        collector.IncrementScriptError();
    }
    for (var i = 0; i < alarmErrors; i++)
    {
        collector.IncrementAlarmError();
    }
    for (var i = 0; i < deadLetters; i++)
    {
        collector.IncrementDeadLetter();
    }
    for (var i = 0; i < auditWriteFailures; i++)
    {
        collector.IncrementSiteAuditWriteFailures();
    }
    for (var i = 0; i < redactionFailures; i++)
    {
        collector.IncrementAuditRedactionFailure();
    }

    var settings = new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    };
    var sender = new HealthReportSender(
        collector,
        transport,
        Options.Create(settings),
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider());

    using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(500));
    try
    {
        await sender.StartAsync(cts.Token);
        await Task.Delay(450, CancellationToken.None);
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation during the run window is expected and not a failure.
    }

    // The transport rejects only the first Send. The first report it accepts
    // must therefore include the counts from the interval whose send failed.
    Assert.NotEmpty(transport.SentReports);
    var recovered = transport.SentReports[0];
    Assert.Equal(scriptErrors, recovered.ScriptErrorCount);
    Assert.Equal(alarmErrors, recovered.AlarmEvaluationErrorCount);
    Assert.Equal(deadLetters, recovered.DeadLetterCount);
    Assert.Equal(auditWriteFailures, recovered.SiteAuditWriteFailures);
    Assert.Equal(redactionFailures, recovered.AuditRedactionFailure);
}
/// <summary>
/// <see cref="IHealthReportTransport"/> that throws on the first
/// <c>Send</c> call (and only the first), then succeeds. Used by
/// <see cref="SendFailure_PreservesIntervalCountersForNextReport"/> to
/// verify the HealthMonitoring-017 restore-on-failure path.
/// </summary>
private sealed class FailingThenSucceedingTransport : IHealthReportTransport
{
    // Number of Send invocations so far, successful or not.
    private int _sendAttempts;

    /// <summary>Reports accepted after the initial failure, in arrival order.</summary>
    public List<SiteHealthReport> SentReports { get; } = new List<SiteHealthReport>();

    /// <summary>
    /// Throws <see cref="InvalidOperationException"/> on the first call; every
    /// later call records <paramref name="report"/> in <see cref="SentReports"/>.
    /// </summary>
    public void Send(SiteHealthReport report)
    {
        var isFirstAttempt = Interlocked.Increment(ref _sendAttempts) == 1;
        if (isFirstAttempt)
        {
            throw new InvalidOperationException("transport temporarily unavailable");
        }

        SentReports.Add(report);
    }
}
/// <summary>
/// HealthMonitoring-006 regression: the sequence-number seed must be derived
/// from the injected <see cref="TimeProvider"/> so the Unix-ms seeding strategy
/// is deterministically testable and the clock dependency is explicit, rather
/// than reading <c>DateTimeOffset.UtcNow</c> directly at field initialization.
/// </summary>
[Fact]
public void SequenceNumberSeed_UsesInjectedTimeProvider()
{
    var fixedInstant = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
    long expectedSeed = fixedInstant.ToUnixTimeMilliseconds();

    var clock = new TestTimeProvider(fixedInstant);
    var collector = new SiteHealthCollector();
    var transport = new FakeTransport();
    var options = Options.Create(new HealthMonitoringOptions());

    var sender = new HealthReportSender(
        collector,
        transport,
        options,
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider(),
        timeProvider: clock);

    Assert.Equal(expectedSeed, sender.CurrentSequenceNumber);
}
}
@@ -0,0 +1,73 @@
using ZB.MOM.WW.ScadaBridge.Commons.Types;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// Bundle E (M6-T6) regression coverage. The site-side audit-log SQLite writer
/// exposes a backlog snapshot (<c>SiteAuditBacklogSnapshot</c>) via the
/// <c>ISiteAuditQueue.GetBacklogStatsAsync</c> surface. A periodic
/// <c>SiteAuditBacklogReporter</c> hosted service polls that snapshot and
/// pushes it into the collector via <see cref="ISiteHealthCollector.UpdateSiteAuditBacklog"/>
/// so the next <see cref="ISiteHealthCollector.CollectReport"/> includes it in
/// the report payload as <c>SiteAuditBacklog</c>. Unlike the
/// SiteAuditWriteFailures / AuditRedactionFailure interval counters, the
/// backlog snapshot is not reset on collect — the field carries forward
/// whatever the most recent refresh pushed in.
/// </summary>
public class SiteAuditBacklogMetricTests
{
    private const string SiteId = "site-1";

    private readonly SiteHealthCollector _sut = new SiteHealthCollector();

    /// <summary>A pushed snapshot appears unchanged on the next collected report.</summary>
    [Fact]
    public void Update_Then_CollectReport_IncludesBacklog()
    {
        var oldestPending = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc);
        var pushed = new SiteAuditBacklogSnapshot(
            PendingCount: 42,
            OldestPendingUtc: oldestPending,
            OnDiskBytes: 1234567);

        _sut.UpdateSiteAuditBacklog(pushed);
        var report = _sut.CollectReport(SiteId);

        Assert.Equal(pushed, report.SiteAuditBacklog);
    }

    /// <summary>
    /// Before any refresh has been pushed the report carries null, which lets
    /// the central UI tell "no data yet" apart from "queue empty".
    /// </summary>
    [Fact]
    public void Report_Payload_Includes_SiteAuditBacklog_AsNullByDefault()
    {
        var report = _sut.CollectReport(SiteId);

        Assert.Null(report.SiteAuditBacklog);
    }

    /// <summary>
    /// The backlog snapshot is a point-in-time reading, not a per-interval
    /// counter: successive CollectReport calls before the next
    /// SiteAuditBacklogReporter tick MUST keep returning the same snapshot so a
    /// slow refresh cadence doesn't blank the central dashboard.
    /// </summary>
    [Fact]
    public void CollectReport_DoesNotReset_SiteAuditBacklog()
    {
        var pushed = new SiteAuditBacklogSnapshot(
            PendingCount: 7,
            OldestPendingUtc: null,
            OnDiskBytes: 8192);
        _sut.UpdateSiteAuditBacklog(pushed);

        var firstReport = _sut.CollectReport(SiteId);
        var secondReport = _sut.CollectReport(SiteId);

        Assert.Equal(pushed, firstReport.SiteAuditBacklog);
        Assert.Equal(pushed, secondReport.SiteAuditBacklog);
    }

    /// <summary>Pushing a null snapshot is rejected with <see cref="ArgumentNullException"/>.</summary>
    [Fact]
    public void Update_With_Null_Throws_ArgumentNullException()
    {
        Action pushNull = () => _sut.UpdateSiteAuditBacklog(null!);

        Assert.Throws<ArgumentNullException>(pushNull);
    }
}
@@ -0,0 +1,52 @@
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// Bundle G (M2-T11) regression coverage. The site-side Audit Log writer chain
/// (FallbackAuditWriter) increments <see cref="IAuditWriteFailureCounter"/>
/// every time the primary SQLite writer throws. Bundle G bridges that counter
/// into the Site Health Monitoring report payload as <c>SiteAuditWriteFailures</c>
/// so a sustained audit-write outage surfaces on /monitoring/health rather than
/// disappearing into a NoOp sink.
/// </summary>
public class SiteAuditWriteFailuresMetricTests
{
    private const string SiteId = "site-1";

    private readonly SiteHealthCollector _sut = new SiteHealthCollector();

    /// <summary>Three increments within one interval are reported as 3.</summary>
    [Fact]
    public void Increment_Three_Times_Counter_Reports_3()
    {
        RecordFailures(3);

        var report = _sut.CollectReport(SiteId);

        Assert.Equal(3, report.SiteAuditWriteFailures);
    }

    /// <summary>With no increments recorded the report carries zero.</summary>
    [Fact]
    public void Report_Payload_Includes_SiteAuditWriteFailures_AsZeroByDefault()
    {
        var report = _sut.CollectReport(SiteId);

        Assert.Equal(0, report.SiteAuditWriteFailures);
    }

    /// <summary>
    /// Mirrors the existing per-interval reset semantics for ScriptErrorCount /
    /// AlarmEvaluationErrorCount / DeadLetterCount — SiteAuditWriteFailures is an
    /// interval count, not a running total.
    /// </summary>
    [Fact]
    public void CollectReport_Resets_SiteAuditWriteFailures()
    {
        RecordFailures(2);

        var firstReport = _sut.CollectReport(SiteId);
        Assert.Equal(2, firstReport.SiteAuditWriteFailures);

        var secondReport = _sut.CollectReport(SiteId);
        Assert.Equal(0, secondReport.SiteAuditWriteFailures);
    }

    /// <summary>Increments the audit-write-failure counter <paramref name="count"/> times.</summary>
    private void RecordFailures(int count)
    {
        for (var recorded = 0; recorded < count; recorded++)
        {
            _sut.IncrementSiteAuditWriteFailures();
        }
    }
}
@@ -0,0 +1,282 @@
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
/// <summary>
/// Unit tests for <see cref="SiteHealthCollector"/>: per-interval error
/// counters (accumulate, then reset on collect), state that persists across
/// collects (connection health, tag resolution), and each setter's value
/// reaching the <c>CollectReport</c> output.
/// </summary>
public class SiteHealthCollectorTests
{
    private readonly SiteHealthCollector _collector = new();

    /// <summary>A fresh collector reports the requested site id and all-zero error counters.</summary>
    [Fact]
    public void CollectReport_ReturnsZeroCounters_WhenNoErrorsRecorded()
    {
        var report = _collector.CollectReport("site-1");

        Assert.Equal("site-1", report.SiteId);
        Assert.Equal(0, report.ScriptErrorCount);
        Assert.Equal(0, report.AlarmEvaluationErrorCount);
        Assert.Equal(0, report.DeadLetterCount);
    }

    /// <summary>Script errors recorded before a collect are summed into that report.</summary>
    [Fact]
    public void IncrementScriptError_AccumulatesBetweenReports()
    {
        _collector.IncrementScriptError();
        _collector.IncrementScriptError();
        _collector.IncrementScriptError();

        var report = _collector.CollectReport("site-1");

        Assert.Equal(3, report.ScriptErrorCount);
    }

    /// <summary>Alarm evaluation errors recorded before a collect are summed into that report.</summary>
    [Fact]
    public void IncrementAlarmError_AccumulatesBetweenReports()
    {
        _collector.IncrementAlarmError();
        _collector.IncrementAlarmError();

        var report = _collector.CollectReport("site-1");

        Assert.Equal(2, report.AlarmEvaluationErrorCount);
    }

    /// <summary>Dead letters recorded before a collect are summed into that report.</summary>
    [Fact]
    public void IncrementDeadLetter_AccumulatesBetweenReports()
    {
        _collector.IncrementDeadLetter();

        var report = _collector.CollectReport("site-1");

        Assert.Equal(1, report.DeadLetterCount);
    }

    /// <summary>Collecting a report zeroes the interval counters for the next report.</summary>
    [Fact]
    public void CollectReport_ResetsCounters_AfterCollection()
    {
        _collector.IncrementScriptError();
        _collector.IncrementAlarmError();
        _collector.IncrementDeadLetter();

        var first = _collector.CollectReport("site-1");
        Assert.Equal(1, first.ScriptErrorCount);
        Assert.Equal(1, first.AlarmEvaluationErrorCount);
        Assert.Equal(1, first.DeadLetterCount);

        var second = _collector.CollectReport("site-1");
        Assert.Equal(0, second.ScriptErrorCount);
        Assert.Equal(0, second.AlarmEvaluationErrorCount);
        Assert.Equal(0, second.DeadLetterCount);
    }

    /// <summary>Connection health pushed per connection name is reported per connection name.</summary>
    [Fact]
    public void UpdateConnectionHealth_ReflectedInReport()
    {
        _collector.UpdateConnectionHealth("opc-1", ConnectionHealth.Connected);
        _collector.UpdateConnectionHealth("opc-2", ConnectionHealth.Disconnected);

        var report = _collector.CollectReport("site-1");

        Assert.Equal(2, report.DataConnectionStatuses.Count);
        Assert.Equal(ConnectionHealth.Connected, report.DataConnectionStatuses["opc-1"]);
        Assert.Equal(ConnectionHealth.Disconnected, report.DataConnectionStatuses["opc-2"]);
    }

    /// <summary>Connection health is retained across collects, unlike the interval counters.</summary>
    [Fact]
    public void ConnectionHealth_NotResetAfterCollect()
    {
        _collector.UpdateConnectionHealth("opc-1", ConnectionHealth.Connected);
        _collector.CollectReport("site-1");

        var second = _collector.CollectReport("site-1");

        Assert.Single(second.DataConnectionStatuses);
        Assert.Equal(ConnectionHealth.Connected, second.DataConnectionStatuses["opc-1"]);
    }

    /// <summary>Removing a connection drops both its health status and its tag resolution counts.</summary>
    [Fact]
    public void RemoveConnection_RemovesFromReport()
    {
        _collector.UpdateConnectionHealth("opc-1", ConnectionHealth.Connected);
        _collector.UpdateTagResolution("opc-1", 10, 8);
        _collector.RemoveConnection("opc-1");

        var report = _collector.CollectReport("site-1");

        Assert.Empty(report.DataConnectionStatuses);
        Assert.Empty(report.TagResolutionCounts);
    }

    /// <summary>Tag resolution totals pushed for a connection appear on the report.</summary>
    [Fact]
    public void UpdateTagResolution_ReflectedInReport()
    {
        _collector.UpdateTagResolution("opc-1", 50, 45);

        var report = _collector.CollectReport("site-1");

        Assert.Single(report.TagResolutionCounts);
        Assert.Equal(50, report.TagResolutionCounts["opc-1"].TotalSubscribed);
        Assert.Equal(45, report.TagResolutionCounts["opc-1"].SuccessfullyResolved);
    }

    /// <summary>Without a depth push the store-and-forward depth map is empty.</summary>
    [Fact]
    public void StoreAndForwardBufferDepths_DefaultsToEmpty_WhenSetterNotCalled()
    {
        var report = _collector.CollectReport("site-1");

        Assert.Empty(report.StoreAndForwardBufferDepths);
    }

    /// <summary>
    /// The report timestamp must fall within the call window and be expressed
    /// in UTC (zero offset).
    /// </summary>
    [Fact]
    public void CollectReport_IncludesUtcTimestamp()
    {
        var before = DateTimeOffset.UtcNow;
        var report = _collector.CollectReport("site-1");
        var after = DateTimeOffset.UtcNow;

        Assert.InRange(report.ReportTimestamp, before, after);

        // DateTimeOffset ordering compares the underlying instant only, so the
        // range check above cannot detect a non-UTC offset; assert it explicitly.
        Assert.Equal(TimeSpan.Zero, report.ReportTimestamp.Offset);
    }

    /// <summary>
    /// HealthMonitoring-016 regression: <see cref="SiteHealthCollector.CollectReport"/>
    /// must stamp <c>ReportTimestamp</c> from an injected <see cref="TimeProvider"/>
    /// (consistent with the rest of the module), not directly from
    /// <c>DateTimeOffset.UtcNow</c>, so the report timestamp is deterministically
    /// testable against a known instant.
    /// </summary>
    [Fact]
    public void CollectReport_StampsTimestamp_FromInjectedTimeProvider()
    {
        var fixedInstant = new DateTimeOffset(2026, 5, 17, 9, 30, 0, TimeSpan.Zero);
        var collector = new SiteHealthCollector(new TestTimeProvider(fixedInstant));

        var report = collector.CollectReport("site-1");

        Assert.Equal(fixedInstant, report.ReportTimestamp);
    }

    /// <summary>The collector leaves the sequence number at zero; the caller assigns it.</summary>
    [Fact]
    public void CollectReport_SequenceNumberIsZero_CallerAssignsIt()
    {
        var report = _collector.CollectReport("site-1");

        Assert.Equal(0, report.SequenceNumber);
    }

    // HealthMonitoring-009 regression: the remaining collector setters had no
    // "reflected in report" coverage. The following tests verify each setter's
    // value reaches CollectReport output.

    /// <summary>Cluster nodes pushed into the collector are reported in the same order.</summary>
    [Fact]
    public void SetClusterNodes_ReflectedInReport()
    {
        var nodes = new List<ZB.MOM.WW.ScadaBridge.Commons.Messages.Health.NodeStatus>
        {
            new("node-a", true, "Active"),
            new("node-b", true, "Standby")
        };
        _collector.SetClusterNodes(nodes);

        var report = _collector.CollectReport("site-1");

        Assert.NotNull(report.ClusterNodes);
        Assert.Equal(2, report.ClusterNodes!.Count);
        Assert.Equal("node-a", report.ClusterNodes[0].Hostname);
    }

    /// <summary>Deployed / enabled / disabled instance counts appear on the report.</summary>
    [Fact]
    public void SetInstanceCounts_ReflectedInReport()
    {
        _collector.SetInstanceCounts(deployed: 10, enabled: 7, disabled: 3);

        var report = _collector.CollectReport("site-1");

        Assert.Equal(10, report.DeployedInstanceCount);
        Assert.Equal(7, report.EnabledInstanceCount);
        Assert.Equal(3, report.DisabledInstanceCount);
    }

    /// <summary>The parked message count appears on the report.</summary>
    [Fact]
    public void SetParkedMessageCount_ReflectedInReport()
    {
        _collector.SetParkedMessageCount(42);

        var report = _collector.CollectReport("site-1");

        Assert.Equal(42, report.ParkedMessageCount);
    }

    /// <summary>The node hostname appears on the report.</summary>
    [Fact]
    public void SetNodeHostname_ReflectedInReport()
    {
        _collector.SetNodeHostname("site-host-1");

        var report = _collector.CollectReport("site-1");

        Assert.Equal("site-host-1", report.NodeHostname);
    }

    /// <summary>
    /// The active flag maps to a <c>NodeRole</c> of "Active" or "Standby" on the
    /// report and is mirrored by <c>IsActiveNode</c>.
    /// </summary>
    [Fact]
    public void SetActiveNode_ReflectedInNodeRole()
    {
        _collector.SetActiveNode(true);
        Assert.Equal("Active", _collector.CollectReport("site-1").NodeRole);
        Assert.True(_collector.IsActiveNode);

        _collector.SetActiveNode(false);
        Assert.Equal("Standby", _collector.CollectReport("site-1").NodeRole);
        Assert.False(_collector.IsActiveNode);
    }

    /// <summary>Good / bad / uncertain tag quality counts for a connection appear on the report.</summary>
    [Fact]
    public void UpdateTagQuality_ReflectedInReport()
    {
        _collector.UpdateTagQuality("opc-1", good: 80, bad: 15, uncertain: 5);

        var report = _collector.CollectReport("site-1");

        Assert.NotNull(report.DataConnectionTagQuality);
        var quality = report.DataConnectionTagQuality!["opc-1"];
        Assert.Equal(80, quality.Good);
        Assert.Equal(15, quality.Bad);
        Assert.Equal(5, quality.Uncertain);
    }

    /// <summary>The endpoint string registered for a connection appears on the report.</summary>
    [Fact]
    public void UpdateConnectionEndpoint_ReflectedInReport()
    {
        _collector.UpdateConnectionEndpoint("opc-1", "opc.tcp://plc-1:4840");

        var report = _collector.CollectReport("site-1");

        Assert.NotNull(report.DataConnectionEndpoints);
        Assert.Equal("opc.tcp://plc-1:4840", report.DataConnectionEndpoints!["opc-1"]);
    }

    /// <summary>Per-category store-and-forward depths pushed into the collector appear on the report.</summary>
    [Fact]
    public void SetStoreAndForwardDepths_ReflectedInReport()
    {
        _collector.SetStoreAndForwardDepths(new Dictionary<string, int>
        {
            ["ExternalSystem"] = 5,
            ["Notification"] = 2
        });

        var report = _collector.CollectReport("site-1");

        Assert.Equal(5, report.StoreAndForwardBufferDepths["ExternalSystem"]);
        Assert.Equal(2, report.StoreAndForwardBufferDepths["Notification"]);
    }

    /// <summary>
    /// Three tasks each increment a different counter 10,000 times in
    /// parallel; the collected report must show exactly that total for each
    /// counter (no lost updates).
    /// </summary>
    [Fact]
    public async Task ThreadSafety_ConcurrentIncrements()
    {
        const int iterations = 10_000;
        var tasks = new[]
        {
            Task.Run(() => { for (int i = 0; i < iterations; i++) _collector.IncrementScriptError(); }),
            Task.Run(() => { for (int i = 0; i < iterations; i++) _collector.IncrementAlarmError(); }),
            Task.Run(() => { for (int i = 0; i < iterations; i++) _collector.IncrementDeadLetter(); })
        };

        await Task.WhenAll(tasks);

        var report = _collector.CollectReport("site-1");
        Assert.Equal(iterations, report.ScriptErrorCount);
        Assert.Equal(iterations, report.AlarmEvaluationErrorCount);
        Assert.Equal(iterations, report.DeadLetterCount);
    }
}
@@ -0,0 +1,29 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- xUnit test project; it references the HealthMonitoring module project at the bottom of this file. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- Warnings fail the build in this project. -->
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<IsPackable>false</IsPackable>
</PropertyGroup>
<!-- NOTE(review): no Version attributes here; presumably versions are pinned centrally (e.g. Directory.Packages.props) - confirm. -->
<ItemGroup>
<PackageReference Include="coverlet.collector" />
<PackageReference Include="Microsoft.Data.Sqlite" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.runner.visualstudio" />
</ItemGroup>
<!-- Project-wide using for the Xunit namespace, so test files need no explicit using directive for it. -->
<ItemGroup>
<Using Include="Xunit" />
</ItemGroup>
<!-- The production project under test. -->
<ItemGroup>
<ProjectReference Include="../../src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/ZB.MOM.WW.ScadaBridge.HealthMonitoring.csproj" />
</ItemGroup>
</Project>