7b0b9c7365
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
283 lines
12 KiB
C#
283 lines
12 KiB
C#
using Microsoft.Extensions.Logging.Abstractions;
|
|
using Microsoft.Extensions.Options;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.HealthMonitoring.Tests;
|
|
|
|
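/// <summary>
/// HealthMonitoring-009 regression: the central self-report loop previously
/// had no test coverage at all. These tests exercise leader-only gating
/// (SelfIsPrimary), self-report generation for siteId="central", and
/// monotonic sequence-number assignment. Later regressions covered here:
/// sequence seeding from the injected time provider (HealthMonitoring-006),
/// counter restore when the aggregator throws (HealthMonitoring-018), and
/// the active-node flag on a standby node.
/// </summary>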
|
|
public class CentralHealthReportLoopTests
|
|
{
|
|
/// <summary>
/// Hand-rolled <see cref="IClusterNodeProvider"/> stub. Tests set
/// <see cref="SelfIsPrimary"/> and (optionally) <see cref="Nodes"/> directly;
/// <see cref="GetClusterNodes"/> simply hands back whatever <see cref="Nodes"/>
/// currently holds.
/// </summary>
private sealed class FakeClusterNodeProvider : IClusterNodeProvider
{
    /// <summary>Value the stub reports for the primary/leader role.</summary>
    public bool SelfIsPrimary { get; set; }

    /// <summary>Node list returned by <see cref="GetClusterNodes"/>; empty unless a test assigns it.</summary>
    public IReadOnlyList<NodeStatus> Nodes { get; set; } = Array.Empty<NodeStatus>();

    /// <summary>Returns the current <see cref="Nodes"/> value unchanged.</summary>
    public IReadOnlyList<NodeStatus> GetClusterNodes()
    {
        return Nodes;
    }
}
|
|
|
|
/// <summary>
/// <see cref="ICentralHealthAggregator"/> test double that records every
/// report handed to <see cref="ProcessReport"/>. All other members are inert:
/// <see cref="MarkHeartbeat"/> does nothing, <see cref="GetAllSiteStates"/>
/// returns an empty dictionary and <see cref="GetSiteState"/> returns null.
/// </summary>
/// <remarks>
/// The tests poll <see cref="Processed"/> from the test thread while the loop
/// under test is still running. The loop presumably invokes
/// <see cref="ProcessReport"/> from a background/thread-pool continuation
/// (its implementation is not visible from this file), and <c>List&lt;T&gt;</c>
/// is not safe for concurrent read/write. Writes are therefore serialized
/// behind a lock and readers receive an immutable snapshot.
/// </remarks>
private sealed class RecordingAggregator : ICentralHealthAggregator
{
    private readonly object _gate = new();
    private readonly List<SiteHealthReport> _processed = [];

    /// <summary>
    /// Point-in-time copy of the reports recorded so far, in arrival order.
    /// Each access takes a fresh snapshot, so it is safe to read while the
    /// loop is still producing reports.
    /// </summary>
    public IReadOnlyList<SiteHealthReport> Processed
    {
        get
        {
            lock (_gate)
            {
                return _processed.ToArray();
            }
        }
    }

    /// <summary>Appends <paramref name="report"/> to the recorded list.</summary>
    public void ProcessReport(SiteHealthReport report)
    {
        lock (_gate)
        {
            _processed.Add(report);
        }
    }

    /// <summary>No-op; heartbeats are irrelevant to these tests.</summary>
    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }

    /// <summary>Always returns a new, empty dictionary.</summary>
    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
        new Dictionary<string, SiteHealthState>();

    /// <summary>Always returns <c>null</c>.</summary>
    public SiteHealthState? GetSiteState(string siteId) => null;
}
|
|
|
|
/// <summary>
/// Starts <paramref name="loop"/>, polls <paramref name="condition"/> every
/// 25 ms until it returns <c>true</c> or the wait budget elapses, then stops
/// the loop.
/// </summary>
/// <remarks>
/// HealthMonitoring-022 de-flake. <see cref="CentralHealthReportLoop"/> ticks
/// on a real <see cref="PeriodicTimer"/>, which does not consume
/// <see cref="TimeProvider"/>, so a virtual clock cannot drive it without
/// refactoring the production loop. The tests therefore still wait on the
/// wall clock, but with a generous budget and a poll-until-condition wait
/// instead of a fixed <c>Task.Delay</c> that fails fast on a slow CI runner.
/// Each test sets <c>ReportInterval</c> to 50 ms, so the condition is normally
/// met almost immediately; under heavy CI load the poll simply keeps waiting
/// rather than asserting on a timed-out empty list.
/// </remarks>
/// <param name="loop">The loop instance to start and stop.</param>
/// <param name="condition">Predicate that ends the wait once it returns <c>true</c>.</param>
/// <param name="maxWait">Polling budget; 5 seconds when omitted.</param>
private static async Task RunLoopUntil(
    CentralHealthReportLoop loop,
    Func<bool> condition,
    TimeSpan? maxWait = null)
{
    TimeSpan pollBudget = maxWait ?? TimeSpan.FromSeconds(5);

    // The token handed to StartAsync outlives the poll budget by one second,
    // acting as an outer cap on the whole run.
    using var outerCap = new CancellationTokenSource(pollBudget + TimeSpan.FromSeconds(1));

    try
    {
        await loop.StartAsync(outerCap.Token);

        var elapsed = System.Diagnostics.Stopwatch.StartNew();
        while (true)
        {
            // Budget is checked first so the predicate is not evaluated
            // once time has run out.
            if (elapsed.Elapsed >= pollBudget || condition())
            {
                break;
            }

            await Task.Delay(25, CancellationToken.None);
        }

        await loop.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is ignored here; callers assert on what was
        // recorded after this helper returns.
    }
}
|
|
|
|
/// <summary>
/// Runs <paramref name="loop"/> for a bounded period without waiting on any
/// condition — intended for tests that assert something did <i>not</i> happen
/// (e.g. no reports were produced).
/// </summary>
/// <remarks>
/// The run time is <paramref name="runForMs"/> raised to a floor of 1000 ms,
/// so any smaller value passed by a caller results in a 1 s run. The wait is
/// deliberately generous; see <see cref="RunLoopUntil"/> for the rationale.
/// </remarks>
/// <param name="loop">The loop instance to start and stop.</param>
/// <param name="runForMs">Requested run time in milliseconds (minimum 1000 is enforced).</param>
private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
{
    int dwellMs = runForMs < 1000 ? 1000 : runForMs;

    // Outer cap: one second longer than the dwell time.
    using var outerCap = new CancellationTokenSource(TimeSpan.FromMilliseconds(dwellMs + 1000));

    try
    {
        await loop.StartAsync(outerCap.Token);
        await Task.Delay(dwellMs, CancellationToken.None);
        await loop.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
        // Cancellation is ignored here; callers assert on the recorded
        // state after this helper returns.
    }
}
|
|
|
|
/// <summary>
/// When the node reports itself as primary, the loop must hand at least one
/// report to the aggregator, and every report must carry
/// <see cref="CentralHealthReportLoop.CentralSiteId"/> as its site id.
/// </summary>
[Fact]
public async Task GeneratesCentralReports_WhenSelfIsPrimary()
{
    // Arrange
    var recorder = new RecordingAggregator();
    var fastCadence = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        recorder,
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        fastCadence,
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act — HealthMonitoring-022: poll (default 5 s budget) for the first
    // report instead of sleeping a fixed amount; tolerates slow CI runners.
    await RunLoopUntil(loop, () => recorder.Processed.Count >= 1);

    // Assert
    Assert.NotEmpty(recorder.Processed);
    Assert.All(
        recorder.Processed,
        report => Assert.Equal(CentralHealthReportLoop.CentralSiteId, report.SiteId));
}
|
|
|
|
/// <summary>
/// When the node is not primary, the loop must not pass any report to the
/// aggregator for the duration of the run.
/// </summary>
[Fact]
public async Task GeneratesNoReports_WhenNotPrimary()
{
    // Arrange
    var recorder = new RecordingAggregator();
    var standby = new FakeClusterNodeProvider { SelfIsPrimary = false };
    var fastCadence = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        recorder,
        standby,
        fastCadence,
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act — nothing to poll for when asserting absence, so just let the
    // loop run for a bounded period.
    await RunLoopBriefly(loop, 250);

    // Assert
    Assert.Empty(recorder.Processed);
}
|
|
|
|
/// <summary>
/// Consecutive reports produced by the loop must carry strictly increasing
/// <c>SequenceNumber</c> values.
/// </summary>
[Fact]
public async Task AssignsMonotonicSequenceNumbers()
{
    // Arrange
    var recorder = new RecordingAggregator();
    var fastCadence = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        recorder,
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        fastCadence,
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act — HealthMonitoring-022: poll (default 5 s budget) for two reports
    // rather than relying on a fixed 300 ms window that could miss the
    // second tick on a slow CI runner.
    await RunLoopUntil(loop, () => recorder.Processed.Count >= 2);

    // Assert — the loop has been stopped, so one read of the list suffices.
    var reports = recorder.Processed;
    Assert.True(
        reports.Count >= 2,
        $"Expected at least 2 reports, got {reports.Count}");

    for (var index = 1; index < reports.Count; index++)
    {
        var previous = reports[index - 1].SequenceNumber;
        var current = reports[index].SequenceNumber;
        Assert.True(
            current > previous,
            $"Sequence numbers not strictly increasing at index {index}");
    }
}
|
|
|
|
/// <summary>
/// HealthMonitoring-006 regression: the loop must seed its sequence number
/// from the injected <see cref="TimeProvider"/> (as Unix milliseconds) rather
/// than from <c>DateTimeOffset.UtcNow</c> captured at field initialization,
/// which keeps the seeding strategy deterministically testable.
/// </summary>
[Fact]
public void SequenceNumberSeed_UsesInjectedTimeProvider()
{
    // Arrange — a fixed instant makes the expected seed exact.
    var frozenNow = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
    var expectedSeed = frozenNow.ToUnixTimeMilliseconds();

    // Act — the loop is only constructed, never started.
    var loop = new CentralHealthReportLoop(
        new SiteHealthCollector(),
        new RecordingAggregator(),
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        Options.Create(new HealthMonitoringOptions()),
        NullLogger<CentralHealthReportLoop>.Instance,
        new TestTimeProvider(frozenNow));

    // Assert
    Assert.Equal(expectedSeed, loop.CurrentSequenceNumber);
}
|
|
|
|
/// <summary>
/// HealthMonitoring-018 regression: if
/// <see cref="ICentralHealthAggregator.ProcessReport"/> throws, the
/// per-interval counters just drained by
/// <see cref="SiteHealthCollector.CollectReport"/> must be put back into the
/// shared collector so they carry over into the next interval instead of
/// being silently dropped. Mirrors the HealthMonitoring-017 fix in
/// <see cref="HealthReportSender"/>.
/// </summary>
[Fact]
public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
{
    // Arrange — seed every per-interval counter with a distinct, non-zero
    // count so the restore path for each field is exercised. The first loop
    // iteration drains them via CollectReport, ProcessReport then throws,
    // and the restore must put them back.
    var collector = new SiteHealthCollector();

    static void Repeat(int times, Action bump)
    {
        for (var i = 0; i < times; i++)
        {
            bump();
        }
    }

    Repeat(2, () => collector.IncrementScriptError());
    Repeat(1, () => collector.IncrementAlarmError());
    Repeat(3, () => collector.IncrementDeadLetter());
    Repeat(1, () => collector.IncrementSiteAuditWriteFailures());
    Repeat(2, () => collector.IncrementAuditRedactionFailure());

    var flakyAggregator = new FailingThenSucceedingAggregator();
    var fastCadence = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var loop = new CentralHealthReportLoop(
        collector,
        flakyAggregator,
        new FakeClusterNodeProvider { SelfIsPrimary = true },
        fastCadence,
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act — HealthMonitoring-022: the first ProcessReport call throws, the
    // second succeeds. Poll (default 5 s budget) for that first successful
    // call rather than using a fixed 450 ms budget.
    await RunLoopUntil(loop, () => flakyAggregator.Processed.Count >= 1);

    // Assert — the first report that got through must carry the counts
    // accumulated during the interval whose delivery failed.
    Assert.NotEmpty(flakyAggregator.Processed);
    var recovered = flakyAggregator.Processed[0];
    Assert.Equal(2, recovered.ScriptErrorCount);
    Assert.Equal(1, recovered.AlarmEvaluationErrorCount);
    Assert.Equal(3, recovered.DeadLetterCount);
    Assert.Equal(1, recovered.SiteAuditWriteFailures);
    Assert.Equal(2, recovered.AuditRedactionFailure);
}
|
|
|
|
/// <summary>
/// <see cref="ICentralHealthAggregator"/> whose very first
/// <c>ProcessReport</c> call throws <see cref="InvalidOperationException"/>;
/// every later call records the report. Used by
/// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>
/// to verify the HealthMonitoring-018 restore-on-failure path.
/// </summary>
/// <remarks>
/// The test polls <see cref="Processed"/> from the test thread while the loop
/// is still running. The call counter was already interlocked, but the
/// backing <c>List&lt;T&gt;</c> was not protected and is not safe for
/// concurrent read/write. Appends are now serialized behind a lock and
/// readers receive a snapshot. (The loop presumably calls
/// <c>ProcessReport</c> off the test thread; its implementation is not
/// visible from this file.)
/// </remarks>
private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
{
    private readonly object _gate = new();
    private readonly List<SiteHealthReport> _processed = [];
    private int _callCount;

    /// <summary>
    /// Point-in-time copy of the successfully recorded reports, in arrival
    /// order. The report passed to the failing first call is never included.
    /// </summary>
    public IReadOnlyList<SiteHealthReport> Processed
    {
        get
        {
            lock (_gate)
            {
                return _processed.ToArray();
            }
        }
    }

    /// <summary>
    /// Throws on the first invocation only; records <paramref name="report"/>
    /// on every subsequent invocation.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown on the first call.</exception>
    public void ProcessReport(SiteHealthReport report)
    {
        var callNumber = Interlocked.Increment(ref _callCount);
        if (callNumber == 1)
        {
            throw new InvalidOperationException("aggregator temporarily unavailable");
        }

        lock (_gate)
        {
            _processed.Add(report);
        }
    }

    /// <summary>No-op; heartbeats are irrelevant to this test double.</summary>
    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }

    /// <summary>Always returns a new, empty dictionary.</summary>
    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
        new Dictionary<string, SiteHealthState>();

    /// <summary>Always returns <c>null</c>.</summary>
    public SiteHealthState? GetSiteState(string siteId) => null;
}
|
|
|
|
/// <summary>
/// Even on a standby (non-primary) node the loop must still publish the
/// node's role to the collector, so the standby's own node card shows the
/// correct role.
/// </summary>
[Fact]
public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
{
    // Arrange
    var collector = new SiteHealthCollector();
    var standby = new FakeClusterNodeProvider { SelfIsPrimary = false };
    var fastCadence = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });
    var loop = new CentralHealthReportLoop(
        collector,
        new RecordingAggregator(),
        standby,
        fastCadence,
        NullLogger<CentralHealthReportLoop>.Instance);

    // Act
    await RunLoopBriefly(loop, 150);

    // Assert
    // NOTE(review): if SiteHealthCollector.IsActiveNode already defaults to
    // false, this assertion would also pass when the loop never touches the
    // flag — confirm the default, or pre-set the flag to true before the run.
    Assert.False(collector.IsActiveNode);
}
|
|
}
|