Files
ScadaBridge/tests/ScadaLink.HealthMonitoring.Tests/CentralHealthReportLoopTests.cs
T
Joseph Doherty 6ae0fea558 fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown
semantics, and audit-row coverage across 9 modules. Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the
  captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS,
  threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM
  during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since`
  when any row's idempotent insert is still being retried (per-EventId
  retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical
  abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's
  SPLIT loop — by class-doc construction the catch could only mask real
  failures and let the next iteration create permanent partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot
  per-interval counters before sending, restore via new
  ISiteHealthCollector.AddIntervalCounters on transport failure so counts
  aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks
  via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep
  with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its
  own try/catch so a throw doesn't leak the relay actor or _activeStreams
  entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync
  (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config
  prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised
  MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the
  per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects
  empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/
  DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted
from prior sessions.

20+ new regression tests across 11 test projects; build clean; affected
suites all green. README regenerated: 75 open (was 93).
2026-05-28 07:13:28 -04:00

235 lines
9.3 KiB
C#

using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Messages.Health;
namespace ScadaLink.HealthMonitoring.Tests;
/// <summary>
/// HealthMonitoring-009 regression: the central self-report loop had no test
/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
/// self-report generation for siteId="central", and monotonic sequence
/// assignment.
/// </summary>
public class CentralHealthReportLoopTests
{
private sealed class FakeClusterNodeProvider : IClusterNodeProvider
{
public bool SelfIsPrimary { get; set; }
public IReadOnlyList<NodeStatus> Nodes { get; set; } = [];
public IReadOnlyList<NodeStatus> GetClusterNodes() => Nodes;
}
private sealed class RecordingAggregator : ICentralHealthAggregator
{
public List<SiteHealthReport> Processed { get; } = [];
public void ProcessReport(SiteHealthReport report) => Processed.Add(report);
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }
public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
new Dictionary<string, SiteHealthState>();
public SiteHealthState? GetSiteState(string siteId) => null;
}
private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
{
using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(runForMs + 100));
try
{
await loop.StartAsync(cts.Token);
await Task.Delay(runForMs, CancellationToken.None);
await loop.StopAsync(CancellationToken.None);
}
catch (OperationCanceledException) { }
}
[Fact]
public async Task GeneratesCentralReports_WhenSelfIsPrimary()
{
var collector = new SiteHealthCollector();
var aggregator = new RecordingAggregator();
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
var options = Options.Create(new HealthMonitoringOptions
{
ReportInterval = TimeSpan.FromMilliseconds(50)
});
var loop = new CentralHealthReportLoop(
collector, aggregator, clusterNodes, options,
NullLogger<CentralHealthReportLoop>.Instance);
await RunLoopBriefly(loop, 250);
Assert.NotEmpty(aggregator.Processed);
Assert.All(aggregator.Processed,
r => Assert.Equal(CentralHealthReportLoop.CentralSiteId, r.SiteId));
}
[Fact]
public async Task GeneratesNoReports_WhenNotPrimary()
{
var collector = new SiteHealthCollector();
var aggregator = new RecordingAggregator();
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
var options = Options.Create(new HealthMonitoringOptions
{
ReportInterval = TimeSpan.FromMilliseconds(50)
});
var loop = new CentralHealthReportLoop(
collector, aggregator, clusterNodes, options,
NullLogger<CentralHealthReportLoop>.Instance);
await RunLoopBriefly(loop, 250);
Assert.Empty(aggregator.Processed);
}
[Fact]
public async Task AssignsMonotonicSequenceNumbers()
{
var collector = new SiteHealthCollector();
var aggregator = new RecordingAggregator();
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
var options = Options.Create(new HealthMonitoringOptions
{
ReportInterval = TimeSpan.FromMilliseconds(50)
});
var loop = new CentralHealthReportLoop(
collector, aggregator, clusterNodes, options,
NullLogger<CentralHealthReportLoop>.Instance);
await RunLoopBriefly(loop, 300);
Assert.True(aggregator.Processed.Count >= 2,
$"Expected at least 2 reports, got {aggregator.Processed.Count}");
for (int i = 1; i < aggregator.Processed.Count; i++)
{
Assert.True(
aggregator.Processed[i].SequenceNumber > aggregator.Processed[i - 1].SequenceNumber,
$"Sequence numbers not strictly increasing at index {i}");
}
}
/// <summary>
/// HealthMonitoring-006 regression: the central loop's sequence-number seed
/// must be derived from the injected <see cref="TimeProvider"/> (Unix-ms),
/// not from <c>DateTimeOffset.UtcNow</c> read at field initialization, so the
/// seeding strategy is deterministically testable.
/// </summary>
[Fact]
public void SequenceNumberSeed_UsesInjectedTimeProvider()
{
var fixedInstant = new DateTimeOffset(2026, 5, 16, 12, 0, 0, TimeSpan.Zero);
var timeProvider = new TestTimeProvider(fixedInstant);
var loop = new CentralHealthReportLoop(
new SiteHealthCollector(),
new RecordingAggregator(),
new FakeClusterNodeProvider { SelfIsPrimary = true },
Options.Create(new HealthMonitoringOptions()),
NullLogger<CentralHealthReportLoop>.Instance,
timeProvider);
Assert.Equal(fixedInstant.ToUnixTimeMilliseconds(), loop.CurrentSequenceNumber);
}
/// <summary>
/// HealthMonitoring-018 regression: when <see cref="ICentralHealthAggregator.ProcessReport"/>
/// throws, the per-interval counters that
/// <see cref="SiteHealthCollector.CollectReport"/> just drained must be
/// restored back into the shared collector so they roll forward into the
/// next interval rather than being silently lost. Same shape as the
/// HealthMonitoring-017 fix in <see cref="HealthReportSender"/>.
/// </summary>
[Fact]
public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
{
var collector = new SiteHealthCollector();
// Pre-populate every per-interval counter so the restore path on each
// field is exercised. The loop's first iteration will read-and-reset
// these via CollectReport, then ProcessReport will throw, and the
// restore must put them back.
collector.IncrementScriptError();
collector.IncrementScriptError();
collector.IncrementAlarmError();
collector.IncrementDeadLetter();
collector.IncrementDeadLetter();
collector.IncrementDeadLetter();
collector.IncrementSiteAuditWriteFailures();
collector.IncrementAuditRedactionFailure();
collector.IncrementAuditRedactionFailure();
var aggregator = new FailingThenSucceedingAggregator();
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
var options = Options.Create(new HealthMonitoringOptions
{
ReportInterval = TimeSpan.FromMilliseconds(50)
});
var loop = new CentralHealthReportLoop(
collector, aggregator, clusterNodes, options,
NullLogger<CentralHealthReportLoop>.Instance);
await RunLoopBriefly(loop, 450);
// First call threw, later succeeded — the first successful report
// must carry the previously-failed interval's accumulated counts.
Assert.NotEmpty(aggregator.Processed);
var firstSuccess = aggregator.Processed[0];
Assert.Equal(2, firstSuccess.ScriptErrorCount);
Assert.Equal(1, firstSuccess.AlarmEvaluationErrorCount);
Assert.Equal(3, firstSuccess.DeadLetterCount);
Assert.Equal(1, firstSuccess.SiteAuditWriteFailures);
Assert.Equal(2, firstSuccess.AuditRedactionFailure);
}
/// <summary>
/// <see cref="ICentralHealthAggregator"/> whose first <c>ProcessReport</c>
/// call throws (only the first), then subsequent calls succeed. Used by
/// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>
/// to verify the HealthMonitoring-018 restore-on-failure path.
/// </summary>
private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
{
private int _callCount;
public List<SiteHealthReport> Processed { get; } = [];
public void ProcessReport(SiteHealthReport report)
{
var n = Interlocked.Increment(ref _callCount);
if (n == 1)
{
throw new InvalidOperationException("aggregator temporarily unavailable");
}
Processed.Add(report);
}
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }
public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
new Dictionary<string, SiteHealthState>();
public SiteHealthState? GetSiteState(string siteId) => null;
}
[Fact]
public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
{
// The loop must still report the node's role to the collector when it is
// the standby, so the standby's own node card shows the correct role.
var collector = new SiteHealthCollector();
var aggregator = new RecordingAggregator();
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
var options = Options.Create(new HealthMonitoringOptions
{
ReportInterval = TimeSpan.FromMilliseconds(50)
});
var loop = new CentralHealthReportLoop(
collector, aggregator, clusterNodes, options,
NullLogger<CentralHealthReportLoop>.Instance);
await RunLoopBriefly(loop, 150);
Assert.False(collector.IsActiveNode);
}
}