fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules.

Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions.

20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93).
This commit is contained in:
@@ -63,4 +63,60 @@ public class CliConfigTests
|
||||
Environment.SetEnvironmentVariable("SCADALINK_FORMAT", orig);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// CLI-021 regression guard. Points the home directory at a scratch folder whose
/// <c>.scadalink/config.json</c> is deliberately unparseable, clears the
/// <c>SCADALINK_*</c> environment overrides, and then checks that
/// <c>CliConfig.Load()</c> returns instead of throwing, yields the default
/// format with no management URL, and writes a warning naming the file to stderr —
/// so command-line overrides and env vars can still take effect afterwards.
/// </summary>
[Fact]
public void Load_MalformedConfigFile_DoesNotThrow_WarnsAndReturnsDefault()
{
    var scratchHome = Path.Combine(Path.GetTempPath(), "scadalink-cli-test-" + Guid.NewGuid().ToString("N"));
    var configDir = Path.Combine(scratchHome, ".scadalink");
    Directory.CreateDirectory(configDir);
    File.WriteAllText(Path.Combine(configDir, "config.json"), "{ this is not valid json :: ");

    // Variables redirected to the scratch home vs. variables cleared for the test.
    string[] homeVars = { "HOME", "USERPROFILE" };
    string[] clearedVars =
    {
        "SCADALINK_MANAGEMENT_URL",
        "SCADALINK_FORMAT",
        "SCADALINK_USERNAME",
        "SCADALINK_PASSWORD",
    };

    // Remember the ambient values so the finally block can put them back.
    var savedHomeValues = Array.ConvertAll(homeVars, name => Environment.GetEnvironmentVariable(name));
    var savedClearedValues = Array.ConvertAll(clearedVars, name => Environment.GetEnvironmentVariable(name));
    var savedStderr = Console.Error;

    try
    {
        foreach (var name in homeVars)
        {
            Environment.SetEnvironmentVariable(name, scratchHome);
        }

        foreach (var name in clearedVars)
        {
            Environment.SetEnvironmentVariable(name, null);
        }

        var capturedStderr = new StringWriter();
        Console.SetError(capturedStderr);

        // Must not throw.
        var loaded = CliConfig.Load();

        Assert.Equal("json", loaded.DefaultFormat);
        Assert.Null(loaded.ManagementUrl);

        var warningText = capturedStderr.ToString();
        Assert.Contains("warning", warningText, StringComparison.OrdinalIgnoreCase);
        Assert.Contains("config.json", warningText);
    }
    finally
    {
        Console.SetError(savedStderr);

        for (var i = 0; i < homeVars.Length; i++)
        {
            Environment.SetEnvironmentVariable(homeVars[i], savedHomeValues[i]);
        }

        for (var i = 0; i < clearedVars.Length; i++)
        {
            Environment.SetEnvironmentVariable(clearedVars[i], savedClearedValues[i]);
        }

        try
        {
            Directory.Delete(scratchHome, recursive: true);
        }
        catch
        {
            /* best-effort cleanup */
        }
    }
}
|
||||
}
|
||||
|
||||
@@ -308,6 +308,40 @@ public class SiteStreamGrpcServerTests : TestKit
|
||||
await streamTask;
|
||||
}
|
||||
|
||||
[Fact]
public async Task Comm021_SubscribeThrows_StopsRelayActorAndRemovesActiveStreamEntry()
{
    // Communication-021 regression. SubscribeInstance creates a StreamRelayActor and
    // registers an _activeStreams entry BEFORE it calls _streamSubscriber.Subscribe.
    // When Subscribe throws (e.g. stale instance, site runtime shutting down) and
    // nothing wraps the call, both the relay actor and the entry leak. The fix wraps
    // Subscribe so its catch stops the actor and removes the entry before re-throwing.
    var throwingSubscriber = Substitute.For<ISiteStreamSubscriber>();
    throwingSubscriber
        .Subscribe(Arg.Any<string>(), Arg.Any<IActorRef>())
        .Returns<string>(_ => throw new InvalidOperationException("instance not found"));

    var grpcServer = new SiteStreamGrpcServer(throwingSubscriber, _logger);
    grpcServer.SetReady(Sys);

    var responseStream = Substitute.For<IServerStreamWriter<SiteStreamEvent>>();
    var callContext = CreateMockContext();

    // The exception is expected to propagate (the gRPC stack maps unhandled throws
    // to Internal); what this test really pins down is the cleanup that follows.
    Task SubscribeOnce() =>
        grpcServer.SubscribeInstance(MakeRequest("corr-comm021"), responseStream, callContext);

    await Assert.ThrowsAsync<InvalidOperationException>(SubscribeOnce);

    // The entry was inserted before Subscribe ran; the catch has to take it out again
    // so a later subscription reusing the correlation id is not blocked, and so the
    // relay actor does not leak.
    Assert.Equal(0, grpcServer.ActiveStreamCount);

    // Subscribe never handed back a subscription id, so RemoveSubscriber must not be
    // called — that distinguishes the catch path from the finally path.
    throwingSubscriber.DidNotReceive().RemoveSubscriber(Arg.Any<IActorRef>());
}
|
||||
|
||||
[Fact]
|
||||
public void SetReady_AllowsStreamCreation()
|
||||
{
|
||||
|
||||
@@ -1007,6 +1007,151 @@ public class DeploymentServiceTests : TestKit
|
||||
public SilentProbeActor() => ReceiveAny(_ => { });
|
||||
}
|
||||
|
||||
// ── DeploymentManager-019: lifecycle timeouts must write an audit entry ──
|
||||
|
||||
/// <summary>
/// DeploymentManager-019: when a Disable times out at the site, the
/// operator's attempted action must still be recorded in the audit log
/// with the documented <c>DisableTimedOut</c> action — pre-fix the catch
/// block returned without calling the audit service, so the audit trail
/// was silent about the attempt.
/// </summary>
[Fact]
public async Task DisableInstanceAsync_LifecycleTimeout_WritesDisableTimedOutAuditEntry()
{
    var instance = ArrangeLifecycleTimeoutInstance(61, "TimeoutAuditInst", InstanceState.Enabled);
    var service = CreateLifecycleTimeoutService();

    var result = await service.DisableInstanceAsync(61, "operator-jane");

    Assert.True(result.IsFailure);
    await AssertLifecycleTimeoutAuditedAsync("DisableTimedOut", "61", instance);
}

/// <summary>
/// DeploymentManager-019: same audit guarantee for the Enable path.
/// </summary>
[Fact]
public async Task EnableInstanceAsync_LifecycleTimeout_WritesEnableTimedOutAuditEntry()
{
    var instance = ArrangeLifecycleTimeoutInstance(62, "EnableTimeoutInst", InstanceState.Disabled);
    var service = CreateLifecycleTimeoutService();

    var result = await service.EnableInstanceAsync(62, "operator-jane");

    Assert.True(result.IsFailure);
    await AssertLifecycleTimeoutAuditedAsync("EnableTimedOut", "62", instance);
}

/// <summary>
/// DeploymentManager-019: same audit guarantee for the Delete path.
/// </summary>
[Fact]
public async Task DeleteInstanceAsync_LifecycleTimeout_WritesDeleteTimedOutAuditEntry()
{
    var instance = ArrangeLifecycleTimeoutInstance(63, "DeleteTimeoutInst", InstanceState.Enabled);
    var service = CreateLifecycleTimeoutService();

    var result = await service.DeleteInstanceAsync(63, "operator-jane");

    Assert.True(result.IsFailure);
    await AssertLifecycleTimeoutAuditedAsync("DeleteTimedOut", "63", instance);
}

/// <summary>
/// Builds an instance on site 1 in the given state and stubs the repository so
/// a lookup by <paramref name="id"/> returns it.
/// </summary>
/// <param name="id">Instance id the test will pass to the lifecycle command.</param>
/// <param name="name">Value handed to the <c>Instance</c> constructor.</param>
/// <param name="state">Starting state of the instance.</param>
/// <returns>The instance the repository stub now returns.</returns>
private Instance ArrangeLifecycleTimeoutInstance(int id, string name, InstanceState state)
{
    var instance = new Instance(name)
    {
        Id = id,
        SiteId = 1,
        State = state,
    };
    _repo.GetInstanceByIdAsync(id, Arg.Any<CancellationToken>()).Returns(instance);
    return instance;
}

/// <summary>
/// Creates a <c>DeploymentService</c> whose communication actor is a
/// <c>SilentProbeActor</c> (it swallows every message and never replies) and whose
/// <c>LifecycleCommandTimeout</c> is 300 ms, so any lifecycle command sent through
/// it runs into the service-level timeout rather than the 30 s communication timeout.
/// </summary>
private DeploymentService CreateLifecycleTimeoutService()
{
    var silentSite = Sys.ActorOf(Props.Create(() => new SilentProbeActor()));

    var comms = new CommunicationService(
        Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }),
        NullLogger<CommunicationService>.Instance);
    comms.SetCommunicationActor(silentSite);

    var siteRepo = Substitute.For<ISiteRepository>();
    return new DeploymentService(
        _repo, siteRepo, _pipeline, comms, _lockManager, _audit,
        new DiffService(),
        new DeploymentStatusNotifier(NullLogger<DeploymentStatusNotifier>.Instance),
        Options.Create(new DeploymentManagerOptions
        {
            OperationLockTimeout = TimeSpan.FromSeconds(5),
            LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300),
        }),
        NullLogger<DeploymentService>.Instance);
}

/// <summary>
/// Verifies that exactly one audit entry was logged for <c>operator-jane</c> with
/// the given action against the given instance.
/// </summary>
/// <param name="expectedAction">Audit action name, e.g. <c>DisableTimedOut</c>.</param>
/// <param name="expectedEntityId">Instance id as it appears in the audit entry.</param>
/// <param name="instance">Instance whose <c>UniqueName</c> the entry must carry.</param>
private async Task AssertLifecycleTimeoutAuditedAsync(
    string expectedAction,
    string expectedEntityId,
    Instance instance)
{
    await _audit.Received(1).LogAsync(
        "operator-jane",
        expectedAction,
        "Instance",
        expectedEntityId,
        instance.UniqueName,
        Arg.Any<object?>(),
        Arg.Any<CancellationToken>());
}
|
||||
|
||||
// ── DeploymentManager-003: post-success persistence must commit the Success status ──
|
||||
|
||||
[Fact]
|
||||
|
||||
@@ -337,6 +337,49 @@ public class ExternalSystemClientTests
|
||||
$"Call took {sw.Elapsed}, expected to time out near the configured 200ms window");
|
||||
}
|
||||
|
||||
/// <summary>
/// ExternalSystemGateway-019 regression. <see cref="HttpClient.Timeout"/> defaults
/// to 100 seconds and is enforced inside <c>SendAsync</c> by a private CTS; a
/// <see cref="TaskCanceledException"/> from that CTS trips neither the caller's
/// token nor the gateway's timeout CTS, so an operator-configured
/// <see cref="ExternalSystemGatewayOptions.DefaultHttpTimeout"/> above 100 s would be
/// silently clipped. The fix assigns
/// <see cref="System.Threading.Timeout.InfiniteTimeSpan"/> to the rented client so
/// the per-call <c>CancellationTokenSource(DefaultHttpTimeout)</c> in
/// <c>InvokeHttpAsync</c> is the only timeout source. This test checks that the
/// property has been changed by the time a call completes.
/// </summary>
[Fact]
public async Task Call_DisablesHttpClientFrameworkTimeoutSoLongTimeoutsArentClipped()
{
    var definition = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
    var endpoint = new ExternalSystemMethod("getData", "GET", "/data")
    {
        Id = 1,
        ExternalSystemDefinitionId = 1,
    };
    StubResolution(definition, endpoint);

    var rentedClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.OK, "{}"));

    // Sanity check: a freshly built client carries the framework's 100 s default —
    // the very value the fix has to override for timeouts above 100 s to be honoured.
    Assert.Equal(TimeSpan.FromSeconds(100), rentedClient.Timeout);

    _httpClientFactory.CreateClient(Arg.Any<string>()).Returns(rentedClient);

    var gatewayOptions = new ExternalSystemGatewayOptions
    {
        DefaultHttpTimeout = TimeSpan.FromMinutes(5),
    };
    var gateway = new ExternalSystemClient(
        _httpClientFactory,
        _repository,
        NullLogger<ExternalSystemClient>.Instance,
        options: Microsoft.Extensions.Options.Options.Create(gatewayOptions));

    var outcome = await gateway.CallAsync("TestAPI", "getData");

    Assert.True(outcome.Success);

    // Once the call has gone through, the rented client must report an infinite
    // timeout: the framework's 100 s clip is off and the per-call CTS built from
    // DefaultHttpTimeout is what bounds the request.
    Assert.Equal(Timeout.InfiniteTimeSpan, rentedClient.Timeout);
}
|
||||
|
||||
[Fact]
|
||||
public async Task Call_CallerCancellation_IsNotMisreportedAsTimeout()
|
||||
{
|
||||
|
||||
@@ -133,6 +133,83 @@ public class CentralHealthReportLoopTests
|
||||
Assert.Equal(fixedInstant.ToUnixTimeMilliseconds(), loop.CurrentSequenceNumber);
|
||||
}
|
||||
|
||||
/// <summary>
/// HealthMonitoring-018 regression: if <see cref="ICentralHealthAggregator.ProcessReport"/>
/// throws, the per-interval counters just drained by
/// <see cref="SiteHealthCollector.CollectReport"/> have to be put back into the
/// shared collector, so they carry over into the next interval instead of
/// vanishing. Mirrors the HealthMonitoring-017 fix in <see cref="HealthReportSender"/>.
/// </summary>
[Fact]
public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport()
{
    // Expected totals; every per-interval counter is seeded so each restore path runs.
    const int scriptErrors = 2;
    const int alarmErrors = 1;
    const int deadLetters = 3;
    const int auditWriteFailures = 1;
    const int redactionFailures = 2;

    var collector = new SiteHealthCollector();

    // The loop's first pass reads-and-resets these through CollectReport, the
    // aggregator then throws, and the restore has to bring them back.
    for (var i = 0; i < scriptErrors; i++)
    {
        collector.IncrementScriptError();
    }

    for (var i = 0; i < alarmErrors; i++)
    {
        collector.IncrementAlarmError();
    }

    for (var i = 0; i < deadLetters; i++)
    {
        collector.IncrementDeadLetter();
    }

    for (var i = 0; i < auditWriteFailures; i++)
    {
        collector.IncrementSiteAuditWriteFailures();
    }

    for (var i = 0; i < redactionFailures; i++)
    {
        collector.IncrementAuditRedactionFailure();
    }

    var flakyAggregator = new FailingThenSucceedingAggregator();
    var nodeProvider = new FakeClusterNodeProvider { SelfIsPrimary = true };
    var monitoringOptions = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });

    var loop = new CentralHealthReportLoop(
        collector,
        flakyAggregator,
        nodeProvider,
        monitoringOptions,
        NullLogger<CentralHealthReportLoop>.Instance);

    await RunLoopBriefly(loop, 450);

    // Call one threw and a later call succeeded: that first accepted report must
    // carry the counts accumulated during the failed interval.
    Assert.NotEmpty(flakyAggregator.Processed);
    var firstAccepted = flakyAggregator.Processed[0];
    Assert.Equal(scriptErrors, firstAccepted.ScriptErrorCount);
    Assert.Equal(alarmErrors, firstAccepted.AlarmEvaluationErrorCount);
    Assert.Equal(deadLetters, firstAccepted.DeadLetterCount);
    Assert.Equal(auditWriteFailures, firstAccepted.SiteAuditWriteFailures);
    Assert.Equal(redactionFailures, firstAccepted.AuditRedactionFailure);
}
|
||||
|
||||
/// <summary>
/// <see cref="ICentralHealthAggregator"/> test double: the very first
/// <c>ProcessReport</c> call throws, every later call records the report in
/// <see cref="Processed"/>. The remaining interface members are inert. Used by
/// <see cref="ProcessReportFailure_PreservesIntervalCountersForNextReport"/>
/// to exercise the HealthMonitoring-018 restore-on-failure path.
/// </summary>
private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator
{
    // Number of ProcessReport invocations so far, bumped atomically.
    private int _invocations;

    /// <summary>Reports accepted after the initial failure, in arrival order.</summary>
    public List<SiteHealthReport> Processed { get; } = [];

    /// <summary>Throws on the first invocation; stores the report on every later one.</summary>
    public void ProcessReport(SiteHealthReport report)
    {
        if (Interlocked.Increment(ref _invocations) == 1)
        {
            throw new InvalidOperationException("aggregator temporarily unavailable");
        }

        Processed.Add(report);
    }

    /// <summary>No-op.</summary>
    public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
    {
    }

    /// <summary>Always returns a new, empty dictionary.</summary>
    public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates()
    {
        return new Dictionary<string, SiteHealthState>();
    }

    /// <summary>Always returns <c>null</c>.</summary>
    public SiteHealthState? GetSiteState(string siteId)
    {
        return null;
    }
}
|
||||
|
||||
[Fact]
|
||||
public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
|
||||
{
|
||||
|
||||
@@ -315,6 +315,89 @@ public class HealthReportSenderTests
|
||||
Assert.Contains(warnings, w => w.Message.Contains("cluster nodes", StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
|
||||
/// <summary>
/// HealthMonitoring-017 regression: if the transport's <c>Send</c> throws, the
/// per-interval counters that <see cref="SiteHealthCollector.CollectReport"/> just
/// drained via <c>Interlocked.Exchange</c> have to be restored into the collector so
/// they carry into the next interval. Before the fix a transport failure left the
/// counts only in the report that never went out, and the next successful report
/// shipped with zeroed counters.
/// </summary>
[Fact]
public async Task SendFailure_PreservesIntervalCountersForNextReport()
{
    // Expected totals; script error, alarm error, dead letter, site audit write
    // failure and audit redaction failure are all seeded so each restore path runs.
    const int scriptErrors = 2;
    const int alarmErrors = 1;
    const int deadLetters = 3;
    const int auditWriteFailures = 1;
    const int redactionFailures = 2;

    var flakyTransport = new FailingThenSucceedingTransport();
    var collector = new SiteHealthCollector();
    collector.SetActiveNode(true);

    for (var i = 0; i < scriptErrors; i++)
    {
        collector.IncrementScriptError();
    }

    for (var i = 0; i < alarmErrors; i++)
    {
        collector.IncrementAlarmError();
    }

    for (var i = 0; i < deadLetters; i++)
    {
        collector.IncrementDeadLetter();
    }

    for (var i = 0; i < auditWriteFailures; i++)
    {
        collector.IncrementSiteAuditWriteFailures();
    }

    for (var i = 0; i < redactionFailures; i++)
    {
        collector.IncrementAuditRedactionFailure();
    }

    var monitoringOptions = Options.Create(new HealthMonitoringOptions
    {
        ReportInterval = TimeSpan.FromMilliseconds(50)
    });

    var sender = new HealthReportSender(
        collector,
        flakyTransport,
        monitoringOptions,
        NullLogger<HealthReportSender>.Instance,
        new FakeSiteIdentityProvider());

    using var startupCts = new CancellationTokenSource(TimeSpan.FromMilliseconds(500));
    try
    {
        await sender.StartAsync(startupCts.Token);
        await Task.Delay(450, CancellationToken.None);
        await sender.StopAsync(CancellationToken.None);
    }
    catch (OperationCanceledException)
    {
    }

    // Interval one's Send threw and a later one went through: that first delivered
    // report must include the counts accumulated during the failed interval.
    Assert.NotEmpty(flakyTransport.SentReports);
    var firstDelivered = flakyTransport.SentReports[0];
    Assert.Equal(scriptErrors, firstDelivered.ScriptErrorCount);
    Assert.Equal(alarmErrors, firstDelivered.AlarmEvaluationErrorCount);
    Assert.Equal(deadLetters, firstDelivered.DeadLetterCount);
    Assert.Equal(auditWriteFailures, firstDelivered.SiteAuditWriteFailures);
    Assert.Equal(redactionFailures, firstDelivered.AuditRedactionFailure);
}
|
||||
|
||||
/// <summary>
/// <see cref="IHealthReportTransport"/> test double: the very first <c>Send</c>
/// call throws, every later call records the report in <see cref="SentReports"/>.
/// Used by <see cref="SendFailure_PreservesIntervalCountersForNextReport"/> to
/// exercise the HealthMonitoring-017 restore-on-failure path.
/// </summary>
private sealed class FailingThenSucceedingTransport : IHealthReportTransport
{
    // Number of Send invocations so far, bumped atomically.
    private int _invocations;

    /// <summary>Reports accepted after the initial failure, in arrival order.</summary>
    public List<SiteHealthReport> SentReports { get; } = [];

    /// <summary>Throws on the first invocation; stores the report on every later one.</summary>
    public void Send(SiteHealthReport report)
    {
        if (Interlocked.Increment(ref _invocations) == 1)
        {
            throw new InvalidOperationException("transport temporarily unavailable");
        }

        SentReports.Add(report);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// HealthMonitoring-006 regression: the sequence-number seed must be derived
|
||||
/// from the injected <see cref="TimeProvider"/> so the Unix-ms seeding strategy
|
||||
|
||||
@@ -64,4 +64,48 @@ public class LoggerConfigurationTests
|
||||
Assert.Single(sink.LogEvents);
|
||||
Assert.Equal(LogEventLevel.Information, sink.LogEvents[0].Level);
|
||||
}
|
||||
|
||||
/// <summary>
/// Host-022: an unrecognised <c>ScadaLink:Logging:MinimumLevel</c> (for instance the
/// typo "Informaiton") must NOT abort startup, but it MUST produce a one-shot
/// warning that names both the offending value and the fallback, so the coercion
/// is no longer silent. Null/blank counts as "unset" and defaults without a warning.
/// </summary>
[Fact]
public void ParseLevel_UnrecognisedValue_FallsBackAndWarns()
{
    const string typo = "Informaiton";
    var warningSink = new StringWriter();

    var level = LoggerConfigurationFactory.ParseLevel(typo, warningSink);

    Assert.Equal(LogEventLevel.Information, level);

    var emitted = warningSink.ToString();
    Assert.Contains("warning", emitted, StringComparison.OrdinalIgnoreCase);
    Assert.Contains(typo, emitted);
    Assert.Contains("Information", emitted);
}
|
||||
|
||||
[Fact]
public void ParseLevel_NullOrBlank_FallsBackSilently()
{
    // Both calls share one sink, so a single emptiness check covers both inputs.
    var warningSink = new StringWriter();

    var levelForNull = LoggerConfigurationFactory.ParseLevel(null, warningSink);
    var levelForBlank = LoggerConfigurationFactory.ParseLevel(" ", warningSink);

    Assert.Equal(LogEventLevel.Information, levelForNull);
    Assert.Equal(LogEventLevel.Information, levelForBlank);

    var emitted = warningSink.ToString();
    Assert.Empty(emitted);
}
|
||||
|
||||
[Fact]
public void ParseLevel_RecognisedValue_NoWarning()
{
    var warningSink = new StringWriter();

    var level = LoggerConfigurationFactory.ParseLevel("Warning", warningSink);

    // A valid level name is returned as-is and nothing is written to the sink.
    Assert.Equal(LogEventLevel.Warning, level);

    var emitted = warningSink.ToString();
    Assert.Empty(emitted);
}
|
||||
}
|
||||
|
||||
@@ -689,4 +689,115 @@ public class AuditWriteMiddlewareTests
|
||||
$"ResponseSummary byte count {Encoding.UTF8.GetByteCount(evt.ResponseSummary!)} exceeded cap {cap}");
|
||||
Assert.True(evt.PayloadTruncated);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// InboundAPI-018: asynchronously faulted audit-write tasks must be
|
||||
// observed (logged) rather than silently dropped — but must still NOT
|
||||
// alter the user-facing HTTP response (alog.md §13).
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
/// <summary>
/// Test-only writer whose <see cref="WriteAsync"/> hands back a Task that faults
/// only AFTER an asynchronous boundary, i.e. too late for a synchronous try/catch
/// around the call in <see cref="AuditWriteMiddleware"/> to see the exception —
/// the fire-and-forget situation InboundAPI-018 closes.
/// </summary>
private sealed class AsyncFaultingAuditWriter : ICentralAuditWriter
{
    /// <summary>
    /// Returns a task that yields once and then faults with
    /// <see cref="InvalidOperationException"/>; both arguments are ignored.
    /// </summary>
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        // Yield first so the exception is delivered through the returned Task
        // rather than thrown synchronously at the WriteAsync call site.
        await Task.Yield();
        throw new InvalidOperationException("async audit write failed");
    }
}
|
||||
|
||||
/// <summary>
/// In-memory <see cref="ILogger{TCategoryName}"/> that stores every entry written
/// through it, so a test can assert on the Warning emitted by
/// <see cref="AuditWriteMiddleware.ObserveAuditWriteFault"/>.
/// </summary>
private sealed class RecordingLogger : Microsoft.Extensions.Logging.ILogger<AuditWriteMiddleware>
{
    /// <summary>
    /// Captured entries in write order. Writes happen under <c>lock (Entries)</c>,
    /// so readers that may race with a writer should take the same lock.
    /// </summary>
    public List<(Microsoft.Extensions.Logging.LogLevel Level, string Message, Exception? Exception)> Entries { get; }
        = new List<(Microsoft.Extensions.Logging.LogLevel Level, string Message, Exception? Exception)>();

    /// <summary>Scopes are not tracked; a shared no-op scope is returned.</summary>
    public IDisposable BeginScope<TState>(TState state) where TState : notnull
    {
        return NullScope.Instance;
    }

    /// <summary>Every level is enabled so nothing is filtered out before capture.</summary>
    public bool IsEnabled(Microsoft.Extensions.Logging.LogLevel logLevel)
    {
        return true;
    }

    /// <summary>Formats the message and appends the entry under the list lock.</summary>
    public void Log<TState>(
        Microsoft.Extensions.Logging.LogLevel logLevel,
        Microsoft.Extensions.Logging.EventId eventId,
        TState state,
        Exception? exception,
        Func<TState, Exception?, string> formatter)
    {
        lock (Entries)
        {
            var rendered = formatter(state, exception);
            Entries.Add((logLevel, rendered, exception));
        }
    }

    /// <summary>Disposable that does nothing; stands in for a logging scope.</summary>
    private sealed class NullScope : IDisposable
    {
        public static readonly NullScope Instance = new();

        public void Dispose()
        {
        }
    }
}
|
||||
|
||||
/// <summary>
/// InboundAPI-018: an audit write that faults asynchronously must be observed and
/// logged as a Warning carrying the exception, while the HTTP response produced by
/// the rest of the pipeline stays exactly as it was.
/// </summary>
[Fact]
public async Task AuditWriter_AsyncFault_IsObserved_AsWarning_AndDoesNotAlterResponse()
{
    var writer = new AsyncFaultingAuditWriter();
    var logger = new RecordingLogger();
    var ctx = BuildContext();

    var mw = new AuditWriteMiddleware(
        next: _ =>
        {
            ctx.Response.StatusCode = 200;
            return Task.CompletedTask;
        },
        auditWriter: writer,
        logger: logger,
        options: new StaticAuditLogOptionsMonitor(new AuditLogOptions()));

    await mw.InvokeAsync(ctx);

    // The user-facing response is untouched — audit emission is best-effort.
    Assert.Equal(200, ctx.Response.StatusCode);

    // The fault is logged from an off-thread continuation, so poll for it for up to
    // two seconds instead of sleeping a fixed amount. A Stopwatch is used so the
    // wait is unaffected by wall-clock adjustments.
    var waited = System.Diagnostics.Stopwatch.StartNew();
    var maxWait = TimeSpan.FromSeconds(2);
    var observed = false;
    while (!observed && waited.Elapsed < maxWait)
    {
        // Same lock RecordingLogger.Log takes while appending.
        lock (logger.Entries)
        {
            observed = logger.Entries.Any(e =>
                e.Level == Microsoft.Extensions.Logging.LogLevel.Warning
                && e.Exception is not null
                && e.Message.Contains("async audit write faulted"));
        }

        if (!observed)
        {
            await Task.Delay(20);
        }
    }

    if (observed)
    {
        return;
    }

    // The continuation never logged — pre-fix the fault was swallowed entirely.
    // Snapshot under the lock: a late continuation may still be appending, and an
    // unguarded enumeration could throw and hide the diagnostic message below.
    string captured;
    lock (logger.Entries)
    {
        captured = string.Join(", ", logger.Entries.Select(e => $"{e.Level}: {e.Message}"));
    }

    Assert.Fail(
        "Expected a Warning log entry observing the async audit-write fault — none found. " +
        $"Entries: [{captured}]");
}
|
||||
}
|
||||
|
||||
@@ -434,6 +434,10 @@ public class SecurityReviewRegressionTests
|
||||
services.AddLogging();
|
||||
services.AddDataProtection();
|
||||
services.AddSecurity();
|
||||
// Security-020: the cookie PostConfigure reads SecurityOptions.Value,
|
||||
// which triggers SecurityOptionsValidator — supply the required LDAP
|
||||
// fields so the cookie wiring under test can be resolved.
|
||||
ConfigureValidLdapDefaults(services);
|
||||
|
||||
using var provider = services.BuildServiceProvider();
|
||||
var cookieOptions = provider
|
||||
@@ -446,6 +450,20 @@ public class SecurityReviewRegressionTests
|
||||
Assert.True(cookieOptions.Cookie.HttpOnly);
|
||||
}
|
||||
|
||||
/// <summary>
/// Security-020: registers placeholder values for the LDAP server and search base
/// so resolving the cookie / JWT wiring does not fail in
/// <see cref="SecurityOptionsValidator"/>. Intended for the cookie-policy
/// integration tests in this class, which assert on the cookie options shape and
/// do not care about the LDAP fields themselves.
/// </summary>
/// <param name="services">Service collection the options configuration is added to.</param>
private static void ConfigureValidLdapDefaults(IServiceCollection services)
{
    services.Configure<SecurityOptions>(securityOptions =>
    {
        securityOptions.LdapServer = "ldap.example.com";
        securityOptions.LdapSearchBase = "dc=example,dc=com";
    });
}
|
||||
|
||||
// --- CentralUI-005: cookie auth must use a sliding session window ---
|
||||
// Documented policy (CLAUDE.md Security & Auth): sliding refresh with a
|
||||
// 30-minute idle timeout. The cookie middleware must enable SlidingExpiration
|
||||
@@ -458,6 +476,7 @@ public class SecurityReviewRegressionTests
|
||||
services.AddLogging();
|
||||
services.AddDataProtection();
|
||||
services.AddSecurity();
|
||||
ConfigureValidLdapDefaults(services);
|
||||
|
||||
using var provider = services.BuildServiceProvider();
|
||||
var cookieOptions = provider
|
||||
@@ -474,6 +493,7 @@ public class SecurityReviewRegressionTests
|
||||
services.AddLogging();
|
||||
services.AddDataProtection();
|
||||
services.AddSecurity();
|
||||
ConfigureValidLdapDefaults(services);
|
||||
// The idle timeout drives the cookie's expiry window.
|
||||
services.Configure<SecurityOptions>(o => o.IdleTimeoutMinutes = 30);
|
||||
|
||||
@@ -492,6 +512,7 @@ public class SecurityReviewRegressionTests
|
||||
services.AddLogging();
|
||||
services.AddDataProtection();
|
||||
services.AddSecurity();
|
||||
ConfigureValidLdapDefaults(services);
|
||||
services.Configure<SecurityOptions>(o => o.IdleTimeoutMinutes = 45);
|
||||
|
||||
using var provider = services.BuildServiceProvider();
|
||||
@@ -1196,3 +1217,100 @@ public class AuthorizationPolicyTests
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Code Review Regression Tests — Security-020
|
||||
|
||||
/// <summary>
/// Security-020 regression coverage for <see cref="SecurityOptionsValidator"/>.
/// An empty or whitespace <see cref="SecurityOptions.LdapServer"/> or
/// <see cref="SecurityOptions.LdapSearchBase"/> must be rejected with a
/// message that names the offending configuration key, so that a typo'd
/// appsettings section fails fast at boot rather than surfacing much later
/// as a generic LDAP error on the first real login.
/// </summary>
public class SecurityOptionsValidatorTests
{
    private const string PlaceholderLdapServer = "ldap.example.com";
    private const string PlaceholderLdapSearchBase = "dc=example,dc=com";

    /// <summary>Builds an options instance with both required LDAP fields populated.</summary>
    private static SecurityOptions ValidOptions()
    {
        return new SecurityOptions
        {
            LdapServer = PlaceholderLdapServer,
            LdapSearchBase = PlaceholderLdapSearchBase,
        };
    }

    [Fact]
    public void Validate_AllRequiredFieldsSet_Succeeds()
    {
        var sut = new SecurityOptionsValidator();
        var wellFormed = ValidOptions();

        var outcome = sut.Validate(name: null, wellFormed);

        Assert.True(outcome.Succeeded);
    }

    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    public void Validate_EmptyOrWhitespaceLdapServer_Fails(string ldapServer)
    {
        var sut = new SecurityOptionsValidator();
        var candidate = ValidOptions();
        candidate.LdapServer = ldapServer;

        var outcome = sut.Validate(name: null, candidate);

        Assert.True(outcome.Failed);
        // The message has to carry the full Section:Field key so an operator
        // can locate the setting straight from the error text.
        Assert.Contains("Security:LdapServer", outcome.FailureMessage);
    }

    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    public void Validate_EmptyOrWhitespaceLdapSearchBase_Fails(string ldapSearchBase)
    {
        var sut = new SecurityOptionsValidator();
        var candidate = ValidOptions();
        candidate.LdapSearchBase = ldapSearchBase;

        var outcome = sut.Validate(name: null, candidate);

        Assert.True(outcome.Failed);
        Assert.Contains("Security:LdapSearchBase", outcome.FailureMessage);
    }

    [Fact]
    public void Validate_BothRequiredFieldsEmpty_ReportsBoth()
    {
        var sut = new SecurityOptionsValidator();
        var blank = new SecurityOptions();
        blank.LdapServer = string.Empty;
        blank.LdapSearchBase = string.Empty;

        var outcome = sut.Validate(name: null, blank);

        Assert.True(outcome.Failed);
        // One failure message must name both keys: fixing the first should
        // not require a second run just to learn about the other.
        Assert.Contains("Security:LdapServer", outcome.FailureMessage);
        Assert.Contains("Security:LdapSearchBase", outcome.FailureMessage);
    }

    [Fact]
    public void AddSecurity_RegistersSecurityOptionsValidator()
    {
        var registrations = new ServiceCollection();
        registrations.AddLogging();
        registrations.AddDataProtection();
        registrations.AddSecurity();

        using (var provider = registrations.BuildServiceProvider())
        {
            // The validator is exposed through IValidateOptions<SecurityOptions>;
            // that registration is the wiring the Security-020 ValidateOnStart()
            // check depends on to run at all.
            var registered = provider
                .GetServices<IValidateOptions<SecurityOptions>>()
                .ToArray();

            Assert.Contains(registered, candidate => candidate is SecurityOptionsValidator);
        }
    }
}
|
||||
|
||||
#endregion
|
||||
|
||||
@@ -513,4 +513,73 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable
|
||||
// counts only sweep retries, so a freshly buffered message has RetryCount 0.
|
||||
Assert.Equal(0, msg.RetryCount);
|
||||
}
|
||||
|
||||
// ─── StoreAndForward-024: StopAsync waits for the in-flight sweep ───
|
||||
|
||||
/// <summary>
/// StoreAndForward-024: <see cref="StoreAndForwardService.StopAsync"/> must
/// not return until any in-flight retry sweep has completed (or the bounded
/// shutdown timeout fires). Pre-fix it disposed the timer and returned
/// immediately, leaving a mid-flight sweep touching disposed dependencies.
/// </summary>
/// <remarks>
/// The delivery handler parks on a gate that only this test opens. Cleanup
/// runs in a <c>finally</c> so that a failed assertion cannot leave the sweep
/// parked on that gate (and the retry timer running) for the remainder of
/// the test run.
/// </remarks>
[Fact]
public async Task StopAsync_AwaitsInFlightRetrySweep_BeforeReturning()
{
    // Build a service whose timer fires almost immediately, with a handler
    // that pauses in the middle of delivery so we can observe StopAsync's
    // wait behaviour.
    var dbName = $"StopWait_{Guid.NewGuid():N}";
    var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
    // Held open for the whole test so the connection string keeps resolving
    // to the same in-memory database.
    using var keepAlive = new SqliteConnection(connStr);
    keepAlive.Open();

    var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
    await storage.InitializeAsync();

    var options = new StoreAndForwardOptions
    {
        DefaultRetryInterval = TimeSpan.Zero,
        DefaultMaxRetries = 3,
        // Fire almost immediately so the sweep is in-flight by the time we call StopAsync.
        RetryTimerInterval = TimeSpan.FromMilliseconds(20),
    };
    var service = new StoreAndForwardService(
        storage, options, NullLogger<StoreAndForwardService>.Instance);

    // Pre-seed a buffered message so the sweep has work to do, and a
    // handler that blocks until we release it.
    var handlerEntered = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var releaseHandler = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var handlerCompleted = false;
    service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, async _ =>
    {
        handlerEntered.TrySetResult();
        await releaseHandler.Task;
        handlerCompleted = true;
        return true;
    });

    var seed = await service.EnqueueAsync(
        StoreAndForwardCategory.ExternalSystem, "api", """{}""",
        attemptImmediateDelivery: false);
    Assert.True(seed.WasBuffered);

    await service.StartAsync();
    var stopRequested = false;
    try
    {
        // Wait until the timer-driven sweep has called into the handler.
        var entered = await Task.WhenAny(handlerEntered.Task, Task.Delay(TimeSpan.FromSeconds(2)));
        Assert.Same(handlerEntered.Task, entered);
        Assert.False(handlerCompleted, "Handler should still be paused inside the sweep.");

        // Kick StopAsync without awaiting it yet: while the handler is still
        // parked on the gate, the returned task must remain incomplete.
        var stopTask = service.StopAsync();
        stopRequested = true;
        Assert.False(stopTask.IsCompleted,
            "StopAsync returned before the in-flight sweep was given a chance to finish.");

        // Release the handler — StopAsync should now complete shortly.
        releaseHandler.SetResult();
        await stopTask.WaitAsync(TimeSpan.FromSeconds(5));
        Assert.True(handlerCompleted,
            "Sweep handler must have finished before StopAsync returned.");
    }
    finally
    {
        // No-op on the success path (the gate is already open). On a failed
        // assertion this unblocks the parked handler so the sweep can finish.
        releaseHandler.TrySetResult();

        // If the test bailed out before asking the service to stop, stop it
        // here so the retry timer does not keep firing after the test ends.
        if (!stopRequested)
        {
            await service.StopAsync();
        }
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user