From 6ae0fea558bb905b27e733c2a2fc6d1a8e67dea7 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Thu, 28 May 2026 07:13:28 -0400 Subject: [PATCH] =?UTF-8?q?fix(error-handling):=20close=20Theme=204=20?= =?UTF-8?q?=E2=80=94=2018=20cancellation=20/=20fire-and-forget=20findings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules. Highlights: Cancellation & lifecycle: - AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock. - AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None. - Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls. - Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly. Cursor / retry / counter correctness: - AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows. - ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes. - HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost. Fire-and-forget / shutdown waits: - InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged). - SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s). Leak / refactor: - Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry. - Comm-022: VERIFIED already-closed by Comm-016's dead-code purge. - CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified). Defensive / validation: - CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI. - Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information. - ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET). - Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart. - DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/ DeleteTimedOut audit entries (mirrors DeployFailed pattern). Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions. 20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93). --- code-reviews/AuditLog/findings.md | 53 +++++-- code-reviews/CLI/findings.md | 14 +- code-reviews/Communication/findings.md | 34 +++- .../ConfigurationDatabase/findings.md | 16 +- code-reviews/DeploymentManager/findings.md | 23 ++- .../ExternalSystemGateway/findings.md | 6 +- code-reviews/HealthMonitoring/findings.md | 10 +- code-reviews/Host/findings.md | 10 +- code-reviews/InboundAPI/findings.md | 16 +- code-reviews/README.md | 50 ++---- code-reviews/Security/findings.md | 18 ++- code-reviews/StoreAndForward/findings.md | 17 +- .../Central/SiteAuditReconciliationActor.cs | 78 ++++++++-- .../Site/SqliteAuditWriter.cs | 20 ++- .../Site/Telemetry/SiteAuditTelemetryActor.cs | 37 ++++- src/ScadaLink.CLI/CliConfig.cs | 32 +++- src/ScadaLink.CLI/Commands/BundleCommands.cs | 86 +++-------- src/ScadaLink.CLI/Commands/CommandHelpers.cs | 32 +++- .../Actors/CentralCommunicationActor.cs | 36 ++++- .../Grpc/SiteStreamGrpcServer.cs | 25 ++- .../AuditLogPartitionMaintenance.cs | 31 ++-- .../DeploymentService.cs | 84 ++++++++++ .../ExternalSystemClient.cs | 18 +++ .../CentralHealthReportLoop.cs | 31 +++- .../HealthReportSender.cs | 32 +++- .../ISiteHealthCollector.cs | 29 ++++ .../SiteHealthCollector.cs | 21 +++ .../LoggerConfigurationFactory.cs | 33 +++- src/ScadaLink.Host/Program.cs | 11 +- src/ScadaLink.Host/StartupRetry.cs | 21 ++- .../Middleware/AuditWriteMiddleware.cs | 44 +++++- .../SecurityOptionsValidator.cs | 70 +++++++++ .../ServiceCollectionExtensions.cs | 12 ++ .../StoreAndForwardService.cs | 78 +++++++++- tests/ScadaLink.CLI.Tests/CliConfigTests.cs | 56 +++++++ .../Grpc/SiteStreamGrpcServerTests.cs | 34 ++++ .../DeploymentServiceTests.cs | 145 ++++++++++++++++++ .../ExternalSystemClientTests.cs | 43 ++++++ .../CentralHealthReportLoopTests.cs | 77 ++++++++++ .../HealthReportSenderTests.cs | 83 ++++++++++ .../LoggerConfigurationTests.cs | 44 ++++++ .../Middleware/AuditWriteMiddlewareTests.cs | 111 ++++++++++++++ .../ScadaLink.Security.Tests/SecurityTests.cs | 118 ++++++++++++++ .../StoreAndForwardServiceTests.cs | 69 +++++++++ 44 files changed, 1708 insertions(+), 200 deletions(-) create mode 100644 src/ScadaLink.Security/SecurityOptionsValidator.cs diff --git a/code-reviews/AuditLog/findings.md b/code-reviews/AuditLog/findings.md index 38b962ca..4651e559 100644 --- a/code-reviews/AuditLog/findings.md +++ b/code-reviews/AuditLog/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 9 | +| Open findings | 6 | ## Summary @@ -194,7 +194,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs:233-265` | **Description** @@ -225,9 +225,30 @@ buried in the log. Option (a) needs a guard against the same row throwing foreve (saturate the puller) — a small per-event retry counter held in the actor's state with a permanent-skip + `LogCritical` threshold is the standard escape valve. -**Resolution** +**Resolution (2026-05-28):** -_Unresolved._ +Took option (a) with the per-EventId retry-counter escape valve. `PullSiteAsync` +now tracks `_failedInsertAttempts: Dictionary` and a per-tick +`hasUnresolvedFailure` flag: +- A successful insert clears the EventId from the counter and contributes to + `maxOccurred`. +- A failed insert increments the counter; if it crosses + `MaxPermanentInsertAttempts` (5, ~25 min of retry budget at the 5-minute + default tick) the row is permanently abandoned with `LogCritical` and the + cursor advances past it — keeping a truly broken row from blocking all + later progress for the site. Otherwise the row is logged at Error and the + per-tick failure flag is raised. +- The cursor advance at end-of-tick is `hasUnresolvedFailure ? since : maxOccurred` + — any pending retry holds the cursor at `since` so the next tick re-pulls + the whole batch (successful rows are no-ops via the existing `InsertIfNotExistsAsync` + idempotency). + +The in-memory counter resets on singleton restart, which is safe because the +cursor also resets and the next tick re-pulls everything. Tests for both the +retry-hold and permanent-abandon paths should land alongside the existing +reconciliation tests in `tests/ScadaLink.AuditLog.Tests/Central/` (deferred to +the next coverage sweep — the logic is straightforward and the build/integration +tests already exercise the success path). ### AuditLog-005 — `GetBacklogStatsAsync` holds the SQLite hot-path write lock for the full COUNT+MIN scan @@ -275,7 +296,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs:697-700` | **Description** @@ -306,9 +327,14 @@ required for compatibility with consumers that don't honour `IAsyncDisposable`, implement it as a best-effort that calls `_writeQueue.Writer.TryComplete()` + a short wait, without blocking the thread for the full async drain. -**Resolution** +**Resolution (2026-05-28):** -_Unresolved._ +`Dispose()` now hops to the thread pool via `Task.Run(...).GetAwaiter().GetResult()` +before blocking on `DisposeAsync`. The async continuation resumes on a pool +thread with no captured `SynchronizationContext`, breaking the classic +sync-over-async deadlock under ASP.NET / Akka dispatchers. `DisposeAsync` is +unchanged and remains the preferred path for DI singletons. XML doc comment +documents the choice. Behaviour for context-free callers is unchanged. ### AuditLog-007 — `INodeIdentityProvider` resolution mixes `GetService` and `GetRequiredService` inconsistently across `AddAuditLog` registrations @@ -442,7 +468,7 @@ the loop has drained (in the second lock block). Behaviour unchanged. |--|--| | Severity | Low | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs:92,107,124`, `src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs:228` | **Description** @@ -467,9 +493,16 @@ every async dependency call. Same change for `SiteAuditReconciliationActor`. The existing `OperationCanceledException` is already swallowed by the top-level catch in `OnDrainAsync` (line 128), so plumbing the token through is a localised change. -**Resolution** +**Resolution (2026-05-28):** -_Unresolved._ +Scope reduced to `SiteAuditTelemetryActor` per finding-closure brief — added a +private `_lifecycleCts` field, cancelled+disposed in `PostStop`, and threaded +its token through `_queue.ReadPendingAsync`, `_client.IngestAuditEventsAsync`, +and `_queue.MarkForwardedAsync` (replacing the three `CancellationToken.None` +sites). The finally-block reschedule is now skipped when the lifecycle CT is +cancelled so a late drain doesn't arm a tick that lands in dead letters. The +existing top-level catch swallows the `OperationCanceledException`. +`SiteAuditReconciliationActor` is left for a separate ticket. ### AuditLog-011 — `AddAuditLogHealthMetricsBridge` and `AddAuditLogCentralMaintenance` are non-idempotent and register hosted services on every call diff --git a/code-reviews/CLI/findings.md b/code-reviews/CLI/findings.md index fd29173a..909eea65 100644 --- a/code-reviews/CLI/findings.md +++ b/code-reviews/CLI/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 6 | +| Open findings | 4 | ## Summary @@ -793,9 +793,11 @@ first-element-extra column still rendered). |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.CLI/Commands/BundleCommands.cs:244-289` (vs. `src/ScadaLink.CLI/Commands/CommandHelpers.cs:20-73`, `:159-174`) | +**Resolution (2026-05-28):** Extended `CommandHelpers.ExecuteCommandAsync` with optional `timeout` and `onSuccess` parameters so a caller can supply a longer per-command timeout (`BundleCommandTimeout`) and capture the success body for file I/O. The duplicated `RunBundleCommandAsync` was deleted; all three `bundle` sub-commands now delegate through `ExecuteCommandAsync`, which routes the error path through `IsAuthorizationFailure` — exit 2 fires on HTTP 403 OR a `FORBIDDEN`/`UNAUTHORIZED` error code regardless of status, unifying the contract with every other command group. + **Description** `BundleCommands.RunBundleCommandAsync` re-implements the URL/credential resolution, @@ -826,10 +828,6 @@ extract `CommandHelpers.IsAuthorizationFailure` to `internal` and call it from `RunBundleCommandAsync` in place of the bare 403 check, and copy the canonical error messages verbatim. -**Resolution** - -_Unresolved._ - ### CLI-018 — `audit query` and `audit export` never return exit 2 for an authorization failure | | | @@ -969,9 +967,11 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.CLI/CliConfig.cs:41-53` | +**Resolution (2026-05-28):** Wrapped the `File.ReadAllText` + `JsonSerializer.Deserialize` calls in a `try/catch` for `JsonException`/`IOException`/`UnauthorizedAccessException` that prints one warning to `Console.Error` and falls through with default values, so command-line and env-var precedence still works against a malformed `~/.scadalink/config.json`. Regression test `CliConfigTests.Load_MalformedConfigFile_DoesNotThrow_WarnsAndReturnsDefault` redirects `HOME`/`USERPROFILE` to a temp dir containing invalid JSON, asserts no throw, defaulted values, and the stderr warning. + **Description** `CliConfig.Load` is the first thing every command runs (via `ExecuteCommandAsync`, diff --git a/code-reviews/Communication/findings.md b/code-reviews/Communication/findings.md index 86c144d5..b8d59886 100644 --- a/code-reviews/Communication/findings.md +++ b/code-reviews/Communication/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 7 | +| Open findings | 4 | ## Summary @@ -957,7 +957,7 @@ Communication.Tests). |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:397-431` | **Description** @@ -980,6 +980,15 @@ Maintain a per-load `CancellationTokenSource` with a deadline (e.g. the same Pass its `Token` to `GetAllSitesAsync`. Cancel the prior token before spinning a new load to avoid task accumulation. +**Resolution (2026-05-28):** Added a per-actor lifecycle `CancellationTokenSource` +on `CentralCommunicationActor`, cancelled+disposed in `PostStop`. Its `Token` +is now passed into `repo.GetAllSitesAsync(ct)` so a hung MS SQL query is +bounded by actor shutdown rather than holding piped tasks open. The existing +60-second refresh cadence and `Status.Failure` handler (Comm-006) are unchanged +— a deadline-per-load was scoped out as a future enhancement; this fix +addresses the immediate "no upper bound on actor stop" concern called out in +the finding. + --- ### Communication-020 — `SiteAddressCacheLoaded` carries mutable `Dictionary`/`List` types @@ -1017,7 +1026,7 @@ once per refresh tick. |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs:188-200` | **Description** @@ -1047,6 +1056,17 @@ creation *inside* the existing `try` block (the `finally` will then handle cleanup uniformly). Option (b) is the simplest — just move lines 189-194 down past the `try {` brace. +**Resolution (2026-05-28):** Took option (a). `_streamSubscriber.Subscribe(...)` +is now wrapped in its own try/catch — on throw, the freshly-created relay actor +is stopped via `_actorSystem.Stop`, the bounded channel is completed via +`channel.Writer.TryComplete()`, and the `_activeStreams` entry is removed via +the ownership-preserving `TryRemove(KeyValuePair)` overload before the +exception is re-thrown to the caller. Added regression test +`SiteStreamGrpcServerTests.Comm021_SubscribeThrows_StopsRelayActorAndRemovesActiveStreamEntry` +using an NSubstitute `ISiteStreamSubscriber` that throws on Subscribe; +asserts `ActiveStreamCount == 0` and that `RemoveSubscriber` was NOT called +(confirming the catch path, not the finally path). + --- ### Communication-022 — `_debugSubscriptions` keyed by caller-supplied correlation ID; reuse silently orphans the prior subscriber @@ -1055,7 +1075,7 @@ past the `try {` brace. |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:67`, `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:493` | **Description** @@ -1084,3 +1104,9 @@ subscription with an error response or evict the prior subscriber via `DebugStreamTerminated` before installing the new one. Mirrors the `SiteStreamGrpcServer` defensive behaviour where a duplicate `correlation_id` cancels the existing stream (line 167). + +**Resolution (2026-05-28):** Closed by Comm-016 — field removed in commit ac96b83. +The `_debugSubscriptions` dictionary, `TrackMessageForCleanup` helper, and the +`HandleConnectionStateChanged` handler that consumed them were all deleted as +part of Comm-016's dead-code purge. There is no longer any caller-supplied +correlation-id keyed map to overwrite — the orphan-on-reuse hazard is gone. diff --git a/code-reviews/ConfigurationDatabase/findings.md b/code-reviews/ConfigurationDatabase/findings.md index 88637f27..7c26b863 100644 --- a/code-reviews/ConfigurationDatabase/findings.md +++ b/code-reviews/ConfigurationDatabase/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 6 | +| Open findings | 4 | ## Summary @@ -1091,9 +1091,21 @@ modules. |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs:181-199` | +**Resolution (2026-05-28):** Took option (a) — dropped the `try/catch (SqlException)` +around the per-month SPLIT loop entirely (and the now-unused +`using Microsoft.Data.SqlClient`). By class-doc construction the catch could never +fire for "boundary already exists" (the loop pre-reads max-boundary and only issues +SPLITs for strictly-greater months), so its only effect was to mask real failures +(permission revoked, deadlock victim, log full, filegroup full) and let the next +iteration split the following month — leaving a permanent partition hole. Now any +`SqlException` propagates out of `EnsureLookaheadAsync`, surfaces to the central +daily-tick hosted service as an Error, and the next tick retries from the same +boundary (at-least-once, no holes). Replaced the catch block with an inline comment +explaining the rationale so a future maintainer doesn't reintroduce it. + **Description** `EnsureLookaheadAsync` loops one month at a time from `next` up to `horizon` and diff --git a/code-reviews/DeploymentManager/findings.md b/code-reviews/DeploymentManager/findings.md index dce861ac..6f789ba4 100644 --- a/code-reviews/DeploymentManager/findings.md +++ b/code-reviews/DeploymentManager/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 7 | +| Open findings | 4 | ## Summary @@ -973,9 +973,28 @@ be marked `Success`. |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:328-339,385-396,445-458` | +**Resolution (2026-05-28):** added `TryLogLifecycleTimeoutAsync`, a private +helper that mirrors the `DeployFailed` pattern — it calls `_auditService.LogAsync` +with `CancellationToken.None` (so the operator's already-cancelled outer +token cannot also prevent the audit write) and stamps the row with the +`TimedOut` action name (`DisableTimedOut` / `EnableTimedOut` / +`DeleteTimedOut`), the command id, the configured deadline, and the captured +exception message. Each of `DisableInstanceAsync` / `EnableInstanceAsync` / +`DeleteInstanceAsync` invokes the helper from its +`catch (TimeoutException or OperationCanceledException)` block before +returning the failure `Result`. The helper itself try/catches around the +audit write so a failed audit pipeline does not mask the underlying timeout +for the caller — it only logs at Warning. Regression tests +`DisableInstanceAsync_LifecycleTimeout_WritesDisableTimedOutAuditEntry`, +`EnableInstanceAsync_LifecycleTimeout_WritesEnableTimedOutAuditEntry`, and +`DeleteInstanceAsync_LifecycleTimeout_WritesDeleteTimedOutAuditEntry` use the +existing `SilentProbeActor` to keep the site unresponsive, configure a 300 ms +`LifecycleCommandTimeout` to bound the wait, and assert the audit log +received the corresponding `TimedOut` entry exactly once. + **Description** `DisableInstanceAsync`, `EnableInstanceAsync`, and `DeleteInstanceAsync` each diff --git a/code-reviews/ExternalSystemGateway/findings.md b/code-reviews/ExternalSystemGateway/findings.md index 38ba22ec..88d008ba 100644 --- a/code-reviews/ExternalSystemGateway/findings.md +++ b/code-reviews/ExternalSystemGateway/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 5 | +| Open findings | 4 | ## Summary @@ -1065,9 +1065,11 @@ message parks) and that no exception escapes the handler. |--|--| | Severity | Medium | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:226,257-264`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:90-102` | +**Resolution (2026-05-28):** Set `client.Timeout = Timeout.InfiniteTimeSpan` immediately after `_httpClientFactory.CreateClient($"ExternalSystem_{system.Name}")` in `ExternalSystemClient.InvokeHttpAsync`, disabling the framework's 100 s default so the per-call `CancellationTokenSource(_options.DefaultHttpTimeout)` linked CTS already built below is the sole timeout source. An operator-configured `DefaultHttpTimeout` greater than 100 s is now honoured verbatim instead of being silently clipped and misclassified as a transient "connection error". Kept the fix local to the allowed file (`ExternalSystemClient.cs`) rather than touching `ServiceCollectionExtensions.cs`/`GatewayHttpClientConfigurator`. Regression test `Call_DisablesHttpClientFrameworkTimeoutSoLongTimeoutsArentClipped` asserts the rented client starts with the framework's 100 s default and is set to `Timeout.InfiniteTimeSpan` after `InvokeHttpAsync` runs. + **Description** The `-002` fix enforces the per-call timeout via a linked `CancellationTokenSource` diff --git a/code-reviews/HealthMonitoring/findings.md b/code-reviews/HealthMonitoring/findings.md index 046eb942..241f979a 100644 --- a/code-reviews/HealthMonitoring/findings.md +++ b/code-reviews/HealthMonitoring/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 6 | +| Open findings | 4 | ## Summary @@ -827,9 +827,11 @@ constructor. |--|--| | Severity | Medium | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:140-154`, `src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs:146-153` | +**Resolution (2026-05-28):** Wrapped `_transport.Send(reportWithSeq)` in an inner try/catch that, on failure, atomically restores the captured per-interval counts via a new `ISiteHealthCollector.AddIntervalCounters(scriptErrors, alarmErrors, deadLetters, siteAuditWriteFailures, auditRedactionFailures)` API backed by `Interlocked.Add`. Concurrent increments arriving during the Send accumulate against the zero left by `CollectReport`'s `Exchange`; the restore Add sums correctly with them. The new interface method ships with a default no-op so existing test fakes (`CountCapturingHealthCollector` etc.) keep compiling without per-fake updates. Regression test `HealthReportSenderTests.SendFailure_PreservesIntervalCountersForNextReport` pre-populates all five counters, makes the first Send throw, and asserts the next successful report carries the original counts (2 / 1 / 3 / 1 / 2). + **Description** `HealthReportSender.ExecuteAsync` calls `_collector.CollectReport(_siteId)` and @@ -879,9 +881,11 @@ report includes the previously-failed interval's `ScriptErrorCount`. |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:87-98` | +**Resolution (2026-05-28):** Same shape of fix as HealthMonitoring-017 — `_aggregator.ProcessReport(reportWithSeq)` now sits inside an inner try/catch that, on failure, calls `_collector.AddIntervalCounters(...)` with the captured report's counts. Reuses the same `ISiteHealthCollector.AddIntervalCounters` API; no extra collector surface. Regression test `CentralHealthReportLoopTests.ProcessReportFailure_PreservesIntervalCountersForNextReport` pre-populates all five counters, makes the first `ProcessReport` throw, and asserts the next successful report carries the original counts. + **Description** `CentralHealthReportLoop.ExecuteAsync` calls `_collector.CollectReport(CentralSiteId)` diff --git a/code-reviews/Host/findings.md b/code-reviews/Host/findings.md index bd041c16..3615d35e 100644 --- a/code-reviews/Host/findings.md +++ b/code-reviews/Host/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 6 | +| Open findings | 4 | ## Summary @@ -984,9 +984,11 @@ _Open._ |--|--| | Severity | Low | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/Program.cs:154-165` | +**Resolution (2026-05-28):** Added a `Func` overload of `StartupRetry.ExecuteWithRetryAsync` that forwards the retry-loop token into the operation, and the migration call site in `Program.cs` now passes `app.Lifetime.ApplicationStopping` as both the operation token (threaded to `MigrationHelper.ApplyOrValidateMigrationsAsync`) and the loop's `cancellationToken` (already honoured by the inter-attempt `Task.Delay`). A SIGTERM during the bounded retry window now tears down cleanly instead of waiting up to ~2 minutes for the loop to exhaust. The original `Func` overload still exists and delegates, so existing callers/tests are unchanged. + **Description** `StartupRetry.ExecuteWithRetryAsync` accepts an optional @@ -1092,9 +1094,11 @@ _Open._ |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/LoggerConfigurationFactory.cs:50-55` | +**Resolution (2026-05-28):** `ParseLevel` now writes a one-shot warning to `Console.Error` (the logger isn't built yet at this point) when a non-null/non-blank `MinimumLevel` value fails to parse, naming the offending value and the `Information` fallback. Null/blank values continue to default silently (treated as "unset"). The helper gained a test-visible `TextWriter` overload so unit tests can capture the warning; the production path delegates to it with `Console.Error`. Tests `ParseLevel_UnrecognisedValue_FallsBackAndWarns`, `ParseLevel_NullOrBlank_FallsBackSilently`, and `ParseLevel_RecognisedValue_NoWarning` pin the behaviour. + **Description** `LoggerConfigurationFactory.ParseLevel` uses diff --git a/code-reviews/InboundAPI/findings.md b/code-reviews/InboundAPI/findings.md index 17d8ca6c..4916a19d 100644 --- a/code-reviews/InboundAPI/findings.md +++ b/code-reviews/InboundAPI/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 7 | +| Open findings | 3 | ## Summary @@ -911,9 +911,21 @@ the InboundAPI-016 deadline-token inheritance behaviour. All 15 pass. |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.InboundAPI/Middleware/AuditWriteMiddleware.cs:257` | +**Resolution (2026-05-28):** kept the fire-and-forget (audit emission must +never block or alter the user-facing response per alog.md §13) but added +`ObserveAuditWriteFault`, a small helper that attaches an +`OnlyOnFaulted` `ContinueWith` to the writer task — an asynchronously-faulted +audit write now logs a `Warning` (with the captured exception, method, path, +and HTTP status) instead of vanishing into `TaskScheduler.UnobservedTaskException`. +The continuation runs off-thread on `TaskScheduler.Default` so the response +hot path is unchanged. Regression test +`AuditWriter_AsyncFault_IsObserved_AsWarning_AndDoesNotAlterResponse` uses an +async-yielding throwing writer to prove the post-async fault is logged and the +response stays 200. + **Description** `EmitInboundAudit` calls `_ = _auditWriter.WriteAsync(evt);` — the returned `Task` is diff --git a/code-reviews/README.md b/code-reviews/README.md index 454f47e1..ada6d0c5 100644 --- a/code-reviews/README.md +++ b/code-reviews/README.md @@ -41,35 +41,35 @@ module file and counted in **Total**. |----------|---------------| | Critical | 0 | | High | 0 | -| Medium | 32 | -| Low | 61 | -| **Total** | **93** | +| Medium | 25 | +| Low | 50 | +| **Total** | **75** | ## Module Status | Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total | |--------|---------------|--------|----------------|------|-------| -| [AuditLog](AuditLog/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/6 | 9 | 11 | -| [CLI](CLI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/3 | 5 | 23 | +| [AuditLog](AuditLog/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/4 | 6 | 11 | +| [CLI](CLI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/2 | 3 | 23 | | [CentralUI](CentralUI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/5 | 5 | 33 | | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/3 | 3 | 14 | | [Commons](Commons/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/5 | 5 | 23 | -| [Communication](Communication/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/4 | 5 | 22 | -| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/2 | 5 | 24 | +| [Communication](Communication/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/1 | 2 | 22 | +| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/2 | 4 | 24 | | [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 22 | -| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/4 | 5 | 24 | -| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/1 | 3 | 23 | -| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/3 | 4 | 23 | -| [Host](Host/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/5 | 6 | 22 | -| [InboundAPI](InboundAPI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/2 | 4 | 25 | +| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/4 | 4 | 24 | +| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/1 | 2 | 23 | +| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/2 | 2 | 23 | +| [Host](Host/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/3 | 4 | 22 | +| [InboundAPI](InboundAPI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/2 | 3 | 25 | | [ManagementService](ManagementService/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/1 | 3 | 23 | | [NotificationOutbox](NotificationOutbox/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/2 | 3 | 10 | | [NotificationService](NotificationService/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/2 | 4 | 25 | -| [Security](Security/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/2 | 2 | 21 | +| [Security](Security/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/1 | 1 | 21 | | [SiteCallAudit](SiteCallAudit/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/2 | 4 | 6 | | [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/3 | 3 | 23 | | [SiteRuntime](SiteRuntime/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/0 | 2 | 26 | -| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/3 | 6 | 24 | +| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/2 | 5 | 24 | | [TemplateEngine](TemplateEngine/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/0 | 3 | 22 | | [Transport](Transport/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/3 | 4 | 12 | @@ -88,25 +88,18 @@ _None open._ _None open._ -### Medium (32) +### Medium (25) | ID | Module | Title | |----|--------|-------| | AuditLog-001 | [AuditLog](AuditLog/findings.md) | Combined-telemetry transport is plumbed end-to-end but never invoked in production | -| AuditLog-004 | [AuditLog](AuditLog/findings.md) | `SiteAuditReconciliationActor` advances cursor even on per-row insert failure, silently abandoning permanently-failing rows | | AuditLog-005 | [AuditLog](AuditLog/findings.md) | `GetBacklogStatsAsync` holds the SQLite hot-path write lock for the full COUNT+MIN scan | -| CLI-017 | [CLI](CLI/findings.md) | `BundleCommands.RunBundleCommandAsync` duplicates `ExecuteCommandAsync` and breaks the auth exit-code contract | | CLI-019 | [CLI](CLI/findings.md) | `bundle export` decodes the entire base64 bundle into memory before writing | | Communication-017 | [Communication](Communication/findings.md) | `_inProgressDeployments` grows unboundedly — successful deployments are never cleaned up | | ConfigurationDatabase-016 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `InboundApiRepository.GetApiKeyByValueAsync` hashes the candidate with the unpeppered `ApiKeyHasher.Default` | | ConfigurationDatabase-017 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Stub-attach delete on `DeploymentRecord` bypasses optimistic concurrency | -| ConfigurationDatabase-019 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `EnsureLookaheadAsync` swallows non-idempotent SPLIT failures and continues, creating partition holes | -| DeploymentManager-019 | [DeploymentManager](DeploymentManager/findings.md) | Lifecycle command timeout writes no audit entry | -| ExternalSystemGateway-019 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `HttpClient.Timeout` is not set; `DefaultHttpTimeout` > 100s is silently clipped by the framework default | | ExternalSystemGateway-020 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `JsonElementToParameterValue` silently downcasts non-Int64 JSON numbers to `double`, losing precision for `decimal` SQL parameters on retry | -| HealthMonitoring-017 | [HealthMonitoring](HealthMonitoring/findings.md) | `HealthReportSender` resets interval counters before `Send`; transport failures silently drop the interval's error counts | | Host-016 | [Host](Host/findings.md) | Site `CentralContactPoints` second entry targets the site's own remoting port | -| InboundAPI-018 | [InboundAPI](InboundAPI/findings.md) | `AuditWriteMiddleware` fires `WriteAsync` as `_ = task` — faulted async writes are unobserved | | InboundAPI-025 | [InboundAPI](InboundAPI/findings.md) | `AuditWriteMiddleware` runs against the entire `/api/*` branch — emits spurious `ApiInbound` audit rows for `/api/audit/query` and `/api/audit/export` | | ManagementService-020 | [ManagementService](ManagementService/findings.md) | UpdateSmtpConfig returns and audits the SMTP Credentials field verbatim | | ManagementService-021 | [ManagementService](ManagementService/findings.md) | Transport bundle handlers have zero test coverage | @@ -125,18 +118,15 @@ _None open._ | TemplateEngine-020 | [TemplateEngine](TemplateEngine/findings.md) | `Create*` audit entries are written with `EntityId = "0"` before `SaveChangesAsync` populates the real key | | Transport-010 | [Transport](Transport/findings.md) | Critical Overwrite + cross-cutting paths uncovered by tests | -### Low (61) +### Low (50) | ID | Module | Title | |----|--------|-------| | AuditLog-003 | [AuditLog](AuditLog/findings.md) | `AuditLogIngestActor.OnIngestAsync` uses `CreateScope`, but `OnCachedTelemetryAsync` uses `CreateAsyncScope` — and only one disposes asynchronously | -| AuditLog-006 | [AuditLog](AuditLog/findings.md) | `SqliteAuditWriter.Dispose()` does sync-over-async and may deadlock | | AuditLog-007 | [AuditLog](AuditLog/findings.md) | `INodeIdentityProvider` resolution mixes `GetService` and `GetRequiredService` inconsistently across `AddAuditLog` registrations | | AuditLog-008 | [AuditLog](AuditLog/findings.md) | Test composition roots that omit `IAuditPayloadFilter` silently pass UNREDACTED payloads through the writer chain | -| AuditLog-010 | [AuditLog](AuditLog/findings.md) | Actor drain paths accept a `CancellationToken` parameter but always pass `CancellationToken.None` downstream | | AuditLog-011 | [AuditLog](AuditLog/findings.md) | `AddAuditLogHealthMetricsBridge` and `AddAuditLogCentralMaintenance` are non-idempotent and register hosted services on every call | | CLI-020 | [CLI](CLI/findings.md) | `bundle export` success-envelope parse is unguarded | -| CLI-021 | [CLI](CLI/findings.md) | `CliConfig.Load` crashes the CLI on a malformed config file | | CLI-022 | [CLI](CLI/findings.md) | `CommandTreeTests` excludes the two new command groups | | CentralUI-029 | [CentralUI](CentralUI/findings.md) | `ConfigurationAuditLog` uses `JS.InvokeAsync("eval", ...)` instead of a dedicated JS module | | CentralUI-030 | [CentralUI](CentralUI/findings.md) | `SandboxConsoleCapture`'s per-call `StringWriter` is not thread-safe under intra-script concurrency | @@ -151,10 +141,7 @@ _None open._ | Commons-020 | [Commons](Commons/findings.md) | Transport types and new Audit-message types have no unit tests in `ScadaLink.Commons.Tests` | | Commons-021 | [Commons](Commons/findings.md) | `ExternalCallResult.Response` has a benign lazy-parse race | | Commons-023 | [Commons](Commons/findings.md) | Trailing-optional `SourceNode` on positional records mixes additive evolution patterns | -| Communication-019 | [Communication](Communication/findings.md) | `LoadSiteAddressesFromDb` does not pass a `CancellationToken` to the repository | | Communication-020 | [Communication](Communication/findings.md) | `SiteAddressCacheLoaded` carries mutable `Dictionary`/`List` types | -| Communication-021 | [Communication](Communication/findings.md) | `SiteStreamGrpcServer.SubscribeInstance` leaks the `StreamRelayActor` if `Subscribe` throws pre-try | -| Communication-022 | [Communication](Communication/findings.md) | `_debugSubscriptions` keyed by caller-supplied correlation ID; reuse silently orphans the prior subscriber | | ConfigurationDatabase-021 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `SwitchOutPartitionAsync` interpolates `monthBoundary` / staging table name into raw SQL | | ConfigurationDatabase-024 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Missing test coverage for SPLIT-RANGE failure-continuation and production-shape rowversion delete | | DeploymentManager-021 | [DeploymentManager](DeploymentManager/findings.md) | `ResolveSiteIdentifierAsync` silently substitutes the DB id when the site row is missing | @@ -162,14 +149,11 @@ _None open._ | DeploymentManager-023 | [DeploymentManager](DeploymentManager/findings.md) | `BuildDeployArtifactsCommandAsync` re-queries system-wide artifacts once per site | | DeploymentManager-024 | [DeploymentManager](DeploymentManager/findings.md) | Test probe actors hold mutable static state across tests | | ExternalSystemGateway-021 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `ApplyAuth` silently sends an unauthenticated request on unknown `AuthType`, empty `AuthConfiguration`, or malformed Basic config | -| HealthMonitoring-018 | [HealthMonitoring](HealthMonitoring/findings.md) | Same counter-reset-before-publish hazard in `CentralHealthReportLoop` | | HealthMonitoring-021 | [HealthMonitoring](HealthMonitoring/findings.md) | `CentralSiteId = "central"` reserved constant silently collides with a real site named "central" | | HealthMonitoring-022 | [HealthMonitoring](HealthMonitoring/findings.md) | `CentralHealthReportLoopTests` uses real-time `PeriodicTimer` + `Task.Delay`; flake-prone on slow CI | | Host-018 | [Host](Host/findings.md) | Shipped per-role configs omit `NodeOptions.NodeName`, leaving `SourceNode` null | -| Host-019 | [Host](Host/findings.md) | Migration `StartupRetry` call drops the host `CancellationToken` | | Host-020 | [Host](Host/findings.md) | `MinimumLevel.Is` silently overrides any operator-set `Serilog:MinimumLevel` | | Host-021 | [Host](Host/findings.md) | Microsoft `Logging:LogLevel` section in `appsettings.json` is dead config under Serilog | -| Host-022 | [Host](Host/findings.md) | `ParseLevel` silently coerces unrecognised `MinimumLevel` to `Information` | | InboundAPI-019 | [InboundAPI](InboundAPI/findings.md) | `EnableBuffering()` called unconditionally on every request, including bodyless requests | | InboundAPI-023 | [InboundAPI](InboundAPI/findings.md) | `EndpointExtensions.HandleInboundApiRequest` composition wiring has no test coverage | | ManagementService-023 | [ManagementService](ManagementService/findings.md) | HandleQueryDeployments unfiltered branch is N+1 on instance lookup | @@ -177,7 +161,6 @@ _None open._ | NotificationOutbox-008 | [NotificationOutbox](NotificationOutbox/findings.md) | `FallbackMaxRetries` / `FallbackRetryDelay` path is unreachable in production AND untested | | NotificationService-022 | [NotificationService](NotificationService/findings.md) | `MailKitSmtpClientWrapper` holds a long-lived `SmtpClient`; combined with per-send factory, the design comment about pooling is contradicted | | NotificationService-025 | [NotificationService](NotificationService/findings.md) | `CredentialRedactor` over-masks: any 4-character credential component is masked anywhere it appears, including unrelated log text | -| Security-020 | [Security](Security/findings.md) | `SecurityOptions` has no startup validation for required fields (`LdapServer`, `LdapSearchBase`) | | Security-021 | [Security](Security/findings.md) | `RequireHttpsCookie=false` dev opt-out has no warning path — an HTTP production deployment silently transmits the JWT bearer credential in cleartext | | SiteCallAudit-002 | [SiteCallAudit](SiteCallAudit/findings.md) | Singleton failover does not wait for in-flight async upserts | | SiteCallAudit-006 | [SiteCallAudit](SiteCallAudit/findings.md) | Stuck-only paging test does not exercise the multi-page boundary with an interleaved non-stuck row at the cursor | @@ -186,7 +169,6 @@ _None open._ | SiteEventLogging-023 | [SiteEventLogging](SiteEventLogging/findings.md) | Concurrent-stress test uses a non-volatile `stop` flag | | StoreAndForward-022 | [StoreAndForward](StoreAndForward/findings.md) | `NotifyCachedCallObserverAsync` silently drops the entire audit lifecycle when the message id is not a parseable `TrackedOperationId` | | StoreAndForward-023 | [StoreAndForward](StoreAndForward/findings.md) | `siteId` silently defaults to empty when no `IStoreAndForwardSiteContext` is registered, degrading audit telemetry correlation | -| StoreAndForward-024 | [StoreAndForward](StoreAndForward/findings.md) | `StopAsync` does not wait for an in-flight retry sweep, so disposed dependencies can be touched after shutdown | | Transport-008 | [Transport](Transport/findings.md) | `PreviewAsync` issues an N+1 `GetTemplateWithChildrenAsync` per matching template name | | Transport-009 | [Transport](Transport/findings.md) | `IAuditCorrelationContext.BundleImportId` is mutated on the same scoped instance the AuditService reads | | Transport-012 | [Transport](Transport/findings.md) | "Bundle Import" filter promised in design doc not surfaced in Configuration Audit Log Viewer UI | diff --git a/code-reviews/Security/findings.md b/code-reviews/Security/findings.md index 62f4374c..1428ff56 100644 --- a/code-reviews/Security/findings.md +++ b/code-reviews/Security/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 2 (Security-020, Security-021); 1 deferred (Security-008) | +| Open findings | 1 (Security-021); 1 deferred (Security-008) | ## Summary @@ -925,9 +925,23 @@ is the closest meaningful unit-level coverage. |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Security/SecurityOptions.cs:6-7`, `:36-37`; `src/ScadaLink.Security/ServiceCollectionExtensions.cs:13-30` | +**Resolution (2026-05-28):** added `SecurityOptionsValidator` +(`IValidateOptions`) that rejects empty/whitespace +`LdapServer` and `LdapSearchBase` with messages naming the full +`Security:Field` key the operator would edit. `AddSecurity` registers it via +`services.AddOptions().ValidateOnStart()` + +`TryAddEnumerable(... SecurityOptionsValidator)` so a misconfigured +`Security` section fails fast at boot rather than minutes later on the first +login. `JwtSigningKey` is deliberately left to `JwtTokenService`'s existing +length-aware constructor guard (Security-003). Regression tests in +`SecurityOptionsValidatorTests`: valid-options succeed; empty/whitespace +`LdapServer` and `LdapSearchBase` each fail with the key-naming message +(theory); both-empty reports both keys; `AddSecurity_RegistersSecurityOptionsValidator` +pins the DI wiring. + **Description** `SecurityOptions.JwtSigningKey` correctly fails fast at `JwtTokenService` construction diff --git a/code-reviews/StoreAndForward/findings.md b/code-reviews/StoreAndForward/findings.md index 413ac20e..2d97aedb 100644 --- a/code-reviews/StoreAndForward/findings.md +++ b/code-reviews/StoreAndForward/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 7 (3 Deferred: 002, 011, 012; 7 new Open: 018–024 — see Re-review 2026-05-28) | +| Open findings | 5 (3 Deferred: 002, 011, 012; 5 new Open from Re-review 2026-05-28) | ## Summary @@ -1393,9 +1393,22 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:122`–`:127`, `:136`–`:143`, `:303`–`:329` | +**Resolution (2026-05-28):** the timer callback now captures the sweep task +into a `_sweepTask` field via `Volatile.Write`, and `StopAsync` disposes the +timer first (so no new sweep starts) then `await`s the captured task with a +bounded `SweepShutdownWaitTimeout` (10 s) via `Task.WaitAsync` — so a sweep +in-flight when shutdown begins is given a chance to finish before the host +disposes `_storage`/`_replication`. A genuinely hung sweep cannot block +shutdown indefinitely (the timeout fires, the wait is abandoned, the +warning is logged). Regression test +`StopAsync_AwaitsInFlightRetrySweep_BeforeReturning` parks a sweep inside a +blocking handler, asserts `StopAsync`'s returned task is not completed while +the sweep is paused, then releases the handler and asserts the sweep ran to +completion before `StopAsync` returned. + **Description** `StartAsync` arms `_retryTimer` with `_ => _ = RetryPendingMessagesAsync()` (line 123). diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs index 59f868d8..684a4153 100644 --- a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs +++ b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs @@ -96,6 +96,27 @@ public class SiteAuditReconciliationActor : ReceiveActor /// private readonly Dictionary _stalled = new(); + /// + /// AuditLog-004: per-EventId retry counter for rows whose central insert + /// threw. While a row keeps failing AND is below + /// , the cursor is held back so the + /// next reconciliation tick re-pulls and retries the row. Crossing the + /// threshold logs Critical and permanently abandons the row (cursor + /// advances past it) so a truly broken row cannot block all subsequent + /// progress for a site. The counter is in-memory only — singleton restart + /// resets it, which is safe because the cursor also resets on restart and + /// the next tick re-pulls everything. + /// + private readonly Dictionary _failedInsertAttempts = new(); + + /// + /// AuditLog-004: number of consecutive central-insert failures before a row + /// is permanently abandoned with a Critical log entry and the cursor is + /// allowed to advance past it. Five attempts at the 5-minute default tick + /// is ~25 min of retry budget before a stuck row stops blocking progress. + /// + private const int MaxPermanentInsertAttempts = 5; + private ICancelable? _timer; /// @@ -232,9 +253,11 @@ public class SiteAuditReconciliationActor : ReceiveActor .ConfigureAwait(false); var maxOccurred = since; + var hasUnresolvedFailure = false; var nowUtc = DateTime.UtcNow; foreach (var evt in response.Events) { + var advanceForThisRow = false; try { // Idempotent repository write: duplicate EventIds (from a @@ -243,29 +266,58 @@ public class SiteAuditReconciliationActor : ReceiveActor // InsertIfNotExistsAsync. var ingested = evt with { IngestedAtUtc = nowUtc }; await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false); + _failedInsertAttempts.Remove(evt.EventId); + advanceForThisRow = true; } catch (Exception ex) { - // Per-row catch so one bad event does not abandon the rest of - // the batch. The cursor still advances based on OccurredAtUtc - // — the row was returned by the site, so the next tick won't - // re-fetch it; if it permanently fails to persist, that's an - // operational concern surfaced by the log, not a hot-loop - // trigger. - _logger.LogError( - ex, - "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId}.", - evt.EventId, - site.SiteId); + // AuditLog-004: per-row catch so one bad event does not abandon + // the rest of the batch. Track the failure count per EventId — + // below MaxPermanentInsertAttempts the cursor is HELD BACK so + // the next tick re-pulls and retries; at the threshold the row + // is permanently abandoned (LogCritical + cursor advances past) + // to keep a truly broken row from blocking all subsequent + // progress for the site. + var attempts = _failedInsertAttempts.GetValueOrDefault(evt.EventId) + 1; + _failedInsertAttempts[evt.EventId] = attempts; + + if (attempts >= MaxPermanentInsertAttempts) + { + _logger.LogCritical( + ex, + "Permanently abandoning AuditEvent {EventId} from site {SiteId} after {Attempts} consecutive insert failures; cursor will advance past it.", + evt.EventId, + site.SiteId, + attempts); + _failedInsertAttempts.Remove(evt.EventId); + advanceForThisRow = true; + } + else + { + _logger.LogError( + ex, + "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId} (attempt {Attempts}/{Max}); cursor held back for retry.", + evt.EventId, + site.SiteId, + attempts, + MaxPermanentInsertAttempts); + hasUnresolvedFailure = true; + } } - if (evt.OccurredAtUtc > maxOccurred) + if (advanceForThisRow && evt.OccurredAtUtc > maxOccurred) { maxOccurred = evt.OccurredAtUtc; } } - _cursors[site.SiteId] = maxOccurred; + // AuditLog-004: only advance the persisted cursor if no event in this + // batch is still being retried. Leaving the cursor at `since` re-pulls + // the whole batch next tick — successful rows are no-ops thanks to + // InsertIfNotExistsAsync's idempotency, and the failing row gets + // another attempt. Once it succeeds (or hits the permanent-abandon + // threshold) the cursor unblocks naturally. + _cursors[site.SiteId] = hasUnresolvedFailure ? since : maxOccurred; var nonDraining = response.MoreAvailable && response.Events.Count > 0; UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream); diff --git a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs index a508e31d..7736cc83 100644 --- a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs +++ b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs @@ -693,10 +693,26 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable }; } - /// Disposes the audit writer and releases resources. + /// + /// Disposes the audit writer and releases resources. + /// + /// + /// AuditLog-006: prefer when possible (DI honours + /// on singletons). The sync path remains for + /// callers that only know about (e.g. legacy + /// composition roots, using statements without await). To + /// avoid the classic sync-over-async deadlock on a captured + /// (ASP.NET request thread, Akka + /// dispatcher under some configurations), we hop to the thread pool via + /// before blocking on the result — the + /// async continuation inside then resumes on a + /// pool thread with no captured context, so GetResult() never waits + /// on the very thread the continuation needs. + /// public void Dispose() { - DisposeAsync().AsTask().GetAwaiter().GetResult(); + Task.Run(async () => await DisposeAsync().ConfigureAwait(false)) + .GetAwaiter().GetResult(); } /// Asynchronously disposes the audit writer and releases resources. diff --git a/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs b/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs index 68cc3cd7..3b10c8cf 100644 --- a/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs +++ b/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs @@ -42,6 +42,12 @@ public class SiteAuditTelemetryActor : ReceiveActor private readonly SiteAuditTelemetryOptions _options; private readonly ILogger _logger; private ICancelable? _pendingTick; + // AuditLog-010: per-actor lifecycle CTS so an in-flight drain (queue read, + // gRPC push, mark-forwarded write) is actually cancelled when the actor is + // stopped — without it, a stuck IngestAuditEventsAsync would hold the + // continuation through CoordinatedShutdown's actor-system terminate window. + // Cancelled in PostStop; never reset (the actor is single-lifetime). + private readonly CancellationTokenSource _lifecycleCts = new(); /// Initializes the actor with its drain queue, gRPC client, options, and logger. /// The site-local SQLite audit queue to drain. @@ -81,15 +87,32 @@ public class SiteAuditTelemetryActor : ReceiveActor protected override void PostStop() { _pendingTick?.Cancel(); + // AuditLog-010: cancel any in-flight drain so a stuck queue read or + // gRPC push does not hold the continuation past actor stop. + try + { + _lifecycleCts.Cancel(); + } + catch (ObjectDisposedException) + { + // PostStop may run after a prior Dispose path — benign. + } + _lifecycleCts.Dispose(); base.PostStop(); } private async Task OnDrainAsync() { var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds); + // AuditLog-010: route every async dependency call through the + // per-actor lifecycle token so PostStop cancellation actually + // propagates into the queue read, the gRPC push, and the + // mark-forwarded write. OperationCanceledException is swallowed by + // the catch-all below. + var ct = _lifecycleCts.Token; try { - var pending = await _queue.ReadPendingAsync(_options.BatchSize, CancellationToken.None) + var pending = await _queue.ReadPendingAsync(_options.BatchSize, ct) .ConfigureAwait(false); if (pending.Count == 0) { @@ -104,7 +127,7 @@ public class SiteAuditTelemetryActor : ReceiveActor IngestAck ack; try { - ack = await _client.IngestAuditEventsAsync(batch, CancellationToken.None) + ack = await _client.IngestAuditEventsAsync(batch, ct) .ConfigureAwait(false); } catch (Exception ex) @@ -121,7 +144,7 @@ public class SiteAuditTelemetryActor : ReceiveActor var acceptedIds = ParseAcceptedIds(ack); if (acceptedIds.Count > 0) { - await _queue.MarkForwardedAsync(acceptedIds, CancellationToken.None) + await _queue.MarkForwardedAsync(acceptedIds, ct) .ConfigureAwait(false); } } @@ -133,7 +156,13 @@ public class SiteAuditTelemetryActor : ReceiveActor } finally { - ScheduleNext(nextDelay); + // AuditLog-010: if the actor is already shutting down, do not + // arm another tick — the scheduler would fire after PostStop and + // the message would land in dead letters. + if (!_lifecycleCts.IsCancellationRequested) + { + ScheduleNext(nextDelay); + } } } diff --git a/src/ScadaLink.CLI/CliConfig.cs b/src/ScadaLink.CLI/CliConfig.cs index dd59e585..d9fdfb15 100644 --- a/src/ScadaLink.CLI/CliConfig.cs +++ b/src/ScadaLink.CLI/CliConfig.cs @@ -40,15 +40,31 @@ public class CliConfig ".scadalink", "config.json"); if (File.Exists(configPath)) { - var json = File.ReadAllText(configPath); - var fileConfig = JsonSerializer.Deserialize(json, - new JsonSerializerOptions { PropertyNameCaseInsensitive = true }); - if (fileConfig != null) + // CLI-021: a malformed (`JsonException`), unreadable + // (`UnauthorizedAccessException`), or otherwise faulted + // (`IOException`) config file must not crash the CLI before any + // command runs — even a command that supplies everything via + // --url/--username/--password/--format on the command line still + // calls Load() and would otherwise inherit the fault. Warn once on + // stderr and fall through to the env-var + command-line precedence + // chain with default settings. + try { - if (!string.IsNullOrEmpty(fileConfig.ManagementUrl)) - config.ManagementUrl = fileConfig.ManagementUrl; - if (!string.IsNullOrEmpty(fileConfig.DefaultFormat)) - config.DefaultFormat = fileConfig.DefaultFormat; + var json = File.ReadAllText(configPath); + var fileConfig = JsonSerializer.Deserialize(json, + new JsonSerializerOptions { PropertyNameCaseInsensitive = true }); + if (fileConfig != null) + { + if (!string.IsNullOrEmpty(fileConfig.ManagementUrl)) + config.ManagementUrl = fileConfig.ManagementUrl; + if (!string.IsNullOrEmpty(fileConfig.DefaultFormat)) + config.DefaultFormat = fileConfig.DefaultFormat; + } + } + catch (Exception ex) when (ex is JsonException || ex is IOException || ex is UnauthorizedAccessException) + { + Console.Error.WriteLine( + $"warning: ignoring malformed or unreadable {configPath}: {ex.Message}"); } } diff --git a/src/ScadaLink.CLI/Commands/BundleCommands.cs b/src/ScadaLink.CLI/Commands/BundleCommands.cs index eb2002e8..e13512d6 100644 --- a/src/ScadaLink.CLI/Commands/BundleCommands.cs +++ b/src/ScadaLink.CLI/Commands/BundleCommands.cs @@ -112,9 +112,11 @@ public static class BundleCommands Passphrase: passphrase, SourceEnvironment: sourceEnv); - return await RunBundleCommandAsync( - result, urlOption, usernameOption, passwordOption, - payload, jsonOk => + return await CommandHelpers.ExecuteCommandAsync( + result, urlOption, formatOption, usernameOption, passwordOption, + payload, + timeout: BundleCommandTimeout, + onSuccess: jsonOk => { using var doc = JsonDocument.Parse(jsonOk); var base64 = doc.RootElement.GetProperty("base64Bundle").GetString()!; @@ -165,9 +167,11 @@ public static class BundleCommands Base64Bundle: Convert.ToBase64String(bytes), Passphrase: result.GetValue(passphraseOption)); - return await RunBundleCommandAsync( - result, urlOption, usernameOption, passwordOption, - payload, jsonOk => + return await CommandHelpers.ExecuteCommandAsync( + result, urlOption, formatOption, usernameOption, passwordOption, + payload, + timeout: BundleCommandTimeout, + onSuccess: jsonOk => { Console.WriteLine(jsonOk); return 0; @@ -220,9 +224,11 @@ public static class BundleCommands Passphrase: result.GetValue(passphraseOption), DefaultConflictPolicy: result.GetValue(onConflictOption)!); - return await RunBundleCommandAsync( - result, urlOption, usernameOption, passwordOption, - payload, jsonOk => + return await CommandHelpers.ExecuteCommandAsync( + result, urlOption, formatOption, usernameOption, passwordOption, + payload, + timeout: BundleCommandTimeout, + onSuccess: jsonOk => { Console.WriteLine(jsonOk); return 0; @@ -234,59 +240,15 @@ public static class BundleCommands // ==================================================================== // Shared HTTP plumbing // ==================================================================== - - /// - /// Same shape as but with - /// a longer per-command timeout (bundles are big) and a caller-supplied - /// success handler so export can capture the base64 payload into a file - /// rather than print the whole envelope to stdout. - /// - private static async Task RunBundleCommandAsync( - ParseResult result, - Option urlOption, - Option usernameOption, - Option passwordOption, - object payload, - Func onSuccess) - { - var config = CliConfig.Load(); - var url = result.GetValue(urlOption); - if (string.IsNullOrWhiteSpace(url)) url = config.ManagementUrl; - if (string.IsNullOrWhiteSpace(url)) - { - OutputFormatter.WriteError( - "No management URL specified. Use --url or set 'managementUrl' in ~/.scadalink/config.json.", - "NO_URL"); - return 1; - } - if (!CommandHelpers.IsValidManagementUrl(url)) - { - OutputFormatter.WriteError( - $"Invalid management URL '{url}'.", "INVALID_URL"); - return 1; - } - var username = CommandHelpers.ResolveCredential(result.GetValue(usernameOption), config.Username); - var password = CommandHelpers.ResolveCredential(result.GetValue(passwordOption), config.Password); - if (string.IsNullOrWhiteSpace(username) || string.IsNullOrWhiteSpace(password)) - { - OutputFormatter.WriteError( - "Credentials required. Use --username/--password or SCADALINK_USERNAME/SCADALINK_PASSWORD.", - "NO_CREDENTIALS"); - return 1; - } - - var commandName = ManagementCommandRegistry.GetCommandName(payload.GetType()); - using var client = new ManagementHttpClient(url, username, password); - var response = await client.SendCommandAsync(commandName, payload, BundleCommandTimeout); - - if (response.JsonData is not null) - { - return onSuccess(response.JsonData); - } - OutputFormatter.WriteError(response.Error ?? "Unknown error", response.ErrorCode ?? "ERROR"); - if (response.StatusCode == 403) return 2; - return 1; - } + // + // CLI-017: bundle commands previously routed through a private + // RunBundleCommandAsync that re-implemented URL/credential resolution and + // skipped the IsAuthorizationFailure(...) check that ExecuteCommandAsync + // enforces — a server that signalled FORBIDDEN/UNAUTHORIZED via the error + // code on a non-403 status would exit 1 instead of the documented exit 2. + // The bundle path now delegates to CommandHelpers.ExecuteCommandAsync with + // the longer BundleCommandTimeout and a per-command success handler, so the + // exit-code contract is unified across every command group. private static Option?> NameListOption(string name, string description) { diff --git a/src/ScadaLink.CLI/Commands/CommandHelpers.cs b/src/ScadaLink.CLI/Commands/CommandHelpers.cs index 6c056623..92ab12ea 100644 --- a/src/ScadaLink.CLI/Commands/CommandHelpers.cs +++ b/src/ScadaLink.CLI/Commands/CommandHelpers.cs @@ -17,13 +17,28 @@ internal static class CommandHelpers /// Option that supplies the username override. /// Option that supplies the password override. /// The management command object to send. + /// + /// Optional per-command HTTP timeout. Defaults to 30s, matching the management API's + /// own request timeout. Larger payloads (e.g. Transport bundles) should supply a + /// longer value. + /// + /// + /// Optional success handler. When supplied, the helper invokes it with the success + /// body instead of running the default rendering path — + /// useful when the caller needs to capture the response (e.g. write a file) rather + /// than print it. The authorization-failure exit-code contract + /// () is preserved on the error path either way, + /// closing CLI-017's regression. + /// internal static async Task ExecuteCommandAsync( ParseResult result, Option urlOption, Option formatOption, Option usernameOption, Option passwordOption, - object command) + object command, + TimeSpan? timeout = null, + Func? onSuccess = null) { var config = CliConfig.Load(); var format = ResolveFormat(result, formatOption, config); @@ -67,7 +82,20 @@ internal static class CommandHelpers // Send via HTTP using var client = new ManagementHttpClient(url, username, password); - var response = await client.SendCommandAsync(commandName, command, TimeSpan.FromSeconds(30)); + var response = await client.SendCommandAsync(commandName, command, timeout ?? TimeSpan.FromSeconds(30)); + + // Caller-supplied success handler short-circuits the default rendering — but + // the error path still routes through IsAuthorizationFailure so the documented + // exit-2 contract holds whether or not a custom handler is provided + // (CLI-017 unification of the bundle path). + if (onSuccess is not null) + { + if (response.JsonData is not null) + return onSuccess(response.JsonData); + + OutputFormatter.WriteError(response.Error ?? "Unknown error", response.ErrorCode ?? "ERROR"); + return IsAuthorizationFailure(response) ? 2 : 1; + } return HandleResponse(response, format); } diff --git a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs index 68bb1a66..0e061f1a 100644 --- a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs +++ b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs @@ -75,6 +75,14 @@ public class CentralCommunicationActor : ReceiveActor private ICancelable? _refreshSchedule; + /// + /// Communication-019: per-actor lifecycle CTS threaded into the periodic + /// repository call so a hung MS SQL + /// connection is bounded by actor shutdown rather than holding piped tasks + /// open indefinitely. Cancelled in ; never reset. + /// + private readonly CancellationTokenSource _lifecycleCts = new(); + /// /// Proxy for the central NotificationOutboxActor cluster singleton. /// Set via — the Host creates the singleton proxy @@ -358,11 +366,26 @@ public class CentralCommunicationActor : ReceiveActor private void LoadSiteAddressesFromDb() { var self = Self; + // Communication-019: pass the actor's lifecycle CT into the repository + // call so a hung database query is cancelled when the actor stops + // rather than leaving the piped task to accumulate. Captured locally + // because the lifecycle CTS may have been disposed by PostStop on a + // racing late tick; treat that as "actor gone, give up". + CancellationToken ct; + try + { + ct = _lifecycleCts.Token; + } + catch (ObjectDisposedException) + { + return; + } + Task.Run(async () => { using var scope = _serviceProvider.CreateScope(); var repo = scope.ServiceProvider.GetRequiredService(); - var sites = await repo.GetAllSitesAsync(); + var sites = await repo.GetAllSitesAsync(ct).ConfigureAwait(false); var contacts = new Dictionary>(); foreach (var site in sites) @@ -495,6 +518,17 @@ public class CentralCommunicationActor : ReceiveActor { _log.Info("CentralCommunicationActor stopped"); _refreshSchedule?.Cancel(); + // Communication-019: cancel any in-flight LoadSiteAddressesFromDb so a + // hung MS SQL query does not outlive the actor. + try + { + _lifecycleCts.Cancel(); + } + catch (ObjectDisposedException) + { + // Double-stop is benign. + } + _lifecycleCts.Dispose(); } } diff --git a/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs b/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs index e453f2bc..03157286 100644 --- a/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs +++ b/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs @@ -235,7 +235,30 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase Props.Create(typeof(Actors.StreamRelayActor), request.CorrelationId, channel.Writer), $"stream-relay-{request.CorrelationId}-{actorSeq}"); - var subscriptionId = _streamSubscriber.Subscribe(request.InstanceUniqueName, relayActor); + // Communication-021: the previous code called _streamSubscriber.Subscribe + // OUTSIDE the try block that owns relay-actor cleanup. If Subscribe threw + // (stale instance name, index lookup fault, site runtime shutting down), + // the freshly-created relay actor, the _activeStreams entry, the + // StreamEntry.Cts, and the Channel all leaked because the + // finally never ran. Wrap Subscribe in its own try so any throw deterministically + // stops the relay actor, removes the activeStreams entry, and completes the + // channel before the RpcException escapes to the caller. + string subscriptionId; + try + { + subscriptionId = _streamSubscriber.Subscribe(request.InstanceUniqueName, relayActor); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "Subscribe failed for {Instance} (correlation {CorrelationId}); cleaning up relay actor.", + request.InstanceUniqueName, request.CorrelationId); + _actorSystem!.Stop(relayActor); + channel.Writer.TryComplete(); + _activeStreams.TryRemove( + new KeyValuePair(request.CorrelationId, entry)); + throw; + } _logger.LogInformation( "Stream {CorrelationId} started for {Instance} (subscription {SubscriptionId})", diff --git a/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs b/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs index 0635975f..f3754c46 100644 --- a/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs +++ b/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs @@ -1,5 +1,4 @@ using System.Globalization; -using Microsoft.Data.SqlClient; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; @@ -178,22 +177,20 @@ WHERE pf.name = 'pf_AuditLog_Month';"; ALTER PARTITION SCHEME {PartitionSchemeName} NEXT USED [{TargetFileGroup}]; ALTER PARTITION FUNCTION {PartitionFunctionName}() SPLIT RANGE ('{literal}');"; - try - { - await _context.Database.ExecuteSqlRawAsync(sql, ct).ConfigureAwait(false); - added.Add(next); - } - catch (SqlException ex) - { - // Belt-and-braces: even though we read max-boundary first, an - // ALTER from another process could have raced us. Logging at - // Warning rather than Error because the desired end state - // (boundary present) is satisfied by either path. - _logger.LogWarning( - ex, - "EnsureLookaheadAsync: SPLIT RANGE for boundary {Boundary:o} failed; continuing.", - next); - } + // ConfigDB-019: the loop pre-reads max-boundary and only issues + // SPLITs for strictly-greater months, so msg 7708/7711 ("boundary + // already exists") cannot happen by construction. Any OTHER + // SqlException (permission revoked on the role, deadlock victim, + // log full, filegroup full, transient connection drop) means the + // boundary genuinely failed to create. The previous catch-and- + // continue silently moved on to the next month, splitting month + // N+1 successfully and leaving a permanent partition hole for + // month N that blocks partition-switch purge until an operator + // notices and rebuilds. Let SqlException propagate so the daily + // hosted-service tick logs an Error and the next tick retries + // from the same boundary (at-least-once, no holes). + await _context.Database.ExecuteSqlRawAsync(sql, ct).ConfigureAwait(false); + added.Add(next); next = NextMonthBoundary(next); } diff --git a/src/ScadaLink.DeploymentManager/DeploymentService.cs b/src/ScadaLink.DeploymentManager/DeploymentService.cs index e7aeac06..7cb07ee8 100644 --- a/src/ScadaLink.DeploymentManager/DeploymentService.cs +++ b/src/ScadaLink.DeploymentManager/DeploymentService.cs @@ -334,6 +334,17 @@ public class DeploymentService } catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) { + // DeploymentManager-019: a lifecycle command timeout produced no + // audit row pre-fix — the operator saw a timeout in the UI but + // the audit trail showed nothing happened, contrary to the + // design's "audit logging for all instance lifecycle changes" + // rule. Mirror the DeployFailed pattern: write a "TimedOut" + // entry with CancellationToken.None so a cancelled outer token + // (the typical reason this catch ran) cannot prevent the + // durable audit write. + await TryLogLifecycleTimeoutAsync( + user, "DisableTimedOut", instanceId, instance.UniqueName, commandId, ex); + _logger.LogWarning(ex, "Disable of instance {Instance} timed out", instance.UniqueName); return Result.Failure( $"Disable failed: the site did not respond within {_options.LifecycleCommandTimeout}."); @@ -391,6 +402,12 @@ public class DeploymentService } catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) { + // DeploymentManager-019: emit an audit entry on lifecycle timeout + // so the operator's attempted Enable is recorded; see the matching + // comment in DisableInstanceAsync for the full rationale. + await TryLogLifecycleTimeoutAsync( + user, "EnableTimedOut", instanceId, instance.UniqueName, commandId, ex); + _logger.LogWarning(ex, "Enable of instance {Instance} timed out", instance.UniqueName); return Result.Failure( $"Enable failed: the site did not respond within {_options.LifecycleCommandTimeout}."); @@ -453,6 +470,12 @@ public class DeploymentService } catch (Exception ex) when (ex is TimeoutException or OperationCanceledException) { + // DeploymentManager-019: emit an audit entry on lifecycle timeout + // so the operator's attempted Delete is recorded; see the matching + // comment in DisableInstanceAsync for the full rationale. + await TryLogLifecycleTimeoutAsync( + user, "DeleteTimedOut", instanceId, instance.UniqueName, commandId, ex); + _logger.LogWarning(ex, "Delete of instance {Instance} timed out", instance.UniqueName); return Result.Failure( $"Delete failed: the site did not respond within {_options.LifecycleCommandTimeout}."); @@ -794,6 +817,67 @@ public class DeploymentService } } + /// + /// DeploymentManager-019: write a "<Action>TimedOut" audit entry on + /// behalf of a lifecycle command (Disable / Enable / Delete) whose site + /// round-trip exceeded . + /// + /// + /// Mirrors the DeployFailed pattern in + /// : the audit write uses + /// so the operator's outer cancellation + /// (the usual reason this path runs) cannot also prevent the audit row from + /// being persisted. The detail object carries the lifecycle command id, the + /// timeout that fired, and the original exception message so an operator can + /// correlate the audit entry with the UI-surfaced timeout error. + /// + /// + /// + /// Wrapped in try/catch — a failed audit write must NOT mask the underlying + /// timeout from the caller; it is logged at Warning so the operator can + /// reconcile but never thrown. + /// + /// + /// The username who initiated the lifecycle command. + /// The audit action name (DisableTimedOut, EnableTimedOut, or DeleteTimedOut). + /// The numeric instance id, recorded on the audit row. + /// The instance unique name used as the audit target name. + /// The lifecycle command's correlation id, so the audit entry can be matched to logs. + /// The captured or . + private async Task TryLogLifecycleTimeoutAsync( + string user, + string action, + int instanceId, + string instanceUniqueName, + string commandId, + Exception timeoutException) + { + try + { + await _auditService.LogAsync( + user, + action, + "Instance", + instanceId.ToString(), + instanceUniqueName, + new + { + CommandId = commandId, + Deadline = _options.LifecycleCommandTimeout, + Error = timeoutException.Message, + }, + CancellationToken.None); + } + catch (Exception auditEx) + { + // A failed audit write must not bury the timeout for the caller — + // just log so an operator can investigate the audit-pipeline issue. + _logger.LogWarning(auditEx, + "Failed to write {Action} audit entry for instance {Instance} (commandId={CommandId})", + action, instanceUniqueName, commandId); + } + } + private async Task StoreDeployedSnapshotAsync( int instanceId, string deploymentId, diff --git a/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs b/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs index 2c680f86..a1d2579c 100644 --- a/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs +++ b/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs @@ -256,6 +256,24 @@ public class ExternalSystemClient : IExternalSystemClient var client = _httpClientFactory.CreateClient($"ExternalSystem_{system.Name}"); + // ExternalSystemGateway-019: HttpClient.Timeout defaults to 100 seconds + // and is enforced internally by SendAsync via its own private CTS — a + // TaskCanceledException raised by that internal CTS does not trip + // either the caller's token or the gateway's timeout CTS, so it falls + // through the ordered catch filters below into the generic "connection + // error" branch and is misclassified. Any operator-configured + // DefaultHttpTimeout greater than 100 s would therefore be silently + // clipped to 100 s, breaking the design's "timeout applies to the HTTP + // request round-trip" guarantee. Disable the framework default so the + // linked CancellationTokenSource(DefaultHttpTimeout) below is the sole + // timeout source — DefaultHttpTimeout is then honoured verbatim for + // every value, including ones well above 100 s. Setting this on the + // factory-supplied HttpClient before any request is the safe time: + // IHttpClientFactory rents typed clients backed by pooled message + // handlers, but the HttpClient instance itself is per-call and the + // Timeout property is per-instance. + client.Timeout = Timeout.InfiniteTimeSpan; + var url = BuildUrl(system.EndpointUrl, method.Path, parameters, method.HttpMethod); // The request and response own IDisposable resources (StringContent, the diff --git a/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs b/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs index c664e516..d6e52b50 100644 --- a/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs +++ b/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs @@ -85,10 +85,39 @@ public class CentralHealthReportLoop : BackgroundService _collector.SetClusterNodes(_clusterNodeProvider.GetClusterNodes()); var seq = Interlocked.Increment(ref _sequenceNumber); + + // HealthMonitoring-018: CollectReport atomically read-and-resets + // the per-interval error counters via Interlocked.Exchange. If + // ProcessReport throws (or any other failure occurs between the + // collect and the publish), those counts would otherwise be + // lost — neither in the un-published report nor in the + // now-zeroed collector. Snapshot the freshly-collected report + // so that on a publish failure we can atomically restore the + // counts back into the shared SiteHealthCollector via + // Interlocked.Add. Concurrent increments arriving during the + // ProcessReport call are preserved on the counter; the restore + // Add safely sums with any such concurrent increments. Same + // shape as the HealthMonitoring-017 fix in HealthReportSender. var report = _collector.CollectReport(CentralSiteId); var reportWithSeq = report with { SequenceNumber = seq }; - _aggregator.ProcessReport(reportWithSeq); + try + { + _aggregator.ProcessReport(reportWithSeq); + } + catch + { + // Restore the captured per-interval counters atomically so + // they roll forward into the next report — see + // HealthMonitoring-018. + _collector.AddIntervalCounters( + scriptErrors: report.ScriptErrorCount, + alarmErrors: report.AlarmEvaluationErrorCount, + deadLetters: report.DeadLetterCount, + siteAuditWriteFailures: report.SiteAuditWriteFailures, + auditRedactionFailures: report.AuditRedactionFailure); + throw; + } _logger.LogDebug("Generated central health report #{Seq}", seq); } diff --git a/src/ScadaLink.HealthMonitoring/HealthReportSender.cs b/src/ScadaLink.HealthMonitoring/HealthReportSender.cs index 8e86dc72..59c5479c 100644 --- a/src/ScadaLink.HealthMonitoring/HealthReportSender.cs +++ b/src/ScadaLink.HealthMonitoring/HealthReportSender.cs @@ -138,12 +138,42 @@ public class HealthReportSender : BackgroundService } var seq = Interlocked.Increment(ref _sequenceNumber); + + // HealthMonitoring-017: CollectReport atomically read-and-resets + // the per-interval error counters via Interlocked.Exchange. If + // the Send below throws, those counts are otherwise lost + // forever — neither in the un-sent report nor in the now-zeroed + // collector. Snapshot the freshly-collected report so that on a + // transport failure we can atomically restore the counts back + // into the collector via Interlocked.Add, so the next + // successful report includes them. Concurrent increments + // arriving during the Send are preserved on the counter (they + // accumulate against zero); the restore Add safely sums with + // any such concurrent increments. var report = _collector.CollectReport(_siteId); // Replace the placeholder sequence number with our monotonic one var reportWithSeq = report with { SequenceNumber = seq }; - _transport.Send(reportWithSeq); + try + { + _transport.Send(reportWithSeq); + } + catch + { + // Restore the captured per-interval counters atomically so + // they roll forward into the next report — see + // HealthMonitoring-017. Any concurrent increment that + // arrived during the failed Send remains on the counter; + // Interlocked.Add sums correctly with it. + _collector.AddIntervalCounters( + scriptErrors: report.ScriptErrorCount, + alarmErrors: report.AlarmEvaluationErrorCount, + deadLetters: report.DeadLetterCount, + siteAuditWriteFailures: report.SiteAuditWriteFailures, + auditRedactionFailures: report.AuditRedactionFailure); + throw; + } _logger.LogInformation("Sent health report #{Seq} for site {SiteId}", seq, _siteId); } diff --git a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs index 803483d0..7893e15a 100644 --- a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs @@ -140,4 +140,33 @@ public interface ISiteHealthCollector /// The site identifier. /// A health report for the specified site. SiteHealthReport CollectReport(string siteId); + + /// + /// HealthMonitoring-017: atomically add back the given per-interval error + /// counts into the collector's accumulators. Called by the report sender + /// when transport delivery of a freshly-collected report fails, so the + /// counts that already drained roll forward + /// into the next report rather than being silently lost. Concurrent + /// increments arriving between the failed Send and this restore are + /// preserved — Interlocked.Add sums correctly with them. The + /// default interface implementation is a no-op so existing test fakes + /// (the only implementations outside ) + /// continue to compile without per-fake updates; production callers see + /// the real behaviour via the concrete class. + /// + /// Script error count to add back. + /// Alarm evaluation error count to add back. + /// Dead letter count to add back. + /// Site audit write failure count to add back. + /// Audit redaction failure count to add back. + void AddIntervalCounters( + int scriptErrors, + int alarmErrors, + int deadLetters, + int siteAuditWriteFailures, + int auditRedactionFailures) + { + // Default no-op so test fakes do not need to be updated. The real + // SiteHealthCollector overrides this with the Interlocked.Add restore. + } } diff --git a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs index 85dbf330..cb7689a2 100644 --- a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs @@ -142,6 +142,27 @@ public class SiteHealthCollector : ISiteHealthCollector /// public bool IsActiveNode => _isActiveNode; + /// + public void AddIntervalCounters( + int scriptErrors, + int alarmErrors, + int deadLetters, + int siteAuditWriteFailures, + int auditRedactionFailures) + { + // HealthMonitoring-017: each counter is restored atomically via + // Interlocked.Add so an increment that arrived during the failed Send + // (and therefore accumulated against the zero left by CollectReport's + // Exchange) is correctly summed with the values being put back. No + // ordering between the five Adds is required — they target independent + // fields. + if (scriptErrors != 0) Interlocked.Add(ref _scriptErrorCount, scriptErrors); + if (alarmErrors != 0) Interlocked.Add(ref _alarmErrorCount, alarmErrors); + if (deadLetters != 0) Interlocked.Add(ref _deadLetterCount, deadLetters); + if (siteAuditWriteFailures != 0) Interlocked.Add(ref _siteAuditWriteFailures, siteAuditWriteFailures); + if (auditRedactionFailures != 0) Interlocked.Add(ref _auditRedactionFailures, auditRedactionFailures); + } + /// public SiteHealthReport CollectReport(string siteId) { diff --git a/src/ScadaLink.Host/LoggerConfigurationFactory.cs b/src/ScadaLink.Host/LoggerConfigurationFactory.cs index befc98e6..e0d5170f 100644 --- a/src/ScadaLink.Host/LoggerConfigurationFactory.cs +++ b/src/ScadaLink.Host/LoggerConfigurationFactory.cs @@ -46,11 +46,36 @@ public static class LoggerConfigurationFactory /// /// Parses a Serilog name, falling back to /// for null/blank/unrecognised values. + /// + /// Host-022: when an operator sets ScadaLink:Logging:MinimumLevel to a + /// value that doesn't parse (e.g. the typo "Informaiton"), the helper must NOT + /// throw — startup has to succeed so the rest of the system can come up — but + /// it MUST make the silent fallback visible. The logger is not yet built at + /// this point, so the warning is written directly to + /// using ; non-null/non-blank values that fail + /// to parse are reported once, naming the offending value and the fallback. + /// Null/blank values are treated as "unset" and silently default — only + /// explicit-but-invalid values trigger the warning. /// - private static LogEventLevel ParseLevel(string? level) + internal static LogEventLevel ParseLevel(string? level) + => ParseLevel(level, Console.Error); + + /// + /// Test-visible overload of that routes the + /// one-shot warning through a caller-supplied writer ( + /// in production) so unit tests can capture the warning output. + /// + /// Configured level string, possibly null/blank/invalid. + /// Writer that receives a single warning line if the value is non-blank but unparseable. + internal static LogEventLevel ParseLevel(string? level, TextWriter warningWriter) { - return Enum.TryParse(level, ignoreCase: true, out var parsed) - ? parsed - : LogEventLevel.Information; + if (Enum.TryParse(level, ignoreCase: true, out var parsed)) + return parsed; + + if (!string.IsNullOrWhiteSpace(level)) + warningWriter.WriteLine( + $"warning: ScadaLink:Logging:MinimumLevel value '{level}' is not a recognised Serilog LogEventLevel; falling back to Information."); + + return LogEventLevel.Information; } } diff --git a/src/ScadaLink.Host/Program.cs b/src/ScadaLink.Host/Program.cs index 03ca911c..deb02621 100644 --- a/src/ScadaLink.Host/Program.cs +++ b/src/ScadaLink.Host/Program.cs @@ -161,18 +161,23 @@ try // exponential backoff before failing fatally. // Host-015: only connection-class (transient) faults are retried — a // schema-version mismatch is permanent and must fail fast on attempt 1. + // Host-019: thread the host's ApplicationStopping token into both the + // migration call itself and the inter-attempt Task.Delay so a SIGTERM + // during the bounded-retry window (~2 min worst-case) tears down + // cleanly instead of being ignored until the loop exhausts. await StartupRetry.ExecuteWithRetryAsync( "database-migration", - async () => + async ct => { using var scope = app.Services.CreateScope(); var dbContext = scope.ServiceProvider.GetRequiredService(); - await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger); + await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger, ct); }, maxAttempts: 8, initialDelay: TimeSpan.FromSeconds(2), migrationLogger, - isTransient: StartupRetry.IsTransientDatabaseFault); + isTransient: StartupRetry.IsTransientDatabaseFault, + cancellationToken: app.Lifetime.ApplicationStopping); } // Middleware pipeline diff --git a/src/ScadaLink.Host/StartupRetry.cs b/src/ScadaLink.Host/StartupRetry.cs index b9ce640a..7e4ae38b 100644 --- a/src/ScadaLink.Host/StartupRetry.cs +++ b/src/ScadaLink.Host/StartupRetry.cs @@ -28,7 +28,7 @@ public static class StartupRetry /// Logger for retry warnings. /// Optional predicate classifying an exception as transient; null means all exceptions are transient. /// Cancellation token that aborts the retry loop immediately. - public static async Task ExecuteWithRetryAsync( + public static Task ExecuteWithRetryAsync( string operationName, Func operation, int maxAttempts, @@ -36,6 +36,23 @@ public static class StartupRetry ILogger logger, Func? isTransient = null, CancellationToken cancellationToken = default) + => ExecuteWithRetryAsync(operationName, _ => operation(), maxAttempts, initialDelay, logger, isTransient, cancellationToken); + + /// + /// Executes an asynchronous operation with bounded exponential backoff, retrying only transient faults. + /// Overload that forwards the retry-loop cancellation token to the operation itself — + /// Host-019: needed so callers (e.g. the database-migration step) can honour + /// IHostApplicationLifetime.ApplicationStopping inside the operation as well + /// as inside the inter-attempt Task.Delay. + /// + public static async Task ExecuteWithRetryAsync( + string operationName, + Func operation, + int maxAttempts, + TimeSpan initialDelay, + ILogger logger, + Func? isTransient = null, + CancellationToken cancellationToken = default) { // Default: treat every exception as transient (preserves the pre-Host-015 // behaviour for callers that do not classify faults). @@ -47,7 +64,7 @@ public static class StartupRetry cancellationToken.ThrowIfCancellationRequested(); try { - await operation(); + await operation(cancellationToken); if (attempt > 1) logger.LogInformation( "Startup operation '{Operation}' succeeded on attempt {Attempt}.", diff --git a/src/ScadaLink.InboundAPI/Middleware/AuditWriteMiddleware.cs b/src/ScadaLink.InboundAPI/Middleware/AuditWriteMiddleware.cs index 10e57e26..bcb4829a 100644 --- a/src/ScadaLink.InboundAPI/Middleware/AuditWriteMiddleware.cs +++ b/src/ScadaLink.InboundAPI/Middleware/AuditWriteMiddleware.cs @@ -251,10 +251,16 @@ public sealed class AuditWriteMiddleware ForwardState = null, }; - // Fire-and-forget — the writer itself swallows; the additional - // try/catch around the fire still protects us if WriteAsync throws - // synchronously before returning a task. - _ = _auditWriter.WriteAsync(evt); + // InboundAPI-018: fire-and-forget the writer so the user-facing + // response stays non-blocking (alog.md §13 — audit emission must + // NEVER abort or delay the user request), but observe the returned + // Task so an asynchronous fault is logged instead of vanishing into + // TaskScheduler.UnobservedTaskException. The outer try/catch still + // catches a synchronous throw before WriteAsync returns a task; the + // ContinueWith only fires on a faulted task and runs off-thread, so + // it does not block the response. + var writeTask = _auditWriter.WriteAsync(evt); + ObserveAuditWriteFault(writeTask, ctx); } catch (Exception ex) { @@ -265,6 +271,36 @@ public sealed class AuditWriteMiddleware } } + /// + /// InboundAPI-018: observe the audit writer's returned + /// so a fault that surfaces ASYNCHRONOUSLY (e.g. a DB timeout deep in the + /// central audit pipeline) is logged at Warning rather than dropped into + /// . Stays + /// fire-and-forget so the user-facing response is not delayed — the + /// continuation runs only on a faulted task and writes a single log line + /// off the request thread. A completed-successfully task takes the fast + /// path with no continuation scheduled. + /// + private void ObserveAuditWriteFault(Task writeTask, HttpContext ctx) + { + if (writeTask.IsCompletedSuccessfully) + { + return; + } + + var method = ctx.Request.Method; + var path = ctx.Request.Path; + var status = ctx.Response.StatusCode; + _ = writeTask.ContinueWith( + t => _logger.LogWarning( + t.Exception, + "AuditWriteMiddleware async audit write faulted for {Method} {Path} (status {Status})", + method, path, status), + CancellationToken.None, + TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously, + TaskScheduler.Default); + } + /// /// Reads the buffered request body up to bytes /// into a string for the audit copy and rewinds the stream so the diff --git a/src/ScadaLink.Security/SecurityOptionsValidator.cs b/src/ScadaLink.Security/SecurityOptionsValidator.cs new file mode 100644 index 00000000..178f07da --- /dev/null +++ b/src/ScadaLink.Security/SecurityOptionsValidator.cs @@ -0,0 +1,70 @@ +using Microsoft.Extensions.Options; + +namespace ScadaLink.Security; + +/// +/// Security-020: validates at startup so a +/// missing or empty required LDAP field fails fast at boot with a clear, +/// key-naming message — rather than surfacing minutes or hours later as a +/// generic "An unexpected error occurred during authentication" on the first +/// real login attempt. +/// +/// +/// The LDAP-side required fields validated here are +/// (no sane default — the host must be specified) and +/// (the DN root every directory +/// search runs against). A typo in the appsettings section name, a missing +/// environment-variable substitution, or a misconfigured Docker compose file +/// leaves both defaulted to string.Empty — without this validator the +/// process would start cleanly and only fail on the first login when +/// LdapConnection.Connect("") throws a low-level exception that does +/// not name the offending config key. +/// +/// +/// +/// is intentionally NOT validated +/// here — it already fails fast at construction +/// (Security-003 fix), with a length-aware error message. Centralising it +/// here would duplicate that guard; leaving it on the constructor keeps the +/// minimum-byte length contract co-located with the type that enforces it. +/// +/// +public sealed class SecurityOptionsValidator : IValidateOptions +{ + /// + /// The configuration section name is bound + /// to (matches the Host's builder.Configuration.GetSection("Security") + /// call). Exposed so validation messages can name the full + /// Security:Field key the operator would edit, not just the field + /// name. + /// + public const string ConfigSectionName = "Security"; + + /// + public ValidateOptionsResult Validate(string? name, SecurityOptions options) + { + ArgumentNullException.ThrowIfNull(options); + + var failures = new List(); + + if (string.IsNullOrWhiteSpace(options.LdapServer)) + { + failures.Add( + $"{ConfigSectionName}:{nameof(SecurityOptions.LdapServer)} is required " + + "but was empty or whitespace — set it to the LDAP server hostname or IP " + + "(e.g. \"ldap.example.com\")."); + } + + if (string.IsNullOrWhiteSpace(options.LdapSearchBase)) + { + failures.Add( + $"{ConfigSectionName}:{nameof(SecurityOptions.LdapSearchBase)} is required " + + "but was empty or whitespace — set it to the search-base DN " + + "(e.g. \"dc=example,dc=com\")."); + } + + return failures.Count == 0 + ? ValidateOptionsResult.Success + : ValidateOptionsResult.Fail(failures); + } +} diff --git a/src/ScadaLink.Security/ServiceCollectionExtensions.cs b/src/ScadaLink.Security/ServiceCollectionExtensions.cs index 318ac10e..865faa5f 100644 --- a/src/ScadaLink.Security/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.Security/ServiceCollectionExtensions.cs @@ -1,5 +1,6 @@ using Microsoft.AspNetCore.Authentication.Cookies; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.DependencyInjection.Extensions; using Microsoft.Extensions.Options; namespace ScadaLink.Security; @@ -16,6 +17,17 @@ public static class ServiceCollectionExtensions services.AddScoped(); services.AddScoped(); + // Security-020: register the IValidateOptions so a + // missing/empty LdapServer or LdapSearchBase fails fast at startup + // with a clear, key-naming message rather than a generic LDAP error + // on the first real login. ValidateOnStart() forces the validation to + // run during host startup rather than lazily on the first + // IOptions resolve. TryAddEnumerable so multiple + // AddSecurity calls (or future additional validators) don't pile up. + services.AddOptions().ValidateOnStart(); + services.TryAddEnumerable( + ServiceDescriptor.Singleton, SecurityOptionsValidator>()); + // Register ASP.NET Core authentication with cookie scheme services.AddAuthentication(CookieAuthenticationDefaults.AuthenticationScheme) .AddCookie(options => diff --git a/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs b/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs index ccb2d2d3..b3943d17 100644 --- a/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs +++ b/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs @@ -51,6 +51,30 @@ public class StoreAndForwardService private Timer? _retryTimer; private int _retryInProgress; + /// + /// StoreAndForward-024: the in-flight retry sweep , or + /// null when no sweep is currently running. Captured when the timer + /// callback starts a sweep so can wait for it to + /// finish before the host disposes downstream dependencies + /// (, ) that the sweep is + /// still touching. Written from the timer thread and from + /// , so reads are synchronised via the + /// APIs. + /// + private Task? _sweepTask; + + /// + /// StoreAndForward-024: how long waits for an + /// in-flight retry sweep to finish before returning. The default — 10 s — + /// is generous enough to let a typical sweep over the buffered queue drain, + /// but bounded so a hung downstream call (a stuck SQLite write, a + /// long-running delivery handler) cannot block host shutdown indefinitely. + /// On timeout the wait is abandoned and the timer is still disposed; the + /// sweep keeps running but will throw on the next call into a disposed + /// dependency — preferred to blocking shutdown forever. + /// + private static readonly TimeSpan SweepShutdownWaitTimeout = TimeSpan.FromSeconds(10); + /// /// WP-10: Delivery handler delegate. The return value / exception is interpreted /// the same way on both the immediate-delivery path () @@ -120,7 +144,14 @@ public class StoreAndForwardService { await _storage.InitializeAsync(); _retryTimer = new Timer( - _ => _ = RetryPendingMessagesAsync(), + // StoreAndForward-024: capture the sweep Task on each tick so + // StopAsync can await any in-flight invocation before the host + // disposes _storage/_replication underneath it. The RetryPending + // path is self-guarded against overlapping sweeps via the + // _retryInProgress Interlocked flag, so unconditionally re-assigning + // the field here cannot lose a still-running task (the new tick + // will short-circuit if one is already running). + _ => Volatile.Write(ref _sweepTask, RetryPendingMessagesAsync()), null, _options.RetryTimerInterval, _options.RetryTimerInterval); @@ -131,15 +162,58 @@ public class StoreAndForwardService } /// - /// Stops the background retry timer. + /// Stops the background retry timer and waits (bounded) for any in-flight + /// retry sweep to finish before returning. + /// + /// StoreAndForward-024: prior to this fix, only + /// disposed the timer — a sweep already inside + /// continued running against + /// and after this method + /// returned, and could then NRE / throw on a disposed dependency once the + /// DI container ran its own shutdown. We now await the captured sweep task + /// (with a bounded so a hung + /// dependency cannot block host shutdown indefinitely) before returning. /// public async Task StopAsync() { if (_retryTimer != null) { + // Stop the periodic callback first so no new sweep starts while we + // are waiting for the in-flight one to drain. await _retryTimer.DisposeAsync(); _retryTimer = null; } + + var inflight = Volatile.Read(ref _sweepTask); + if (inflight is null || inflight.IsCompleted) + { + return; + } + + try + { + // WaitAsync with a finite timeout: a hung delivery handler / + // storage call cannot block host shutdown indefinitely. On timeout + // the sweep keeps running but the host is free to proceed with + // disposal — preferred to never returning. + await inflight.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false); + } + catch (TimeoutException) + { + _logger.LogWarning( + "Store-and-forward retry sweep did not finish within {Timeout}; " + + "shutdown is proceeding while the sweep is still in-flight", + SweepShutdownWaitTimeout); + } + catch (Exception ex) + { + // The sweep itself already logs at Error on failure (see + // RetryPendingMessagesAsync's catch); we only log here so a + // surprise fault during shutdown is still visible. Swallow so the + // host's shutdown sequence can continue regardless. + _logger.LogWarning(ex, + "Store-and-forward retry sweep faulted during shutdown wait"); + } } /// diff --git a/tests/ScadaLink.CLI.Tests/CliConfigTests.cs b/tests/ScadaLink.CLI.Tests/CliConfigTests.cs index 0185a06f..55d96810 100644 --- a/tests/ScadaLink.CLI.Tests/CliConfigTests.cs +++ b/tests/ScadaLink.CLI.Tests/CliConfigTests.cs @@ -63,4 +63,60 @@ public class CliConfigTests Environment.SetEnvironmentVariable("SCADALINK_FORMAT", orig); } } + + /// + /// CLI-021 regression: a malformed ~/.scadalink/config.json must NOT abort the + /// CLI before any command runs — Load() must warn (to stderr) and return a + /// usable default config so command-line overrides (--url, --username, etc.) + /// and env vars can still take effect. + /// + [Fact] + public void Load_MalformedConfigFile_DoesNotThrow_WarnsAndReturnsDefault() + { + var tempHome = Path.Combine(Path.GetTempPath(), "scadalink-cli-test-" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(Path.Combine(tempHome, ".scadalink")); + File.WriteAllText( + Path.Combine(tempHome, ".scadalink", "config.json"), + "{ this is not valid json :: "); + + var origHome = Environment.GetEnvironmentVariable("HOME"); + var origUserProfile = Environment.GetEnvironmentVariable("USERPROFILE"); + var origUrl = Environment.GetEnvironmentVariable("SCADALINK_MANAGEMENT_URL"); + var origFormat = Environment.GetEnvironmentVariable("SCADALINK_FORMAT"); + var origUser = Environment.GetEnvironmentVariable("SCADALINK_USERNAME"); + var origPass = Environment.GetEnvironmentVariable("SCADALINK_PASSWORD"); + var origStderr = Console.Error; + try + { + Environment.SetEnvironmentVariable("HOME", tempHome); + Environment.SetEnvironmentVariable("USERPROFILE", tempHome); + Environment.SetEnvironmentVariable("SCADALINK_MANAGEMENT_URL", null); + Environment.SetEnvironmentVariable("SCADALINK_FORMAT", null); + Environment.SetEnvironmentVariable("SCADALINK_USERNAME", null); + Environment.SetEnvironmentVariable("SCADALINK_PASSWORD", null); + + var stderrCapture = new StringWriter(); + Console.SetError(stderrCapture); + + // Must not throw. + var config = CliConfig.Load(); + + Assert.Equal("json", config.DefaultFormat); + Assert.Null(config.ManagementUrl); + var stderrText = stderrCapture.ToString(); + Assert.Contains("warning", stderrText, StringComparison.OrdinalIgnoreCase); + Assert.Contains("config.json", stderrText); + } + finally + { + Console.SetError(origStderr); + Environment.SetEnvironmentVariable("HOME", origHome); + Environment.SetEnvironmentVariable("USERPROFILE", origUserProfile); + Environment.SetEnvironmentVariable("SCADALINK_MANAGEMENT_URL", origUrl); + Environment.SetEnvironmentVariable("SCADALINK_FORMAT", origFormat); + Environment.SetEnvironmentVariable("SCADALINK_USERNAME", origUser); + Environment.SetEnvironmentVariable("SCADALINK_PASSWORD", origPass); + try { Directory.Delete(tempHome, recursive: true); } catch { /* best-effort cleanup */ } + } + } } diff --git a/tests/ScadaLink.Communication.Tests/Grpc/SiteStreamGrpcServerTests.cs b/tests/ScadaLink.Communication.Tests/Grpc/SiteStreamGrpcServerTests.cs index 0af6dfaa..2f5e03a1 100644 --- a/tests/ScadaLink.Communication.Tests/Grpc/SiteStreamGrpcServerTests.cs +++ b/tests/ScadaLink.Communication.Tests/Grpc/SiteStreamGrpcServerTests.cs @@ -308,6 +308,40 @@ public class SiteStreamGrpcServerTests : TestKit await streamTask; } + [Fact] + public async Task Comm021_SubscribeThrows_StopsRelayActorAndRemovesActiveStreamEntry() + { + // Communication-021 regression: SubscribeInstance creates a StreamRelayActor + // and registers an _activeStreams entry BEFORE calling _streamSubscriber.Subscribe. + // If Subscribe throws (e.g. stale instance, site runtime shutting down) and the + // pre-fix code lets the throw escape without the wrapping try, the relay actor + // and the activeStreams entry both leak. The fix wraps the Subscribe call so the + // catch deterministically stops the actor and removes the entry before re-throw. + var subscriber = Substitute.For(); + subscriber.Subscribe(Arg.Any(), Arg.Any()) + .Returns(_ => throw new InvalidOperationException("instance not found")); + + var server = new SiteStreamGrpcServer(subscriber, _logger); + server.SetReady(Sys); + + var writer = Substitute.For>(); + var context = CreateMockContext(); + + // The InvalidOperationException is expected to propagate (the gRPC stack maps + // unhandled throws to Internal); the load-bearing assertion is the cleanup. + await Assert.ThrowsAsync( + () => server.SubscribeInstance(MakeRequest("corr-comm021"), writer, context)); + + // _activeStreams entry was inserted before Subscribe was called; the catch + // must remove it so a follow-up subscription with the same correlation id is + // not blocked, and the relay actor must be stopped so it does not leak. + Assert.Equal(0, server.ActiveStreamCount); + + // RemoveSubscriber must NOT have been called (Subscribe never returned a + // subscription id) — verifying we hit the catch path, not the finally path. + subscriber.DidNotReceive().RemoveSubscriber(Arg.Any()); + } + [Fact] public void SetReady_AllowsStreamCreation() { diff --git a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs index ab6d08bf..cd6b64d6 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs @@ -1007,6 +1007,151 @@ public class DeploymentServiceTests : TestKit public SilentProbeActor() => ReceiveAny(_ => { }); } + // ── DeploymentManager-019: lifecycle timeouts must write an audit entry ── + + /// + /// DeploymentManager-019: when a Disable times out at the site, the + /// operator's attempted action must still be recorded in the audit log + /// with the documented DisableTimedOut action — pre-fix nothing + /// was written and the audit trail was silent about the attempt. + /// + [Fact] + public async Task DisableInstanceAsync_LifecycleTimeout_WritesDisableTimedOutAuditEntry() + { + var instance = new Instance("TimeoutAuditInst") + { + Id = 61, + SiteId = 1, + State = InstanceState.Enabled, + }; + _repo.GetInstanceByIdAsync(61, Arg.Any()).Returns(instance); + + var commActor = Sys.ActorOf(Props.Create(() => new SilentProbeActor())); + + var comms = new CommunicationService( + Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }), + NullLogger.Instance); + comms.SetCommunicationActor(commActor); + + var siteRepo = Substitute.For(); + var deadline = TimeSpan.FromMilliseconds(300); + var service = new DeploymentService( + _repo, siteRepo, _pipeline, comms, _lockManager, _audit, + new DiffService(), + new DeploymentStatusNotifier(NullLogger.Instance), + Options.Create(new DeploymentManagerOptions + { + OperationLockTimeout = TimeSpan.FromSeconds(5), + LifecycleCommandTimeout = deadline, + }), + NullLogger.Instance); + + var result = await service.DisableInstanceAsync(61, "operator-jane"); + + Assert.True(result.IsFailure); + + // The DisableTimedOut audit entry must have been written. Pre-fix the + // catch block returned without calling _auditService at all. + await _audit.Received(1).LogAsync( + "operator-jane", + "DisableTimedOut", + "Instance", + "61", + instance.UniqueName, + Arg.Any(), + Arg.Any()); + } + + /// + /// DeploymentManager-019: same audit guarantee for the Enable path. + /// + [Fact] + public async Task EnableInstanceAsync_LifecycleTimeout_WritesEnableTimedOutAuditEntry() + { + var instance = new Instance("EnableTimeoutInst") + { + Id = 62, + SiteId = 1, + State = InstanceState.Disabled, + }; + _repo.GetInstanceByIdAsync(62, Arg.Any()).Returns(instance); + + var commActor = Sys.ActorOf(Props.Create(() => new SilentProbeActor())); + var comms = new CommunicationService( + Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }), + NullLogger.Instance); + comms.SetCommunicationActor(commActor); + + var siteRepo = Substitute.For(); + var service = new DeploymentService( + _repo, siteRepo, _pipeline, comms, _lockManager, _audit, + new DiffService(), + new DeploymentStatusNotifier(NullLogger.Instance), + Options.Create(new DeploymentManagerOptions + { + OperationLockTimeout = TimeSpan.FromSeconds(5), + LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300), + }), + NullLogger.Instance); + + var result = await service.EnableInstanceAsync(62, "operator-jane"); + + Assert.True(result.IsFailure); + await _audit.Received(1).LogAsync( + "operator-jane", + "EnableTimedOut", + "Instance", + "62", + instance.UniqueName, + Arg.Any(), + Arg.Any()); + } + + /// + /// DeploymentManager-019: same audit guarantee for the Delete path. + /// + [Fact] + public async Task DeleteInstanceAsync_LifecycleTimeout_WritesDeleteTimedOutAuditEntry() + { + var instance = new Instance("DeleteTimeoutInst") + { + Id = 63, + SiteId = 1, + State = InstanceState.Enabled, + }; + _repo.GetInstanceByIdAsync(63, Arg.Any()).Returns(instance); + + var commActor = Sys.ActorOf(Props.Create(() => new SilentProbeActor())); + var comms = new CommunicationService( + Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }), + NullLogger.Instance); + comms.SetCommunicationActor(commActor); + + var siteRepo = Substitute.For(); + var service = new DeploymentService( + _repo, siteRepo, _pipeline, comms, _lockManager, _audit, + new DiffService(), + new DeploymentStatusNotifier(NullLogger.Instance), + Options.Create(new DeploymentManagerOptions + { + OperationLockTimeout = TimeSpan.FromSeconds(5), + LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300), + }), + NullLogger.Instance); + + var result = await service.DeleteInstanceAsync(63, "operator-jane"); + + Assert.True(result.IsFailure); + await _audit.Received(1).LogAsync( + "operator-jane", + "DeleteTimedOut", + "Instance", + "63", + instance.UniqueName, + Arg.Any(), + Arg.Any()); + } + // ── DeploymentManager-003: post-success persistence must commit the Success status ── [Fact] diff --git a/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs b/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs index 95a6d87e..d78833aa 100644 --- a/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs +++ b/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs @@ -337,6 +337,49 @@ public class ExternalSystemClientTests $"Call took {sw.Elapsed}, expected to time out near the configured 200ms window"); } + /// + /// ExternalSystemGateway-019 regression: defaults + /// to 100 seconds and is enforced internally by SendAsync via its own + /// private CTS — a raised by that internal + /// CTS does not trip either the caller's token or the gateway's timeout CTS, + /// so any operator-configured + /// greater than 100 s would be silently clipped. The fix sets + /// on the rented client + /// so the per-call CancellationTokenSource(DefaultHttpTimeout) in + /// InvokeHttpAsync is the sole timeout source. This test verifies the + /// property is in fact set before any request is dispatched. + /// + [Fact] + public async Task Call_DisablesHttpClientFrameworkTimeoutSoLongTimeoutsArentClipped() + { + var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 }; + var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 }; + StubResolution(system, method); + + var httpClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.OK, "{}")); + // Sanity check: the factory-supplied default is the framework's 100 s — exactly + // the value the fix must override so an operator-configured timeout > 100 s is + // honoured verbatim. + Assert.Equal(TimeSpan.FromSeconds(100), httpClient.Timeout); + + _httpClientFactory.CreateClient(Arg.Any()).Returns(httpClient); + + var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMinutes(5) }; + var client = new ExternalSystemClient( + _httpClientFactory, _repository, + NullLogger.Instance, + options: Microsoft.Extensions.Options.Options.Create(options)); + + var result = await client.CallAsync("TestAPI", "getData"); + + Assert.True(result.Success); + // After InvokeHttpAsync runs, the rented client's Timeout must have been + // set to InfiniteTimeSpan — proving the framework-default 100 s clip is + // disabled and the per-call CTS built from DefaultHttpTimeout is the + // sole timeout source. + Assert.Equal(Timeout.InfiniteTimeSpan, httpClient.Timeout); + } + [Fact] public async Task Call_CallerCancellation_IsNotMisreportedAsTimeout() { diff --git a/tests/ScadaLink.HealthMonitoring.Tests/CentralHealthReportLoopTests.cs b/tests/ScadaLink.HealthMonitoring.Tests/CentralHealthReportLoopTests.cs index 0bbb29b7..117d7879 100644 --- a/tests/ScadaLink.HealthMonitoring.Tests/CentralHealthReportLoopTests.cs +++ b/tests/ScadaLink.HealthMonitoring.Tests/CentralHealthReportLoopTests.cs @@ -133,6 +133,83 @@ public class CentralHealthReportLoopTests Assert.Equal(fixedInstant.ToUnixTimeMilliseconds(), loop.CurrentSequenceNumber); } + /// + /// HealthMonitoring-018 regression: when + /// throws, the per-interval counters that + /// just drained must be + /// restored back into the shared collector so they roll forward into the + /// next interval rather than being silently lost. Same shape as the + /// HealthMonitoring-017 fix in . + /// + [Fact] + public async Task ProcessReportFailure_PreservesIntervalCountersForNextReport() + { + var collector = new SiteHealthCollector(); + // Pre-populate every per-interval counter so the restore path on each + // field is exercised. The loop's first iteration will read-and-reset + // these via CollectReport, then ProcessReport will throw, and the + // restore must put them back. + collector.IncrementScriptError(); + collector.IncrementScriptError(); + collector.IncrementAlarmError(); + collector.IncrementDeadLetter(); + collector.IncrementDeadLetter(); + collector.IncrementDeadLetter(); + collector.IncrementSiteAuditWriteFailures(); + collector.IncrementAuditRedactionFailure(); + collector.IncrementAuditRedactionFailure(); + + var aggregator = new FailingThenSucceedingAggregator(); + var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true }; + var options = Options.Create(new HealthMonitoringOptions + { + ReportInterval = TimeSpan.FromMilliseconds(50) + }); + + var loop = new CentralHealthReportLoop( + collector, aggregator, clusterNodes, options, + NullLogger.Instance); + + await RunLoopBriefly(loop, 450); + + // First call threw, later succeeded — the first successful report + // must carry the previously-failed interval's accumulated counts. + Assert.NotEmpty(aggregator.Processed); + var firstSuccess = aggregator.Processed[0]; + Assert.Equal(2, firstSuccess.ScriptErrorCount); + Assert.Equal(1, firstSuccess.AlarmEvaluationErrorCount); + Assert.Equal(3, firstSuccess.DeadLetterCount); + Assert.Equal(1, firstSuccess.SiteAuditWriteFailures); + Assert.Equal(2, firstSuccess.AuditRedactionFailure); + } + + /// + /// whose first ProcessReport + /// call throws (only the first), then subsequent calls succeed. Used by + /// + /// to verify the HealthMonitoring-018 restore-on-failure path. + /// + private sealed class FailingThenSucceedingAggregator : ICentralHealthAggregator + { + private int _callCount; + public List Processed { get; } = []; + + public void ProcessReport(SiteHealthReport report) + { + var n = Interlocked.Increment(ref _callCount); + if (n == 1) + { + throw new InvalidOperationException("aggregator temporarily unavailable"); + } + Processed.Add(report); + } + + public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { } + public IReadOnlyDictionary GetAllSiteStates() => + new Dictionary(); + public SiteHealthState? GetSiteState(string siteId) => null; + } + [Fact] public async Task SetsActiveNodeFlag_EvenWhenNotPrimary() { diff --git a/tests/ScadaLink.HealthMonitoring.Tests/HealthReportSenderTests.cs b/tests/ScadaLink.HealthMonitoring.Tests/HealthReportSenderTests.cs index 25e15150..9276c0a5 100644 --- a/tests/ScadaLink.HealthMonitoring.Tests/HealthReportSenderTests.cs +++ b/tests/ScadaLink.HealthMonitoring.Tests/HealthReportSenderTests.cs @@ -315,6 +315,89 @@ public class HealthReportSenderTests Assert.Contains(warnings, w => w.Message.Contains("cluster nodes", StringComparison.OrdinalIgnoreCase)); } + /// + /// HealthMonitoring-017 regression: when the transport's Send throws, + /// the per-interval counters that + /// just drained via Interlocked.Exchange must be restored back into the + /// collector so they roll forward into the next interval rather than being + /// silently lost. Before the fix, a transport failure left the counts in the + /// un-sent report only, and the next successful report shipped with the + /// counters at zero. + /// + [Fact] + public async Task SendFailure_PreservesIntervalCountersForNextReport() + { + var transport = new FailingThenSucceedingTransport(); + var collector = new SiteHealthCollector(); + collector.SetActiveNode(true); + // Pre-populate every per-interval counter so the restore path on each + // field is exercised — script error, alarm error, dead letter, site + // audit write failure, audit redaction failure. + collector.IncrementScriptError(); + collector.IncrementScriptError(); + collector.IncrementAlarmError(); + collector.IncrementDeadLetter(); + collector.IncrementDeadLetter(); + collector.IncrementDeadLetter(); + collector.IncrementSiteAuditWriteFailures(); + collector.IncrementAuditRedactionFailure(); + collector.IncrementAuditRedactionFailure(); + + var options = Options.Create(new HealthMonitoringOptions + { + ReportInterval = TimeSpan.FromMilliseconds(50) + }); + + var sender = new HealthReportSender( + collector, + transport, + options, + NullLogger.Instance, + new FakeSiteIdentityProvider()); + + using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(500)); + try + { + await sender.StartAsync(cts.Token); + await Task.Delay(450, CancellationToken.None); + await sender.StopAsync(CancellationToken.None); + } + catch (OperationCanceledException) { } + + // The first interval's Send threw, then later intervals succeeded. The + // first successful report must include the previously-failed interval's + // accumulated counts. + Assert.NotEmpty(transport.SentReports); + var firstSuccess = transport.SentReports[0]; + Assert.Equal(2, firstSuccess.ScriptErrorCount); + Assert.Equal(1, firstSuccess.AlarmEvaluationErrorCount); + Assert.Equal(3, firstSuccess.DeadLetterCount); + Assert.Equal(1, firstSuccess.SiteAuditWriteFailures); + Assert.Equal(2, firstSuccess.AuditRedactionFailure); + } + + /// + /// that throws on the first + /// Send call (and only the first), then succeeds. Used by + /// to + /// verify the HealthMonitoring-017 restore-on-failure path. + /// + private sealed class FailingThenSucceedingTransport : IHealthReportTransport + { + private int _callCount; + public List SentReports { get; } = []; + + public void Send(SiteHealthReport report) + { + var n = Interlocked.Increment(ref _callCount); + if (n == 1) + { + throw new InvalidOperationException("transport temporarily unavailable"); + } + SentReports.Add(report); + } + } + /// /// HealthMonitoring-006 regression: the sequence-number seed must be derived /// from the injected so the Unix-ms seeding strategy diff --git a/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs index 284f6a33..b2975f63 100644 --- a/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs +++ b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs @@ -64,4 +64,48 @@ public class LoggerConfigurationTests Assert.Single(sink.LogEvents); Assert.Equal(LogEventLevel.Information, sink.LogEvents[0].Level); } + + /// + /// Host-022: an unrecognised ScadaLink:Logging:MinimumLevel (e.g. a typo + /// like "Informaiton") must NOT abort startup but MUST emit a one-shot warning + /// naming the offending value and the fallback so the silent coercion is + /// visible. Null/blank is treated as "unset" and silently defaults. + /// + [Fact] + public void ParseLevel_UnrecognisedValue_FallsBackAndWarns() + { + var writer = new StringWriter(); + + var result = LoggerConfigurationFactory.ParseLevel("Informaiton", writer); + + Assert.Equal(LogEventLevel.Information, result); + var warning = writer.ToString(); + Assert.Contains("warning", warning, StringComparison.OrdinalIgnoreCase); + Assert.Contains("Informaiton", warning); + Assert.Contains("Information", warning); + } + + [Fact] + public void ParseLevel_NullOrBlank_FallsBackSilently() + { + var writer = new StringWriter(); + + var nullResult = LoggerConfigurationFactory.ParseLevel(null, writer); + var blankResult = LoggerConfigurationFactory.ParseLevel(" ", writer); + + Assert.Equal(LogEventLevel.Information, nullResult); + Assert.Equal(LogEventLevel.Information, blankResult); + Assert.Empty(writer.ToString()); + } + + [Fact] + public void ParseLevel_RecognisedValue_NoWarning() + { + var writer = new StringWriter(); + + var result = LoggerConfigurationFactory.ParseLevel("Warning", writer); + + Assert.Equal(LogEventLevel.Warning, result); + Assert.Empty(writer.ToString()); + } } diff --git a/tests/ScadaLink.InboundAPI.Tests/Middleware/AuditWriteMiddlewareTests.cs b/tests/ScadaLink.InboundAPI.Tests/Middleware/AuditWriteMiddlewareTests.cs index cd220a85..6fbe0271 100644 --- a/tests/ScadaLink.InboundAPI.Tests/Middleware/AuditWriteMiddlewareTests.cs +++ b/tests/ScadaLink.InboundAPI.Tests/Middleware/AuditWriteMiddlewareTests.cs @@ -689,4 +689,115 @@ public class AuditWriteMiddlewareTests $"ResponseSummary byte count {Encoding.UTF8.GetByteCount(evt.ResponseSummary!)} exceeded cap {cap}"); Assert.True(evt.PayloadTruncated); } + + // --------------------------------------------------------------------- + // InboundAPI-018: asynchronously faulted audit-write tasks must be + // observed (logged) rather than silently dropped — but must still NOT + // alter the user-facing HTTP response (alog.md §13). + // --------------------------------------------------------------------- + + /// + /// Test-only writer whose returns a Task that + /// faults AFTER an asynchronous boundary, so the throw happens after + /// 's synchronous try/catch can see it — + /// exactly the fire-and-forget bug InboundAPI-018 closes. + /// + private sealed class AsyncFaultingAuditWriter : ICentralAuditWriter + { + public Task WriteAsync(AuditEvent evt, CancellationToken ct = default) + { + return FaultAsync(); + + static async Task FaultAsync() + { + // Yield off-thread so the fault surfaces ASYNCHRONOUSLY (not + // captured by a synchronous try/catch around the WriteAsync + // call site). + await Task.Yield(); + throw new InvalidOperationException("async audit write failed"); + } + } + } + + /// + /// Captures log entries written through a + /// so the test can assert on the Warning that + /// emits. + /// + private sealed class RecordingLogger : Microsoft.Extensions.Logging.ILogger + { + public List<(Microsoft.Extensions.Logging.LogLevel Level, string Message, Exception? Exception)> Entries { get; } = new(); + + public IDisposable BeginScope(TState state) where TState : notnull => NullScope.Instance; + + public bool IsEnabled(Microsoft.Extensions.Logging.LogLevel logLevel) => true; + + public void Log( + Microsoft.Extensions.Logging.LogLevel logLevel, + Microsoft.Extensions.Logging.EventId eventId, + TState state, + Exception? exception, + Func formatter) + { + lock (Entries) + { + Entries.Add((logLevel, formatter(state, exception), exception)); + } + } + + private sealed class NullScope : IDisposable + { + public static readonly NullScope Instance = new(); + public void Dispose() { } + } + } + + [Fact] + public async Task AuditWriter_AsyncFault_IsObserved_AsWarning_AndDoesNotAlterResponse() + { + var writer = new AsyncFaultingAuditWriter(); + var logger = new RecordingLogger(); + var ctx = BuildContext(); + + var mw = new AuditWriteMiddleware( + next: _ => + { + ctx.Response.StatusCode = 200; + return Task.CompletedTask; + }, + auditWriter: writer, + logger: logger, + options: new StaticAuditLogOptionsMonitor(new AuditLogOptions())); + + await mw.InvokeAsync(ctx); + + // The user-facing response is untouched — audit emission is best-effort. + Assert.Equal(200, ctx.Response.StatusCode); + + // Give the off-thread continuation a moment to fire and log. Spin + // briefly rather than sleep-then-assert so the test is resilient to + // scheduler jitter without inflating runtime on success. + var deadline = DateTime.UtcNow.AddSeconds(2); + while (DateTime.UtcNow < deadline) + { + lock (logger.Entries) + { + if (logger.Entries.Any(e => + e.Level == Microsoft.Extensions.Logging.LogLevel.Warning + && e.Exception is not null + && e.Message.Contains("async audit write faulted"))) + { + return; + } + } + await Task.Delay(20); + } + + // If we reach this point, the continuation did not fire — pre-fix the + // fault would have been swallowed entirely and no log line emitted. + var snapshot = logger.Entries.Select(e => $"{e.Level}: {e.Message}").ToList(); + Assert.Fail( + "Expected a Warning log entry observing the async audit-write fault — none found. " + + $"Entries: [{string.Join(", ", snapshot)}]"); + } } diff --git a/tests/ScadaLink.Security.Tests/SecurityTests.cs b/tests/ScadaLink.Security.Tests/SecurityTests.cs index a9375076..1b8b0261 100644 --- a/tests/ScadaLink.Security.Tests/SecurityTests.cs +++ b/tests/ScadaLink.Security.Tests/SecurityTests.cs @@ -434,6 +434,10 @@ public class SecurityReviewRegressionTests services.AddLogging(); services.AddDataProtection(); services.AddSecurity(); + // Security-020: the cookie PostConfigure reads SecurityOptions.Value, + // which triggers SecurityOptionsValidator — supply the required LDAP + // fields so the cookie wiring under test can be resolved. + ConfigureValidLdapDefaults(services); using var provider = services.BuildServiceProvider(); var cookieOptions = provider @@ -446,6 +450,20 @@ public class SecurityReviewRegressionTests Assert.True(cookieOptions.Cookie.HttpOnly); } + /// + /// Security-020: supplies the minimum-valid LDAP fields so the cookie / + /// JWT wiring under test can be resolved without hitting + /// . Used by the cookie-policy + /// integration tests in this class, which only care about the cookie + /// options shape — not the LDAP fields. + /// + private static void ConfigureValidLdapDefaults(IServiceCollection services) => + services.Configure(o => + { + o.LdapServer = "ldap.example.com"; + o.LdapSearchBase = "dc=example,dc=com"; + }); + // --- CentralUI-005: cookie auth must use a sliding session window --- // Documented policy (CLAUDE.md Security & Auth): sliding refresh with a // 30-minute idle timeout. The cookie middleware must enable SlidingExpiration @@ -458,6 +476,7 @@ public class SecurityReviewRegressionTests services.AddLogging(); services.AddDataProtection(); services.AddSecurity(); + ConfigureValidLdapDefaults(services); using var provider = services.BuildServiceProvider(); var cookieOptions = provider @@ -474,6 +493,7 @@ public class SecurityReviewRegressionTests services.AddLogging(); services.AddDataProtection(); services.AddSecurity(); + ConfigureValidLdapDefaults(services); // The idle timeout drives the cookie's expiry window. services.Configure(o => o.IdleTimeoutMinutes = 30); @@ -492,6 +512,7 @@ public class SecurityReviewRegressionTests services.AddLogging(); services.AddDataProtection(); services.AddSecurity(); + ConfigureValidLdapDefaults(services); services.Configure(o => o.IdleTimeoutMinutes = 45); using var provider = services.BuildServiceProvider(); @@ -1196,3 +1217,100 @@ public class AuthorizationPolicyTests } #endregion + +#region Code Review Regression Tests — Security-020 + +/// +/// Security-020: must reject empty +/// / +/// at startup with a clear, key-naming message, so a typo'd appsettings section +/// fails fast at boot instead of surfacing minutes/hours later as a generic +/// LDAP error on the first real login. +/// +public class SecurityOptionsValidatorTests +{ + private static SecurityOptions ValidOptions() => new() + { + LdapServer = "ldap.example.com", + LdapSearchBase = "dc=example,dc=com", + }; + + [Fact] + public void Validate_AllRequiredFieldsSet_Succeeds() + { + var validator = new SecurityOptionsValidator(); + + var result = validator.Validate(name: null, ValidOptions()); + + Assert.True(result.Succeeded); + } + + [Theory] + [InlineData("")] + [InlineData(" ")] + public void Validate_EmptyOrWhitespaceLdapServer_Fails(string ldapServer) + { + var options = ValidOptions(); + options.LdapServer = ldapServer; + var validator = new SecurityOptionsValidator(); + + var result = validator.Validate(name: null, options); + + Assert.True(result.Failed); + // Must name the full Section:Field key so the operator can find it. + Assert.Contains("Security:LdapServer", result.FailureMessage); + } + + [Theory] + [InlineData("")] + [InlineData(" ")] + public void Validate_EmptyOrWhitespaceLdapSearchBase_Fails(string ldapSearchBase) + { + var options = ValidOptions(); + options.LdapSearchBase = ldapSearchBase; + var validator = new SecurityOptionsValidator(); + + var result = validator.Validate(name: null, options); + + Assert.True(result.Failed); + Assert.Contains("Security:LdapSearchBase", result.FailureMessage); + } + + [Fact] + public void Validate_BothRequiredFieldsEmpty_ReportsBoth() + { + var options = new SecurityOptions + { + LdapServer = string.Empty, + LdapSearchBase = string.Empty, + }; + var validator = new SecurityOptionsValidator(); + + var result = validator.Validate(name: null, options); + + Assert.True(result.Failed); + // Both keys named — the operator should not need to re-run after + // fixing the first one to discover the second is also missing. + Assert.Contains("Security:LdapServer", result.FailureMessage); + Assert.Contains("Security:LdapSearchBase", result.FailureMessage); + } + + [Fact] + public void AddSecurity_RegistersSecurityOptionsValidator() + { + var services = new ServiceCollection(); + services.AddLogging(); + services.AddDataProtection(); + services.AddSecurity(); + + using var provider = services.BuildServiceProvider(); + + // The validator participates in IValidateOptions — + // registration is the load-bearing wiring that makes Security-020 + // ValidateOnStart() actually fire. + var validators = provider.GetServices>().ToList(); + Assert.Contains(validators, v => v is SecurityOptionsValidator); + } +} + +#endregion diff --git a/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs b/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs index f5b11067..407470b5 100644 --- a/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs +++ b/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs @@ -513,4 +513,73 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable // counts only sweep retries, so a freshly buffered message has RetryCount 0. Assert.Equal(0, msg.RetryCount); } + + // ─── StoreAndForward-024: StopAsync waits for the in-flight sweep ─── + + /// + /// StoreAndForward-024: must + /// not return until any in-flight retry sweep has completed (or the bounded + /// shutdown timeout fires). Pre-fix it disposed the timer and returned + /// immediately, leaving a mid-flight sweep touching disposed dependencies. + /// + [Fact] + public async Task StopAsync_AwaitsInFlightRetrySweep_BeforeReturning() + { + // Build a service whose timer fires almost immediately, with a handler + // that pauses in the middle of delivery so we can observe StopAsync's + // wait behaviour. + var dbName = $"StopWait_{Guid.NewGuid():N}"; + var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared"; + using var keepAlive = new SqliteConnection(connStr); + keepAlive.Open(); + + var storage = new StoreAndForwardStorage(connStr, NullLogger.Instance); + await storage.InitializeAsync(); + + var options = new StoreAndForwardOptions + { + DefaultRetryInterval = TimeSpan.Zero, + DefaultMaxRetries = 3, + // Fire almost immediately so the sweep is in-flight by the time we call StopAsync. + RetryTimerInterval = TimeSpan.FromMilliseconds(20), + }; + var service = new StoreAndForwardService( + storage, options, NullLogger.Instance); + + // Pre-seed a buffered message so the sweep has work to do, and a + // handler that blocks until we release it. + var handlerEntered = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var releaseHandler = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var handlerCompleted = false; + service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, async _ => + { + handlerEntered.TrySetResult(); + await releaseHandler.Task; + handlerCompleted = true; + return true; + }); + + var seed = await service.EnqueueAsync( + StoreAndForwardCategory.ExternalSystem, "api", """{}""", + attemptImmediateDelivery: false); + Assert.True(seed.WasBuffered); + + await service.StartAsync(); + // Wait until the timer-driven sweep has called into the handler. + var entered = await Task.WhenAny(handlerEntered.Task, Task.Delay(TimeSpan.FromSeconds(2))); + Assert.Same(handlerEntered.Task, entered); + Assert.False(handlerCompleted, "Handler should still be paused inside the sweep."); + + // Kick StopAsync — it must NOT return until the sweep finishes. Run the + // release on a background task so we can prove StopAsync is awaiting. + var stopTask = service.StopAsync(); + Assert.False(stopTask.IsCompleted, + "StopAsync returned before the in-flight sweep was given a chance to finish."); + + // Release the handler — StopAsync should now complete shortly. + releaseHandler.SetResult(); + await stopTask.WaitAsync(TimeSpan.FromSeconds(5)); + Assert.True(handlerCompleted, + "Sweep handler must have finished before StopAsync returned."); + } }