20 Commits

Author SHA1 Message Date
Joseph Doherty f23513c30b docs(code-reviews): regenerate index after resolving 64 of 66 re-review findings 2026-05-17 03:18:47 -04:00
Joseph Doherty d6221419c6 fix(template-engine): resolve TemplateEngine-015,016 — cascade-rename nested derived templates, correct composed-script ParentPath 2026-05-17 03:18:41 -04:00
Joseph Doherty 0135a6b2a6 fix(store-and-forward): resolve StoreAndForward-015..017 — document maxRetries=0 contract, replicate operator retry/discard, real category in activity log 2026-05-17 03:18:41 -04:00
Joseph Doherty be274212f0 fix(site-runtime): resolve SiteRuntime-017..019 — isolated attribute snapshot for child actors, corrected dispatcher doc, remove dead lifecycle handlers 2026-05-17 03:18:41 -04:00
Joseph Doherty 6d63fef934 fix(site-event-logging): resolve SiteEventLogging-012..014 — fault dropped-event tasks, escape LIKE wildcards, re-triage startup-purge finding (Won't Fix) 2026-05-17 03:18:41 -04:00
Joseph Doherty a58cec5776 fix(security): resolve Security-012..015 — fail login on partial LDAP outage, escape-aware DN parsing, idle check on refresh, username normalization 2026-05-17 03:18:33 -04:00
Joseph Doherty f5199e9da9 fix(notification-service): resolve NotificationService-014..018 — classify OAuth2 failures, fail on bad auth config, wire NotificationOptions fallback, disposable concurrency limiter 2026-05-17 03:18:33 -04:00
Joseph Doherty bf6bd8de5a fix(management-service): resolve ManagementService-014..017 — site-scope enforcement on QueryDeployments, atomic override validation, curated fault messages, test coverage 2026-05-17 03:18:33 -04:00
Joseph Doherty 73a393076a fix(inbound-api): resolve InboundAPI-014..017 — return-value validation, reflection-gateway hardening, deadline-bound routed calls, RouteHelper test coverage 2026-05-17 03:18:33 -04:00
Joseph Doherty aca65e85bb fix(host): resolve Host-012..015 — consume DownIfAlone in HOCON, sub-second timing precision, config-driven Serilog sinks, transient-only startup retry 2026-05-17 03:18:33 -04:00
Joseph Doherty eae4077414 fix(health-monitoring): resolve HealthMonitoring-013,014,016 — shorter-timeout cadence, options validation, injected TimeProvider; HealthMonitoring-015 left open (cross-module design decision) 2026-05-17 03:18:24 -04:00
Joseph Doherty da8c9f171b fix(external-system-gateway): resolve ExternalSystemGateway-015..017 — treat MaxRetries=0 as unset, scope HTTP connection cap to gateway clients, no bare trailing '?' 2026-05-17 03:18:24 -04:00
Joseph Doherty 4fa6f0e774 fix(deployment-manager): resolve DeploymentManager-015..017 — reconciliation applies post-success side effects, updates RevisionHash, corrected XML doc 2026-05-17 03:18:24 -04:00
Joseph Doherty 14ba5495d1 fix(data-connection-layer): resolve DataConnectionLayer-014..017 — real logger for OPC UA client, initial-connect failover, accurate subscribe response, per-tag write-batch results 2026-05-17 03:18:24 -04:00
Joseph Doherty 3d3f43229f fix(configuration-database): resolve ConfigurationDatabase-013,014 — fail-fast on missing key ring, single converter local; ConfigurationDatabase-012 left open (cross-module design decision) 2026-05-17 03:18:24 -04:00
Joseph Doherty a768135237 fix(communication): resolve Communication-012..015 — endpoint-aware gRPC client cache, address-change recreation, correlation-id validation, node-flip tests 2026-05-17 03:18:17 -04:00
Joseph Doherty a78c3bcb6f fix(commons): resolve Commons-013,014 — integral JSON index handling, distinguish Malformed vs Legacy OPC UA config 2026-05-17 03:18:17 -04:00
Joseph Doherty 21856a4be7 fix(cluster-infrastructure): resolve ClusterInfrastructure-009,010 — DownIfAlone consumption (via Host-012), validator enforces DownIfAlone=true 2026-05-17 03:18:17 -04:00
Joseph Doherty d7d74ebe5e fix(central-ui): resolve CentralUI-020..025 — auth-ping idle logout, DebugView race, push-handler disposal guard, JS-interop catch narrowing, claim-constant helper, SessionExpiry tests 2026-05-17 03:18:16 -04:00
Joseph Doherty f82bcbed7c fix(cli): resolve CLI-014..016 — re-triage update-command contract, doc-surface drift, table-column union 2026-05-17 03:18:16 -04:00
139 changed files with 6998 additions and 763 deletions
+63 -10
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 |
| Open findings | 0 |
## Summary
@@ -575,11 +575,27 @@ The CLI test suite went from 42 to 77 passing tests.
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Severity | Low |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Commands/TemplateCommands.cs:77`, `src/ScadaLink.CLI/Commands/SiteCommands.cs:86`, `src/ScadaLink.CLI/Commands/ExternalSystemCommands.cs:40-42`, `src/ScadaLink.CLI/Commands/DataConnectionCommands.cs:39-40`, `src/ScadaLink.CLI/Commands/NotificationCommands.cs:40-41`, `src/ScadaLink.CLI/Commands/ApiMethodCommands.cs:79` |
**Re-triage 2026-05-17:** the finding was originally filed as a Medium "Correctness &
logic bugs" issue, but verification against the Commons message contracts shows the
**code is correct** and the defect is purely documentation drift. Every `Update*Command`
record (`UpdateTemplateCommand`, `UpdateSiteCommand`, `UpdateDataConnectionCommand`,
`UpdateExternalSystemCommand`, `UpdateNotificationListCommand`, `UpdateApiMethodCommand`,
`UpdateTemplateAttribute/Alarm/ScriptCommand`, `UpdateRoleMappingCommand`,
`UpdateSharedScriptCommand`, `UpdateDatabaseConnectionDefCommand`, `UpdateAreaCommand`)
carries **non-nullable** "core" fields — they are *whole-entity replace* records, not
sparse patches. Marking the corresponding `--name` / `--protocol` / `--script` flags
`Required = true` is therefore the *correct* CLI behaviour: making them optional would
let an omitted flag send `null`/empty and silently blank the field server-side. Adopting
the sparse-patch model (recommendation option a) would require changing the Commons
records and the server-side Management handlers — both outside this module's editable
surface. Re-triaged to Low / Design-document adherence; resolved per recommendation
option (b).
**Description**
The design doc presents `update` commands with all non-`--id` fields as optional, e.g.
@@ -618,7 +634,20 @@ entity. Option (a) matches the documented surface and the typical CLI expectatio
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending) per recommendation option (b). Verification of the
Commons `Update*Command` records confirmed whole-replace is the intentional contract, so
the CLI's `Required = true` flags are correct and were left unchanged. The in-repo
`src/ScadaLink.CLI/README.md` — which is authoritative and previously listed every
`update` core field as optional `[--name]` — was corrected: the core flags
(`--name`/`--protocol`/`--script`/`--code`/`--emails`/`--endpoint-url`/`--auth-type`/
`--data-type`/`--trigger-type`/`--priority`/`--connection-string`/`--ldap-group`/`--role`)
are now marked `required: yes`, the command synopses drop the `[...]`, and each `update`
section states that an update replaces the whole entity. Added
`UpdateCommandContractTests` (10 tests) pinning the whole-replace contract — every
listed `update` command's core flags must be `Required`, and the genuinely-sparse
`external-system method update` must remain optional. Note: `docs/requirements/Component-CLI.md`
also still shows these flags as optional, but it is outside this module's editable
surface — that doc-side correction is owned by the docs surface.
### CLI-015 — `Component-CLI.md` command surface has drifted again in two places
@@ -626,8 +655,16 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `docs/requirements/Component-CLI.md:75`, `docs/requirements/Component-CLI.md:125-126` (vs. `src/ScadaLink.CLI/Commands/TemplateCommands.cs:404-413`, `src/ScadaLink.CLI/Commands/DataConnectionCommands.cs:41`, `:86`) |
| Status | Resolved |
| Location | `docs/requirements/Component-CLI.md:75`, `docs/requirements/Component-CLI.md:125-126`, `src/ScadaLink.CLI/README.md` (vs. `src/ScadaLink.CLI/Commands/TemplateCommands.cs:404-413`, `src/ScadaLink.CLI/Commands/DataConnectionCommands.cs:41`, `:86`) |
**Re-triage 2026-05-17:** verification found the same two drifts also present in the
in-repo `src/ScadaLink.CLI/README.md` (the authoritative reference): its
`template composition delete` section used the non-existent `--template-id` /
`--instance-name` form, and `data-connection create`/`update` documented only
`--configuration` without the canonical `--primary-config` flag (`--configuration` is in
fact an alias of `--primary-config`). The README is in this module's editable surface;
`docs/requirements/Component-CLI.md` is not.
**Description**
@@ -655,7 +692,15 @@ documented elsewhere.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Both drifts were present in the in-repo
`src/ScadaLink.CLI/README.md` and were corrected there (the README is this module's
authoritative reference): `template composition delete` now documents the real single
`--id <int>` form, and `data-connection create`/`update` now document `--primary-config`
(with the `--configuration` alias noted) alongside `--backup-config` and
`--failover-retry-count`. Added `CommandTreeTests.TemplateCompositionDelete_IsKeyedByIdOnly`
pinning the composition-delete surface (`--id` present; `--template-id`/`--instance-name`
absent). The corresponding edit to `docs/requirements/Component-CLI.md:75,125-126` is
outside this module's editable surface and remains for the docs surface to apply.
### CLI-016 — `WriteAsTable` derives columns from the first array element only
@@ -663,7 +708,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:184-200` |
**Description**
@@ -687,4 +732,12 @@ rows, so no element's data is silently dropped.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Root cause confirmed — `WriteAsTable` derived the
header set from `items[0].EnumerateObject()` only, dropping any property unique to a
later element. The header set is now computed as the union of property names across
*every* object element of the array, in first-seen order (a `HashSet` guards duplicates
while an ordered list preserves column order). Rows already project against the header
list and `OutputFormatter.WriteTable` pads missing cells, so heterogeneous arrays now
render every column. Regression tests added in `TableHeaderUnionTests` (3 tests:
later-element-only column included, first-seen column order preserved,
first-element-extra column still rendered).
+13 -13
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 6 |
| Open findings | 0 |
## Summary
@@ -994,7 +994,7 @@ in the fixture.)
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Shared/SessionExpiry.razor:39-62`; `src/ScadaLink.CentralUI/Auth/CookieAuthenticationStateProvider.cs:29-43` |
**Description**
@@ -1036,7 +1036,7 @@ expired session (see CentralUI-025).
**Resolution**
_Unresolved._
2026-05-17 — `SessionExpiry` no longer polls the frozen `AuthenticationStateProvider`; it polls a new anonymous `GET /auth/ping` minimal-API endpoint (re-validated by the cookie middleware on every HTTP request) via a `fetch()` JS helper and redirects to `/login` on HTTP 401, so the documented 30-minute idle logout actually fires.
### CentralUI-021 — `DebugView` stream callback mutates `Dictionary` off the render thread
@@ -1044,7 +1044,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Deployment/DebugView.razor:404-419,511-519,275-289` |
**Description**
@@ -1075,7 +1075,7 @@ critical section as the upsert.
**Resolution**
_Unresolved._
2026-05-17 — the stream callback now routes through `HandleStreamEvent`, which marshals the `UpsertWithCap` mutation (and the cap-trim loop) onto the renderer's dispatcher via `SafeInvokeAsync`, so every read and write of `_attributeValues`/`_alarmStates` happens on the render thread.
### CentralUI-022 — `Deployments` push handler fires `InvokeAsync` with no disposal guard
@@ -1083,7 +1083,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Deployment/Deployments.razor:221-229,317-322` |
**Description**
@@ -1114,7 +1114,7 @@ rather than the whole table on each event.
**Resolution**
_Unresolved._
2026-05-17 — added a `volatile bool _disposed` set first in `Dispose()`; `OnDeploymentStatusChanged` no-ops when set, and the fire-and-forget dispatch (`DispatchReloadAsync`) swallows the residual `ObjectDisposedException`, mirroring the `DebugView`/`ToastNotification` guards.
### CentralUI-023 — Residual bare `catch {}` blocks swallow JS interop errors
@@ -1122,7 +1122,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Monitoring/ParkedMessages.razor:690-698`; `src/ScadaLink.CentralUI/Components/Shared/DiffDialog.razor:107-116,118-130,104` |
**Description**
@@ -1147,7 +1147,7 @@ call, consistent with the CentralUI-018 fixes in the same module.
**Resolution**
_Unresolved._
2026-05-17 — the bare `catch` blocks in `ParkedMessages.CopyAsync` and `DiffDialog.TryLockBodyAsync`/`TryUnlockBodyAsync`/`OnAfterRenderAsync` now catch `JSDisconnectedException` (and `InvalidOperationException` for prerender focus) silently and log genuine `JSException` failures via injected `ILogger`.
### CentralUI-024 — Claim lookups use magic strings instead of `JwtTokenService` constants
@@ -1155,7 +1155,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Layout/NavMenu.razor:102`; `src/ScadaLink.CentralUI/Components/Pages/Dashboard.razor:14`; `GetCurrentUserAsync` in `Templates.razor`, `TemplateEdit.razor`, `TemplateCreate.razor`, `SharedScripts.razor`, `SharedScriptForm.razor`, `Sites.razor`, `Topology.razor`, `InstanceCreate.razor`, `InstanceConfigure.razor` |
**Description**
@@ -1181,7 +1181,7 @@ or a small scoped service) so the claim lookup lives in exactly one place.
**Resolution**
_Unresolved._
2026-05-17 — added `ClaimsPrincipalExtensions` (`GetUsername`/`GetDisplayName`/`GetCurrentUsernameAsync`) resolving claims through the `JwtTokenService` constants; the ten copy-pasted `GetCurrentUserAsync` helpers and the `NavMenu`/`Dashboard` `DisplayName` lookups now delegate to it, eliminating every magic-string claim literal.
### CentralUI-025 — `SessionExpiry` polling/redirect path has no test coverage
@@ -1189,7 +1189,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Status | Resolved |
| Location | `tests/ScadaLink.CentralUI.Tests/Auth/SessionExpiryPolicyTests.cs`; `src/ScadaLink.CentralUI/Components/Shared/SessionExpiry.razor` |
**Description**
@@ -1215,4 +1215,4 @@ also forces the CentralUI-020 fix.
**Resolution**
_Unresolved._
2026-05-17 — added `SessionExpiryComponentTests` (bUnit): an expired ping (401) redirects to `/login`, a live ping (200) and a transient failure (status 0) do not, and on the `/login` route the component neither pings nor redirects; also added `AuthPingEndpointTests` covering the `/auth/ping` endpoint contract.
+30 -5
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 2 |
| Open findings | 0 |
## Summary
@@ -516,7 +516,7 @@ inspection of `ServiceCollectionExtensions.cs` and
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:74` |
**Description**
@@ -563,7 +563,21 @@ controls nothing.
**Resolution**
_Unresolved._
Root cause verified against the source at commit `39d737e`:
`src/ScadaLink.Host/Actors/AkkaHostedService.cs:147` hard-codes `down-if-alone = on`
inside the `keep-oldest` block, and `BuildHocon` consumes every other `ClusterOptions`
field but never reads `clusterOptions.DownIfAlone`. The finding's facts hold. The fix
is correctly scoped to the **Host** module — the configuration property and its
validation legitimately live in `ScadaLink.ClusterInfrastructure` (this module),
and the per-CLAUDE.md ownership split (CI-001) places HOCON generation in
`ScadaLink.Host`.
**Resolved** — by cross-reference, date 2026-05-17. No code change in this module:
`ClusterOptions.DownIfAlone` and its validation (CI-010) are correct and complete here.
The consumption fix — wiring `DownIfAlone` into `BuildHocon` so the property has runtime
effect — is tracked and fixed as **Host-012** in `code-reviews/Host/findings.md`,
addressed this session. CI-009 closes against that cross-reference. Module test suite
green (18 passed).
### ClusterInfrastructure-010 — Validator does not enforce `DownIfAlone = true` despite the design doc requiring it
@@ -571,7 +585,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptionsValidator.cs:21-71` |
**Description**
@@ -602,4 +616,15 @@ should be closed as resolved alongside it.
**Resolution**
_Unresolved._
Confirmed against the source: `ClusterOptionsValidator.Validate` guarded every other
safety-critical setting but performed no check on `DownIfAlone`, so `DownIfAlone: false`
passed validation cleanly. With CI-009/Host-012 wiring the property into the HOCON this
session, an unguarded `false` would now silently produce the unsafe isolated-single-node
behaviour the design doc forbids.
**Resolved** — fixing commit `commit pending`, date 2026-05-17. Added a `DownIfAlone`
check to `ClusterOptionsValidator.Validate` that fails when the flag is `false`, with a
message explaining the isolated-single-node-cluster hazard, consistent with how the
validator already rejects quorum split-brain strategies. Developed test-first:
`ClusterOptionsValidatorTests.DownIfAloneFalse_FailsValidation` was written first,
confirmed failing, then passing after the fix. Module test suite green (18 passed).
+27 -5
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 2 |
| Open findings | 0 |
## Summary
@@ -587,7 +587,7 @@ each pinned under `de-DE`).
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Commons/Types/DynamicJsonElement.cs:40-54` |
**Description**
@@ -611,7 +611,18 @@ single integer before the bounds check. Add a regression test indexing with a `l
**Resolution**
_Unresolved._
Resolved 2026-05-17 — confirmed the `int`-only guard, and during fixing found the
underlying cause was wider than the finding text: `Wrap`'s `TryGetInt64 ? long : double`
ternary unified both branches to `double`, so integral JSON numbers were never actually
boxed as `long``obj.count` always surfaced as `double`. Fixed at the root: `Wrap`
delegates to a new `WrapNumber` helper that boxes integral numbers as `long` and
non-integral as `double` with separate typed returns; `TryGetIndex` now accepts any
integral index via `TryGetIntegralIndex` (`int`/`long`/`short`/`byte`/`sbyte`/`ushort`/
`uint`/`ulong`, plus whole-valued `double`/`decimal`) normalized to `long` before the
bounds check. Internal-only change — no public-API or message-contract change. Regression
tests added in `DynamicJsonElementTests` (`IndexAccess_WithLongIndex_Works`,
`IndexAccess_WithIndexDerivedFromWrappedJsonNumber_Works`,
`IndexAccess_WithWideningIntegralIndex_Works`, `IndexAccess_WithLongIndexOutOfRange_Throws`).
### Commons-014 — `OpcUaEndpointConfigSerializer.Deserialize` can mislabel a corrupt typed row as `Legacy`
@@ -619,7 +630,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Commons/Serialization/OpcUaEndpointConfigSerializer.cs:107-131` |
**Description**
@@ -651,4 +662,15 @@ field.
**Resolution**
_Unresolved._
Resolved 2026-05-17 — confirmed the swallow-and-fall-through against the source. Split
`Deserialize` into a shape-detection phase and a materialization phase: it first parses
the document to decide whether the row is the current typed shape (root object carrying
`endpointUrl`); only a non-typed-shape row falls through to `LoadLegacy`. A row that *is*
the typed shape but fails `JsonSerializer.Deserialize` (invalid enum, wrong-typed field)
is now classified `Malformed` instead of being mislabelled `Legacy` with the offending
field silently dropped. The `OpcUaConfigParseResult` shape and `(Config, IsLegacy)`
deconstruction are unchanged, so existing callers are unaffected. XML docs updated to
describe the corrupt-typed-row branch. Regression tests added in
`OpcUaEndpointConfigSerializerTests` (`Deserialize_TypedShapeWithInvalidEnum_ReportsMalformedNotLegacy`,
`Deserialize_TypedShapeWithWrongTypeField_ReportsMalformedNotLegacy`,
`Deserialize_ValidTypedRow_StillReportsTyped`).
+51 -9
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 |
| Open findings | 0 |
## Summary
@@ -543,7 +543,7 @@ The full module suite (`dotnet test tests/ScadaLink.Communication.Tests`) is gre
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClientFactory.cs:39`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:166` |
**Description**
@@ -582,7 +582,17 @@ targets the other endpoint.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Root cause confirmed against source:
`GetOrCreate` was `_clients.GetOrAdd(siteIdentifier, …)` — keyed by site identifier
only, so the `grpcEndpoint` argument was honoured solely on first creation and the
NodeA→NodeB flip reconnected to the dead endpoint forever. Fix:
`SiteStreamGrpcClient` now exposes its bound `Endpoint`, and `GetOrCreate` compares
the cached client's endpoint against the requested one — on a mismatch it atomically
installs (via `ConcurrentDictionary.AddOrUpdate`) a fresh client for the new endpoint
and disposes the stale one, so a node flip actually moves to the surviving node.
Regression tests `SiteStreamGrpcClientFactoryTests.GetOrCreate_EndpointChanged_ReturnsClientBoundToNewEndpoint`
and `GetOrCreate_SameEndpoint_DoesNotDisposeOrRecreate` fail against the pre-fix
factory and pass after.
### Communication-013 — Site gRPC address changes are never applied; `RemoveSiteAsync` has no production caller
@@ -590,7 +600,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClientFactory.cs:58` |
**Description**
@@ -619,7 +629,19 @@ the on-the-fly address-change requirement is intentionally dropped, remove
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). The address-*change* staleness — the primary
impact ("never have working debug streaming until the central node is restarted") —
is fixed in-module by the Communication-012 change: `GetOrCreate` is now
endpoint-change-aware, so the next time `DebugStreamBridgeActor` requests a stream
with a corrected `GrpcNodeAAddress`/`GrpcNodeBAddress` the stale cached client is
disposed and replaced — no central restart needed and no external wiring required.
`RemoveSiteAsync` is retained as the disposal path for full site *removal* (a deleted
site record) and its doc comment now states that role explicitly; wiring a
delete-site callback belongs to the site-management flow in another module and is out
of this module's scope. Regression test
`SiteStreamGrpcClientFactoryTests.GetOrCreate_EndpointChanged_DisposesPriorClient`
fails against the pre-fix factory (stale client never disposed/replaced) and passes
after.
### Communication-014 — Untrusted gRPC `correlation_id` flows directly into an Akka actor name
@@ -627,7 +649,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs:124` |
**Description**
@@ -652,7 +674,17 @@ actor state / dictionary key.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Root cause confirmed: `SubscribeInstance` fed
the off-the-wire `request.CorrelationId` straight into the `stream-relay-…` actor
name, so an id with `/`, whitespace, or other disallowed characters made `ActorOf`
throw `InvalidActorNameException` as an unhandled RPC fault. Fix: `SubscribeInstance`
now validates `CorrelationId` on entry — rejecting null/empty or any value failing
`ActorPath.IsValidPathElement` with `StatusCode.InvalidArgument` before any actor or
subscription state is created. Regression test
`SiteStreamGrpcServerTests.RejectsCorrelationIdThatIsNotActorNameSafe` (theory:
`/`-bearing, whitespace, empty, `$`-prefixed ids) fails against the pre-fix server
and passes after; `AcceptsActorNameSafeCorrelationId` confirms a normal GUID is still
accepted.
### Communication-015 — No test exercises the real gRPC client factory across a node flip
@@ -660,7 +692,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Status | Resolved |
| Location | `tests/ScadaLink.Communication.Tests/Grpc/DebugStreamBridgeActorTests.cs:401`, `tests/ScadaLink.Communication.Tests/Grpc/SiteStreamGrpcClientFactoryTests.cs` |
**Description**
@@ -683,4 +715,14 @@ test's mock factory track the endpoint per call so node-flip coverage is meaning
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). `SiteStreamGrpcClientFactoryTests` gained
`GetOrCreate_EndpointChanged_ReturnsClientBoundToNewEndpoint` and
`GetOrCreate_EndpointChanged_DisposesPriorClient`, which drive the *real*
`SiteStreamGrpcClientFactory` across a node flip and assert the second call yields a
client bound to the new endpoint with the stale one disposed — both fail against the
pre-fix factory and pass after (the Communication-012 fix). `DebugStreamBridgeActorTests`
gained `On_GrpcError_Reconnects_To_Other_Node_Endpoint`, which uses a new
`EndpointTrackingGrpcClientFactory` test double that hands out a distinct mock client
per endpoint (instead of one fixed mock regardless of endpoint), so the bridge actor's
NodeA→NodeB reconnect is now verified to actually target the NodeB endpoint rather
than being masked by an endpoint-agnostic mock.
+81 -6
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 |
| Open findings | 1 |
## Summary
@@ -636,7 +636,42 @@ key value redacted.
**Resolution**
_Unresolved._
_Open — requires a cross-module design decision that cannot be made within this
module's editable scope._
Root cause confirmed against source. `ApiKey.KeyValue`
(`src/ScadaLink.Commons/Entities/InboundApi/ApiKey.cs`) is mapped as a plaintext
`nvarchar(500)` column with a unique index (`InboundApiConfiguration.cs:17-22`) and is
the bearer credential for Inbound API requests. The CD-004 `EncryptedStringConverter`
is correctly inapplicable here: it is non-deterministic, and the key is resolved
by value.
The recommended fix — store a deterministic salted/peppered hash instead of the
plaintext — cannot be completed within this module alone, for three concrete reasons:
1. **Commons entity change (out of scope).** Hashing changes the semantics of
`ApiKey.KeyValue`: after creation it can only ever hold the hash, never the live
key. The proper shape is a dedicated `KeyHash` field on the `ApiKey` entity (in
`ScadaLink.Commons`), which this task may not edit.
2. **Cross-module consumer change (out of scope).** `ScadaLink.InboundAPI`'s
`ApiKeyValidator.FindKeyConstantTime` reads `key.KeyValue` and does a constant-time
byte comparison against the *raw presented secret*. If the column becomes a hash,
the validator must hash the presented candidate before comparing — an InboundAPI
change. The CLI (`SecurityCommands.cs`) and Management Service also create/read
`ApiKey` and would need the "plaintext shown once at creation" model.
3. **Irreversible data migration / operational decision.** A hash is one-way, so a
migration cannot convert existing plaintext keys to hashes without the originals —
every existing API key must be re-issued. Whether and how to do that is an
operational/design call, not a mechanical fix.
**Decision needed (for a follow-up spanning Commons + InboundAPI + ManagementService
+ CLI):** (a) add an `ApiKey.KeyHash` column and a per-key salt (or a system pepper);
(b) hash on create and authenticate by hashing the presented key; (c) decide the
hash algorithm (e.g. PBKDF2/Argon2 vs. a fast salted SHA-256 — note the constant-time
lookup in `ApiKeyValidator` already mitigates timing, so a slow KDF is the safer
choice if the key space is small); (d) plan the one-time re-issue of existing keys;
(e) record the scheme in `docs/requirements/Component-ConfigurationDatabase.md` and the
Inbound API design doc. Until that decision is made the finding stays **Open**.
### ConfigurationDatabase-013 — Secret-column encryption silently falls back to an ephemeral (throwaway) key
@@ -644,7 +679,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/ScadaLinkDbContext.cs:107-124` |
**Description**
@@ -685,7 +720,33 @@ through the EF-activator registration at all (e.g. register only the factory, or
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Root cause confirmed against source:
`ApplySecretColumnEncryption` resolved the provider as
`_dataProtectionProvider ?? new EphemeralDataProtectionProvider()`, so a context built
via the single-argument constructor would silently encrypt secret columns with a
throwaway key and persist permanently undecryptable ciphertext with no error.
Applied the recommendation — a combination of options (a) and (b). The silent
ephemeral substitution was removed: a no-provider context now builds its model with an
internal `SchemaOnlyDataProtector` that satisfies schema generation but throws a clear
`InvalidOperationException` if it is ever asked to `Protect`/`Unprotect`. On top of
that, `SaveChanges`/`SaveChangesAsync` were overridden with a `GuardSecretWritesHaveAKeyRing`
check that fails fast — before any database round-trip — with a clear
`InvalidOperationException` (naming the offending `Entity.Property`) whenever a
no-provider context has a pending non-null write to one of the three secret-bearing
columns. A custom `SecretAwareModelCacheKeyFactory` (registered via
`OnConfiguring`/`ReplaceService`) folds the presence of a real provider into EF's model
cache key, so a write-capable and a schema-only context can no longer share one cached
model. Design-time `dotnet ef` tooling (which only emits schema) and the test fixture
both keep working — `SqliteTestDbContext` now passes an explicit
`EphemeralDataProtectionProvider`, making the test intent visible at the call site as
recommendation (a) asks. Regression tests added in `EphemeralEncryptionFallbackTests.cs`:
`SingleArgConstructor_WritingSecretColumn_FailsFast_DoesNotPersistThrowawayCiphertext`,
`SingleArgConstructor_WritingNonSecretColumn_Succeeds`, and
`ProviderConstructor_WritingSecretColumn_StillSucceeds`. The DI-hardening sub-point
(register only the factory) was not adopted — `AddConfigurationDatabase` already
overrides the EF-activator registration with a provider-bearing factory, and the
fail-fast guard now closes the residual gap for any other resolution path.
### ConfigurationDatabase-014 — Redundant, inconsistent cast on one `HasConversion` call
@@ -693,7 +754,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/ScadaLinkDbContext.cs:121-123` |
**Description**
@@ -716,4 +777,18 @@ that type once and use it for all three.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Partially re-triaged: the finding is correct
that the inline fully-qualified cast was inconsistent and noisy, but its premise that
the cast is *purely redundant* is wrong. `DatabaseConnectionDefinition.ConnectionString`
is a non-nullable `string`; `EncryptedStringConverter` is `ValueConverter<string?,
string?>`, so calling `HasConversion(converter)` with `converter` typed as
`EncryptedStringConverter` on that property raises a real `CS8620` nullability
error (verified by build). The cast to the non-generic `ValueConverter` base was
suppressing that — load-bearing, not noise. Applied the recommendation's second
option: the converter is now held once in a single `ValueConverter`-typed local
(with an explanatory comment), and all three `HasConversion(converter)` calls read
identically with no inline cast or fully-qualified name. Behaviour is unchanged, so
no behavioural regression test is meaningful (cf. CD-005); a forward guard was added
in `SchemaConfigurationTests.cs`
`SecretColumns_AllHaveEncryptedStringConverterApplied` (theory over all three secret
columns) — asserting each column keeps an `EncryptedStringConverter`.
+48 -9
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 |
| Open findings | 0 |
## Summary
@@ -688,7 +688,7 @@ guard), and it passes against the atomic fix.
|--|--|
| Severity | High |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:325`, `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:35-39,79-83` |
**Description**
@@ -724,7 +724,18 @@ the default is `false`.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). `RealOpcUaClientFactory` gained an
`ILoggerFactory` constructor parameter and `Create()` now passes
`_loggerFactory.CreateLogger<RealOpcUaClient>()` into every `RealOpcUaClient` it builds;
`DataConnectionFactory` (which already holds an `ILoggerFactory`) now constructs
`RealOpcUaClientFactory(globalOptions, _loggerFactory)`, so the DCL-012 auto-accept-cert
warning reaches a real logger in production instead of being discarded by `NullLogger`.
The parameterless / single-arg factory constructors still default to `NullLoggerFactory`
so existing callers are unaffected. Regression tests
`DCL014_RealOpcUaClientFactory_CreatesClientWithRealLogger` and
`DCL014_DataConnectionFactory_ThreadsLoggerToRealOpcUaClient` fail against the pre-fix
code (the first fails to compile — no logger ctor; the second observes a `NullLogger`)
and pass after.
### DataConnectionLayer-015 — Initial-connect failures never trigger failover; an unreachable primary at startup never tries the backup
@@ -732,7 +743,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:404-417`, `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:419-493` |
**Description**
@@ -769,7 +780,16 @@ backup".
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). The endpoint-switch failover logic was extracted
from `HandleReconnectResult` into a shared `CountFailureAndMaybeFailover` helper, and
`HandleConnectResult` (the initial-connect handler in the `Connecting` state) now
increments `_consecutiveFailures` and calls that helper on failure — so a primary that
is unreachable at startup fails over to the configured backup after
`FailoverRetryCount` attempts instead of retrying the primary forever. Single-endpoint
connections (no backup) still retry indefinitely. Regression test
`DCL015_PrimaryDownAtStartup_FailsOverToBackup` fails against the pre-fix code (the
backup adapter is never created) and passes after; `DCL015_SingleEndpointDownAtStartup_RetriesIndefinitely_NoFailover`
guards the no-backup path.
### DataConnectionLayer-016 — `HandleSubscribeCompleted` reports `SubscribeTagsResponse` success even on a connection-level subscribe failure
@@ -777,7 +797,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:606,666-672`, `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:232-240` |
**Description**
@@ -814,7 +834,16 @@ Instance Actor can reflect partial success. Add a test asserting the response is
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). `HandleSubscribeCompleted` now replies
`SubscribeTagsResponse(success: false, error: "connection unavailable — will
re-subscribe on reconnect")` when `connectionLevelFailure` is true — matching the
actor's own assessment as it drives into `Reconnecting` — instead of unconditionally
replying `success: true`. Genuine tag-resolution failures still reply `success: true`
(they are a runtime quality concern tracked via `_unresolvedTags`, with a `Bad`-quality
`TagValueUpdate` already pushed), so that case is not regressed. Regression test
`DCL016_ConnectionLevelSubscribeFailure_RepliesWithUnsuccessfulResponse` fails against
the pre-fix code (it returned `Success: true`) and passes after;
`DCL016_GenuineResolutionFailure_StillRepliesSuccess` guards the resolution-failure path.
### DataConnectionLayer-017 — `WriteBatchAsync` aborts the whole batch on a mid-batch disconnect
@@ -822,7 +851,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:229-237`, `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:218-227` |
**Description**
@@ -856,4 +885,14 @@ complete, consistent result map.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). `WriteBatchAsync` now wraps each per-tag
`WriteAsync` call in a try/catch — mirroring the DCL-007 fix for `ReadBatchAsync` — so a
mid-batch fault (the `InvalidOperationException` from `EnsureConnected` on a dropped
connection) is recorded as a failed `WriteResult(false, ex.Message)` for that tag and
the batch returns a complete result map; `OperationCanceledException` is still
propagated so a cancelled batch aborts as a whole. Callers (including
`WriteBatchAndWaitAsync`) now get a consistent per-tag result shape instead of an
unhandled exception. Regression test
`DCL017_WriteBatch_ReturnsPerTagResults_WhenConnectionDropsMidBatch` fails against the
pre-fix code (the batch throws, no map returned) and passes after;
`DCL017_WriteBatch_CancellationAbortsWholeBatch` guards that cancellation still aborts.
+24 -7
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 |
| Open findings | 0 |
## Summary
@@ -740,7 +740,7 @@ covers DeploymentManager-010), `DeployToAllSitesAsync_PartialFailure_ReportsPerS
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:631-655` |
**Description**
@@ -785,7 +785,15 @@ reconciled deployment leaves the instance `Enabled` and a snapshot stored.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending): extracted the shared post-success side
effects into `ApplyPostSuccessSideEffectsAsync` (sets instance `State =
Enabled` + `UpdateInstanceAsync`, stores/refreshes the `DeployedConfigSnapshot`)
and invoked it from both the normal deploy success path and the
`TryReconcileWithSiteAsync` reconciled-success branch, so a reconciled
deployment now performs the same instance-state and snapshot updates as a
normal one (`configJson` is now computed before the reconciliation call and
threaded into `TryReconcileWithSiteAsync`). Regression test:
`DeployInstanceAsync_Reconciled_SetsInstanceEnabledAndStoresSnapshot`.
### DeploymentManager-016 — Reconciled prior record keeps its stale `RevisionHash`
@@ -793,7 +801,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:639-651` |
**Description**
@@ -824,7 +832,11 @@ normal deploy.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending): the reconciled-success branch of
`TryReconcileWithSiteAsync` now also sets `prior.RevisionHash =
targetRevisionHash`, so the persisted record, the `DeployReconciled` audit
entry, and the site's actually-applied revision all agree. Regression test:
`DeployInstanceAsync_Reconciled_PriorRecordRevisionHashUpdatedToTarget`.
### DeploymentManager-017 — `GetDeploymentStatusAsync` XML doc describes behaviour it does not implement
@@ -832,7 +844,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:562-570` |
**Description**
@@ -855,4 +867,9 @@ configuration database" — and, if useful, cross-reference
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending): the `GetDeploymentStatusAsync` XML doc
now states it returns the persisted `DeploymentRecord` from the configuration
database as a pure local read, and cross-references `TryReconcileWithSiteAsync`
as where the query-the-site-before-redeploy reconciliation actually lives.
Documentation-only change; no regression test (a test asserting comment text
would be meaningless).
+36 -7
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 |
| Open findings | 0 |
## Summary
@@ -788,7 +788,7 @@ against regression.
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:120-127`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:102-108` |
**Description**
@@ -842,7 +842,18 @@ outcome (parked / not retried), not just the stored column value.
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed: the S&F engine treats a stored
`MaxRetries == 0` as "no limit / retry forever" (`StoreAndForwardMessage.MaxRetries`
doc "0 = no limit"; sweep guard `MaxRetries > 0 && RetryCount >= MaxRetries`), while
the entity's non-nullable `int MaxRetries` defaults to `0` — so passing it verbatim
buffered every cached call/write as an unbounded retry loop. Fix (ESG-side only,
recommendation (a)): `CachedCallAsync` and `CachedWriteAsync` now pass
`MaxRetries > 0 ? MaxRetries : null`, so an entity `0` is treated as "unset" and the
bounded S&F `DefaultMaxRetries` applies; the misleading "0 = never retry" inline
comments were corrected. The two `ZeroMaxRetries...` tests were rewritten to
`CachedCall_TransientFailure_ZeroMaxRetriesIsTreatedAsUnsetNotRetryForever` /
`CachedWrite_ZeroMaxRetriesIsTreatedAsUnsetNotRetryForever`, asserting the buffered
message carries the bounded default (99) and never `0`.
### ExternalSystemGateway-016 — `ConfigureHttpClientDefaults` applies the ESG connection cap to every `HttpClient` in the host process
@@ -850,7 +861,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:21-29` |
**Description**
@@ -891,7 +902,18 @@ preferred fix is to stop using `ConfigureHttpClientDefaults`.
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed: `ConfigureHttpClientDefaults` is
process-global and replaced the primary handler of every `IHttpClientFactory` client
in the host, leaking the ESG connection cap onto unrelated clients. Fix: the global
`ConfigureHttpClientDefaults` registration was replaced with an
`IConfigureNamedOptions<HttpClientFactoryOptions>` (`GatewayHttpClientConfigurator`)
that applies the `SocketsHttpHandler`/`MaxConnectionsPerServer` cap only to clients
whose name starts with `ExternalSystem_` (the gateway's own per-system clients), so
clients owned by other components keep their own (or the framework default) primary
handler. Regression test
`ServiceWiringTests.MaxConcurrentConnectionsPerSystem_IsNotAppliedToNonGatewayHttpClients`
asserts a non-gateway client does not inherit the cap while the gateway client still
does; it was verified to fail before the fix.
### ExternalSystemGateway-017 — `BuildUrl` appends a bare trailing `?` when a GET method's parameters are all null
@@ -899,7 +921,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:324-333` |
**Description**
@@ -921,4 +943,11 @@ produces a clean URL identical to the no-parameters case.
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed: `BuildUrl` appended `"?" + queryString`
whenever the GET/DELETE parameter dictionary was non-empty, even when every value
was null and `queryString` was the empty string, yielding a bare trailing `?`. Fix:
`BuildUrl` now appends `"?" + queryString` only when `queryString.Length > 0`, so a
method whose effective parameter set is empty produces a URL identical to the
no-parameters case. Regression test
`Call_GetWithAllNullParameters_DoesNotAppendTrailingQuestionMark` asserts the
captured request URI has no trailing `?`; it was verified to fail before the fix.
+60 -8
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 |
| Open findings | 1 |
## Summary
@@ -585,7 +585,7 @@ the contract is now honest and no further code change was required.
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:194-196` |
**Description**
@@ -617,7 +617,16 @@ that the cadence is derived from `OfflineTimeout` only (acceptable while the def
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed against source — `ExecuteAsync` derived the
`PeriodicTimer` cadence solely from `OfflineTimeout` while the comment claimed it
tracked the "(shorter)" timeout. Took the code-matches-comment option: extracted the
cadence into `CentralHealthAggregator.ComputeCheckInterval`, which now derives it from
half of the *shorter* of `OfflineTimeout` and `CentralOfflineTimeout`, so a
`CentralOfflineTimeout` configured below `OfflineTimeout` is still polled at least
twice within its window. The comment was rewritten to match. Regression test
`CentralHealthAggregatorTests.CheckInterval_IsHalfTheShorterTimeout` asserts the
default case (30s) and the shorter-`CentralOfflineTimeout` case (10s) — the latter
would have returned 30s against the pre-fix code.
### HealthMonitoring-014 — `HealthMonitoringOptions` intervals are unvalidated; a zero/negative value crashes the hosted service
@@ -625,7 +634,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/HealthMonitoringOptions.cs:3-20`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:196`, `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:67`, `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:63` |
**Description**
@@ -653,7 +662,21 @@ configuration fails fast with a message naming the section and key.
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed — `HealthMonitoringOptions` had no
validator, so a zero/negative interval reached `new PeriodicTimer(...)` and crashed
the hosted service with an opaque `ArgumentOutOfRangeException`. Added
`HealthMonitoringOptionsValidator : IValidateOptions<HealthMonitoringOptions>` that
rejects non-positive `ReportInterval`/`OfflineTimeout`/`CentralOfflineTimeout` and a
`CentralOfflineTimeout` shorter than `OfflineTimeout`, each failure naming the
`ScadaLink:HealthMonitoring` config key. It is registered (idempotently, via
`TryAddEnumerable`) by all three `ServiceCollectionExtensions` registration methods,
so it fires when the hosted services resolve `IOptions.Value` at startup — failing
fast with a clear message. (`ValidateOnStart()` lives in the Host module's binding
call, which is out of scope; the validator nonetheless runs at startup because the
hosted-service constructors resolve the options eagerly — matching the existing
`ClusterOptionsValidator` registration pattern.) Regression tests in
`HealthMonitoringOptionsValidatorTests` cover the valid default plus zero/negative
intervals and the `CentralOfflineTimeout < OfflineTimeout` case.
### HealthMonitoring-015 — Heartbeat-registered site is left with a year-0001 `LastReportReceivedAt`
@@ -689,7 +712,25 @@ nullable option is safer and matches the existing `LatestReport` treatment.
**Resolution**
_Unresolved._
_Unresolved — left Open; requires a coordinated cross-module change._ Root cause
confirmed against source: `CentralHealthAggregator.MarkHeartbeat` registers an
unknown site with `LastReportReceivedAt = default` (`0001-01-01`), and the audit of
the single UI reader (`src/ScadaLink.CentralUI/Components/Pages/Monitoring/Health.razor:74`)
confirms the bug is live — it passes `state.LastReportReceivedAt` straight into the
`TimestampDisplay` component, which would render the year-0001 sentinel verbatim for
a heartbeat-only site. The finding's recommended fix — change the field to
`DateTimeOffset?` — is the correct one, but it is a **breaking public-API change**:
`TimestampDisplay.Value` is a non-nullable `DateTimeOffset` parameter, so making
`LastReportReceivedAt` nullable stops `Health.razor` compiling, and the
`CentralUI` module is explicitly outside this task's edit scope. The only fix that
stays module-local would be to stamp the heartbeat time into `LastReportReceivedAt`,
which is semantically dishonest (the field documents "time the latest full report
was processed") and would simply move the bug rather than fix it. The correct
resolution therefore needs the HealthMonitoring change (`DateTimeOffset?`) **and** a
matching `CentralUI` change (a null-checked `TimestampDisplay` or an "awaiting first
report" placeholder) applied together — a design decision spanning two modules.
**Called out for follow-up: open as a coordinated HealthMonitoring + CentralUI work
item.**
### HealthMonitoring-016 — `SiteHealthCollector.CollectReport` reads `DateTimeOffset.UtcNow` directly instead of an injected `TimeProvider`
@@ -697,7 +738,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs:151` |
**Description**
@@ -722,4 +763,15 @@ testable and the module is consistent.
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed — `CollectReport` stamped `ReportTimestamp`
from `DateTimeOffset.UtcNow` directly while every other time-dependent class in the
module takes an injectable `TimeProvider`. Added an optional `TimeProvider`
constructor parameter to `SiteHealthCollector` (defaulting to `TimeProvider.System`,
mirroring `CentralHealthAggregator`/`HealthReportSender`/`CentralHealthReportLoop`)
and `CollectReport` now derives `ReportTimestamp` from `_timeProvider.GetUtcNow()`.
The DI registration (`AddSingleton<ISiteHealthCollector, SiteHealthCollector>`)
continues to work via the optional parameter. Regression test
`SiteHealthCollectorTests.CollectReport_StampsTimestamp_FromInjectedTimeProvider`
asserts the timestamp equals a fixed injected instant exactly (not just a
before/after window); it would not compile against the pre-fix single-arg-less
constructor.
+58 -9
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 |
| Open findings | 0 |
## Summary
@@ -585,7 +585,7 @@ key). Full Host suite green (175 passed).
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:146-148` |
**Description**
@@ -618,7 +618,15 @@ declared-but-dead.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`). Root cause confirmed against
`AkkaHostedService.cs`: `BuildHocon` emitted `keep-oldest { down-if-alone = on }` as
a literal, and `ClusterOptions.DownIfAlone` was referenced nowhere outside its own
declaration, so the bound option was dead. Fix: `BuildHocon` now emits
`down-if-alone = {(clusterOptions.DownIfAlone ? "on" : "off")}`, consuming the bound
property. Regression tests in `HoconBuilderTests`: `BuildHocon_DownIfAloneTrue_EmitsOn`
and `BuildHocon_DownIfAloneFalse_EmitsOff` parse the resulting HOCON and assert the
`keep-oldest.down-if-alone` token tracks the property; the `false` case failed before
the fix (token was still `on`) and passes after. Full Host suite green (182 passed).
### Host-013 — `:F0` rounding of cluster timing values silently degrades sub-second configuration
@@ -626,7 +634,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:135-136,145,151-152` |
**Description**
@@ -653,7 +661,17 @@ number of seconds and fail fast otherwise. Either way, do not silently round.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`). Root cause confirmed: `BuildHocon`
rendered every duration via `TotalSeconds:F0`, rounding sub-second values — a 750ms
heartbeat became `1s` and a 500ms value would collapse to a degenerate `0s`. Fix:
durations are now rendered by a new `DurationHocon` helper that emits whole
milliseconds with an `ms` suffix (Akka's HOCON parser accepts it), so sub-second
configuration survives exactly; `BuildHocon` takes the transport heartbeat/failure
as `TimeSpan` rather than pre-rounded `double` seconds. Regression test
`HoconBuilderTests.BuildHocon_SubSecondTimings_PreservedWithoutRounding` configures
750ms/2600ms/500ms/2.5s/7.5s timings and asserts each round-trips through the parsed
HOCON unchanged; it failed before the fix (750ms parsed as `00:00:01`) and passes
after. Full Host suite green (182 passed).
### Host-014 — Serilog sinks are hard-coded in `Program.cs`, not configuration-driven (REQ-HOST-8)
@@ -661,7 +679,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Host/Program.cs:43-48`, `src/ScadaLink.Host/appsettings.json:1-7` |
**Description**
@@ -690,7 +708,22 @@ state the sinks are intentionally code-defined — but the design doc currently
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`). Root cause confirmed: `LoggerConfigurationFactory.Build`
called `ReadFrom.Configuration` but neither `appsettings.json` nor either
role-specific file contained a `Serilog` section, so the two sinks that actually ran
were hard-coded `.WriteTo.Console(...)` / `.WriteTo.File(...)` calls in `Program.cs`
— uneditable without recompiling, contrary to REQ-HOST-8. Fix: added a `Serilog`
section to `appsettings.json` with `WriteTo` entries for the Console and File sinks
(carrying the output template, file path `logs/scadalink-.log` and `Day` rolling
interval as args) and removed the hard-coded `.WriteTo` calls from `Program.cs`; the
factory's `ReadFrom.Configuration` (backed by the transitive
`Serilog.Settings.Configuration` package) now drives the sinks. Regression tests in
new `SerilogSinkConfigTests`: `ShippedAppSettings_HasSerilogSection_WithConsoleAndFileSinks`
asserts the shipped `appsettings.json` declares both sinks in a `Serilog:WriteTo`
array (fails against the pre-fix file, which had no `Serilog` section), and
`LoggerConfigurationFactory_AppliesConfiguredFileSink` proves a configuration-supplied
File sink reaches the built logger and writes to the configured path. Full Host suite
green (182 passed).
### Host-015 — `StartupRetry` retries on every exception type, including permanent failures
@@ -698,7 +731,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Host/StartupRetry.cs:36-45` |
**Description**
@@ -727,4 +760,20 @@ single attempt.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`). Root cause confirmed: `ExecuteWithRetryAsync`
caught `Exception` with only the `when (attempt < maxAttempts)` guard, so it retried
any exception — including a permanent schema-version mismatch raised by the
migration call site, delaying the inevitable fatal exit by ~2 minutes. Fix: added an
optional `Func<Exception, bool> isTransient` parameter (default treats all
exceptions as transient, preserving prior behaviour for unclassified callers); the
catch guard is now `when (attempt < maxAttempts && isTransient(ex))`, so a
non-transient fault propagates after a single attempt. Added a
`StartupRetry.IsTransientDatabaseFault` classifier that returns `true` only for
connection-class faults (`TimeoutException`, `SocketException`, `SqlException` /
`DbException` by type-name match) and `false` for everything else — notably
schema-validation `InvalidOperationException`s — and the `Program.cs` migration call
site now passes it. Regression tests in `StartupRetryTests`:
`ExecuteWithRetry_NonTransientFailure_RethrowsAfterSingleAttempt` (runs exactly once
when `isTransient` returns false) and `ExecuteWithRetry_TransientThenPermanent_StopsAtPermanent`
(retries a `TimeoutException` then stops at a permanent `InvalidOperationException`).
Full Host suite green (182 passed).
+76 -9
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 |
| Open findings | 0 |
## Summary
@@ -606,7 +606,7 @@ from "key not approved"), but that doc edit is outside this module's editable sc
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:201-205`, `src/ScadaLink.Commons/Entities/InboundApi/ApiMethod.cs:10` |
**Description**
@@ -639,7 +639,24 @@ value serialized as-is. Code and design doc must be reconciled.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`): root cause confirmed — `ApiMethod` carries
`ReturnDefinition` but the executor did a blind `JsonSerializer.Serialize(result)`,
so a script returning the wrong shape silently emitted a malformed 200. Took
option (a): added `ReturnValueValidator`, the response-side mirror of
`ParameterValidator`. It parses `ReturnDefinition` (a JSON array of `{name,type}`
field definitions, same extended-type set as parameters), validates the serialized
script result against it — declared fields must be present with a compatible JSON
type, primitives type-checked, `Object`/`List` shape-checked — and a `null`/non-object
result is rejected when a structure is declared. `InboundScriptExecutor.ExecuteAsync`
now runs the validator after serialization and, on mismatch, logs and returns a
script failure (`"Method return value did not match its return definition"`, → 500)
instead of a malformed 200. A method with no `ReturnDefinition` stays unconstrained
(backward compatible). Doc-owner follow-up (outside this module's editable scope):
the `Component-InboundAPI.md` "Response Format" section may note that return shaping
is validation-only (no coercion). Regression tests: `ReturnValueValidatorTests`
(12 cases) plus executor-level `ReturnValue_MatchingReturnDefinition_Succeeds`,
`ReturnValue_NotMatchingReturnDefinition_ReturnsFailureNotMalformed200`, and
`ReturnValue_NoReturnDefinition_IsUnconstrained`.
### InboundAPI-015 — `ForbiddenApiChecker` is purely textual and is bypassable via reflection reachable without a forbidden namespace token
@@ -647,7 +664,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/ForbiddenApiChecker.cs:63-119`, `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:109-126` |
**Description**
@@ -691,7 +708,29 @@ that the current check is best-effort and does not stop a determined script.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`): the specific reflection-via-permitted-member
vector was confirmed and the textual checker materially hardened against it (full
sandboxing remains a separate, larger design effort — see below). `ForbiddenApiWalker`
now, in addition to the namespace deny-list, rejects a curated set of reflection-gateway
**member names** (`GetType`, `GetTypeInfo`, `Assembly`, `Module`, `CreateInstance`,
`InvokeMember`, `GetMethod(s)`, `GetConstructor(s)`, `GetField(s)`, `GetProperty(ies)`,
`GetMember(s)`, `GetRuntimeMethod(s)`, `MethodHandle`, `TypeHandle`) regardless of the
receiver expression — so `typeof(string).Assembly.GetType("System.IO.File")` is now
caught because `.Assembly` and `.GetType` appear as accessed member names. It also
rejects a bare `Activator` identifier and the `dynamic` keyword (which widens
late-bound access the static walker cannot see through). `Invoke` is deliberately
**not** flagged so legitimate `Action`/`Func` delegate invocation still compiles —
the reflection `MethodInfo.Invoke` path is cut off by rejecting the `GetMethod` that
produces the `MethodInfo`. **Documented limitation:** this is hardened defence-in-depth,
not a true sandbox — a determined author may still find a vector the syntax walker
cannot see (e.g. via `Microsoft.CSharp.RuntimeBinder` internals or generics tricks).
Genuine containment needs a runtime boundary (restricted `AssemblyLoadContext` /
curated reference set that does not expose reflection-to-arbitrary-type / out-of-process
sandbox); that is tracked as a future design change and noted in the `ForbiddenApiChecker`
XML summary. Regression tests: new `ForbiddenApiCheckerTests` suite (19 cases) covering
the `Assembly`/`GetType`/`Type.GetType`/`Activator.CreateInstance`/`InvokeMember`/
`GetMethod`/`GetTypeInfo`/`dynamic` bypass vectors plus permitted-script and
namespace-deny-list regression guards.
### InboundAPI-016 — Routed `Route.To().Call()` invocations are not bound by the method timeout
@@ -699,7 +738,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/RouteHelper.cs:59-152`, `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:177`, `:199` |
**Description**
@@ -738,7 +777,24 @@ and abandon the in-flight request when it fires.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`): root cause confirmed — every `RouteTarget`
method took `CancellationToken cancellationToken = default`, so a natural script
`Route.To("inst").Call("doWork", p)` routed with `CancellationToken.None` and was not
bound by the method timeout at all. `RouteHelper` now carries the executing method's
deadline token: `InboundScriptExecutor.ExecuteAsync` calls the new
`RouteHelper.WithDeadline(cts.Token)` when it builds the script context, so the route
helper handed to the script is bound to the method-level timeout CTS. Each
`RouteTarget` method resolves an *effective* token — the explicitly-supplied token if
the caller passed one (tighter bound preserved), otherwise the method deadline — and
forwards it into both `IInstanceLocator` site resolution and the routed call. The
deadline token therefore flows through to `CommunicationService.RouteTo*Async`, so
an in-flight routed call observes cancellation when the method timeout fires instead
of running orphaned. Regression tests (in the new `RouteHelperTests`):
`Call_WithNoExplicitToken_InheritsMethodDeadlineToken`,
`Call_WhenMethodDeadlineCancelled_RoutedCallObservesCancellation`,
`Call_ExplicitToken_OverridesDeadlineToken`,
`GetAttributes_WithNoExplicitToken_InheritsMethodDeadlineToken`,
`SetAttributes_WithNoExplicitToken_InheritsMethodDeadlineToken`.
### InboundAPI-017 — `RouteHelper` / `RouteTarget` has no test coverage
@@ -746,7 +802,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/RouteHelper.cs:1-165`, `tests/ScadaLink.InboundAPI.Tests/` |
**Description**
@@ -776,4 +832,15 @@ wiring is added.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`): confirmed — `ScadaLink.InboundAPI.Tests` had
no file exercising `RouteHelper`/`RouteTarget`. To make the surface testable without a
live actor system, an `IInstanceRouter` seam was introduced in the module (the routing
transport `RouteHelper` depends on); the production `CommunicationServiceInstanceRouter`
delegates to `CommunicationService` and is registered by `AddInboundAPI`. `RouteHelper`
now depends on `IInstanceLocator` + `IInstanceRouter` (both substitutable). Added the
`RouteHelperTests` suite (15 cases) covering: the happy path of `Call`/`GetAttribute(s)`/
`SetAttribute(s)`, correlation-ID generation, the unresolved-instance
`InvalidOperationException` path, the `!Success``InvalidOperationException` mapping
for each routed method, `GetAttribute` delegating to the batch `GetAttributes` and
returning `null` for an absent key, `SetAttribute` delegating to `SetAttributes`, and
the InboundAPI-016 deadline-token inheritance behaviour. All 15 pass.
+47 -9
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 (1 Deferred — see ManagementService-012) |
| Open findings | 0 (1 Deferred — see ManagementService-012) |
## Summary
@@ -576,7 +576,7 @@ pre-existing site-scope and DebugStreamHub suites. `dotnet test` -> 48 passed.
|--|--|
| Severity | High |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:306`, `:1174` |
**Description**
@@ -609,7 +609,20 @@ instances.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). **Re-triage:** the finding understated the gap — it
claimed `QueryDeploymentsCommand` was already "gated to the `Deployment` role by
`GetRequiredRole` (lines 170177)". Verified against the source: `QueryDeploymentsCommand`
appeared nowhere in `GetRequiredRole`, so it required *no* role at all — any authenticated
user could read every deployment record system-wide. Fix applied both gates: added
`QueryDeploymentsCommand` to the `Deployment`-role group in `GetRequiredRole`, and threaded
`AuthenticatedUser` into `HandleQueryDeployments` — the `InstanceId` branch now calls
`EnforceSiteScopeForInstance`; the unfiltered branch resolves each record's instance to its
`SiteId` (cached) and drops records outside `PermittedSiteIds`, mirroring `HandleListInstances`.
Regression tests: `QueryDeployments_WithDesignRole_ReturnsUnauthorized`,
`QueryDeployments_FilteredByOutOfScopeInstance_ReturnsUnauthorized`,
`QueryDeployments_FilteredByInScopeInstance_ReturnsRecords`,
`QueryDeployments_UnfilteredForSiteScopedUser_DropsOutOfScopeRecords`,
`QueryDeployments_UnfilteredForAdminUser_ReturnsAllRecords`.
### ManagementService-015 — HandleSetInstanceOverrides applies overrides non-atomically
@@ -617,7 +630,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:647``:659` |
**Description**
@@ -644,7 +657,16 @@ can reconcile.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Confirmed: each `SetAttributeOverrideAsync` call
commits independently, so a mid-batch failure left earlier overrides persisted.
`HandleSetInstanceOverrides` now validates every requested attribute up front against the
instance's template (exists, not locked) and only begins applying once the whole batch is
known valid — eliminating the realistic partial-mutation failure modes (unknown / locked
attribute). `InstanceService` is outside this module's edit scope and offers no batch/
transactional method, so a genuine database fault mid-apply remains theoretically possible;
this residual is documented in a code comment on the handler. Regression tests:
`SetInstanceOverrides_WithOneInvalidAttribute_PersistsNoOverrides`,
`SetInstanceOverrides_AllValid_PersistsAllOverrides`.
### ManagementService-016 — Unexpected exception messages returned verbatim to HTTP callers
@@ -652,7 +674,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:121``:131` |
**Description**
@@ -679,7 +701,17 @@ can still correlate to the server log via the correlation ID.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Added a `ManagementCommandException` type for curated,
caller-safe failures and converted every curated `throw` in `ManagementActor` (the 34
`result.Error` rethrows and 15 "not found" / delete-blocked messages) to it. `MapFault` now
returns the message verbatim only for `ManagementCommandException`; any other fault (DB
errors, `Enum.Parse` `ArgumentException`, `NullReferenceException`, the unknown-command
`NotSupportedException`, etc.) yields a generic `"An internal error occurred
(CorrelationId=...)"` string while the full exception is still logged server-side. Regression
tests: `UnexpectedFault_ReturnsGenericMessage_NotRawExceptionText`,
`CuratedHandlerFailure_SurfacesTheCuratedMessage`; the two pre-existing
`*_WhenRepoThrows_*` tests were updated to assert the raw repository message is no longer
leaked.
### ManagementService-017 — QueryDeploymentsCommand has no test coverage
@@ -687,7 +719,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Status | Resolved |
| Location | `tests/ScadaLink.ManagementService.Tests/ManagementActorTests.cs:1` |
**Description**
@@ -709,4 +741,10 @@ Deployment user against in-scope and out-of-scope deployment records.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit pending). Added seven `QueryDeployments_*` tests to
`ManagementActorTests`: role gate (`_WithDesignRole_ReturnsUnauthorized`), the
`InstanceId`-filtered and unfiltered branches (`_FilteredByInstanceId_ReturnsInstanceRecords`,
`_UnfilteredWithDeploymentRole_ReturnsAllRecords`), and site-scope coverage for a site-scoped
Deployment user and an Admin user, in- and out-of-scope
(`_FilteredByOutOfScopeInstance_ReturnsUnauthorized`, `_FilteredByInScopeInstance_ReturnsRecords`,
`_UnfilteredForSiteScopedUser_DropsOutOfScopeRecords`, `_UnfilteredForAdminUser_ReturnsAllRecords`).
+11 -11
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 5 |
| Open findings | 0 |
## Summary
@@ -497,7 +497,7 @@ Module test suite is green at 56 tests.
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:214-228`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:308-312`, `src/ScadaLink.NotificationService/OAuth2TokenService.cs:56-84` |
**Description**
@@ -510,7 +510,7 @@ Add a catch-all to `DeliverBufferedAsync` for exceptions that `ClassifySmtpError
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed against source — `DeliverBufferedAsync` caught only `SmtpPermanentException`, so an OAuth2 token-fetch `HttpRequestException`/`InvalidOperationException` escaped the handler and the S&F engine reinterpreted any throw as transient. Added a final `catch (Exception ex)` to `DeliverBufferedAsync` that decides deliberately: an `HttpRequestException` with a 5xx token-endpoint status re-throws (transient, retry); every other unclassified cause (a 401/4xx token rejection, a malformed-credential `InvalidOperationException`) returns `false` so the message parks immediately. Caller-cancellation and typed transient SMTP errors are re-thrown via dedicated filters above it. Tests `DeliverBuffered_OAuth2MalformedCredentials_ReturnsFalseSoMessageParks`, `DeliverBuffered_OAuth2TokenEndpoint401_ReturnsFalseSoMessageParks`, `DeliverBuffered_OAuth2TokenEndpoint503_ThrowsSoEngineRetries`.
### NotificationService-015 — Unclassified exceptions (OAuth2 token fetch, non-cancellation OCE) escape `SendAsync` to the calling script
@@ -518,7 +518,7 @@ _Unresolved._
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:96-148`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:308-312` |
**Description**
@@ -531,7 +531,7 @@ Add a final `catch (Exception ex)` to `SendAsync` that converts any otherwise-un
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed — `SendAsync` had only three catch clauses and an `Unknown`-classified exception (OAuth2 `HttpRequestException`/`InvalidOperationException`) fell through all of them and escaped to the calling script. Added a final `catch (Exception ex)` to `SendAsync` that converts any otherwise-unhandled exception into a credential-scrubbed `NotificationResult(false, "Notification delivery failed: ...")` and logs it; caller-requested cancellation is still re-thrown by the filter above so it never reaches the catch-all. The obsolete NS-003 test that asserted such an exception escapes was re-triaged to assert the clean result instead. Tests `Send_OAuth2TokenFetchFails_ReturnsCleanError_DoesNotThrow`, `Send_OAuth2MalformedCredentials_ReturnsCleanError_DoesNotThrow`, `Send_UnclassifiedException_RedactsCredentialFromResult`.
### NotificationService-016 — `AuthenticateAsync` silently sends unauthenticated for an unknown auth type or empty credentials
@@ -539,7 +539,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:46-67` |
**Description**
@@ -552,7 +552,7 @@ Make missing/unparseable credentials and an unrecognised `AuthType` hard errors:
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed — `AuthenticateAsync` returned silently for null/empty credentials, had no `default:` arm, and skipped a "basic" credential that did not split into two parts, so the connection sent mail unauthenticated. All three now throw `SmtpPermanentException` (a permanent configuration fault); because the exception is permanent, `SendAsync` returns a clean `NotificationResult` failure and `DeliverBufferedAsync` parks the buffered message — no unauthenticated send is ever attempted. Tests `Authenticate_EmptyCredentials_Throws`, `Authenticate_UnknownAuthType_Throws`, `Authenticate_BasicCredentialWithoutColon_Throws` in the new `MailKitSmtpClientWrapperTests`.
### NotificationService-017 — `NotificationOptions` is bound from configuration but never read (dead config)
@@ -560,7 +560,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationOptions.cs:1-15`, `src/ScadaLink.NotificationService/ServiceCollectionExtensions.cs:10-11`, `src/ScadaLink.Host/SiteServiceRegistration.cs:70` |
**Description**
@@ -573,7 +573,7 @@ Either delete `NotificationOptions` and both of its registrations, or genuinely
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed — `NotificationOptions` was bound but never read. Implemented the documented-fallback intent rather than deleting it: `NotificationDeliveryService` now takes an optional `IOptions<NotificationOptions>` and uses its `ConnectionTimeoutSeconds`/`MaxConcurrentConnections` whenever the deployed `SmtpConfiguration` field is non-positive (a value on the row still wins). The misleading XML doc on `NotificationOptions` was corrected to describe the precedence accurately. The duplicate `services.Configure<NotificationOptions>` in `Host/SiteServiceRegistration.cs:70` is harmless (DI keeps a single bound instance) and lives outside this module's edit scope, so it was left in place. Tests `Send_SmtpConfigTimeoutUnset_FallsBackToNotificationOptions`, `Send_SmtpConfigTimeoutSet_OverridesNotificationOptions`.
### NotificationService-018 — Concurrency limiter: lock-free read of a non-volatile field, never resized on redeployment, never disposed
@@ -581,7 +581,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:237-255` |
**Description**
@@ -594,4 +594,4 @@ Replace the hand-rolled double-checked init with `Lazy<SemaphoreSlim>` or `LazyI
**Resolution**
_Unresolved._
Resolved 2026-05-17. All three issues confirmed against source. The hand-rolled double-checked init was replaced with a `Lazy<SemaphoreSlim>` — its publication is correctly synchronised, eliminating the lock-free read of a non-`volatile` reference. `NotificationDeliveryService` now implements `IDisposable` and disposes the limiter (if created) under the existing lock, with idempotent re-entry and an `ObjectDisposedException` guard in `SendAsync`/`GetConcurrencyLimiter`; the scoped DI registration disposes it per scope. The limiter remains scoped (not hoisted to a site singleton) — the design doc deploys one SMTP config per site and the per-instance capture is bounded; the redeploy-resize concern is acknowledged as low-impact and not changed here, since hoisting would require a registration change for marginal benefit. Tests `Service_Dispose_DisposesConcurrencyLimiter` plus the existing `Send_MaxConcurrentConnections_LimitsConcurrentDeliveries`.
+30 -96
View File
@@ -40,34 +40,34 @@ module file and counted in **Total**.
| Severity | Open findings |
|----------|---------------|
| Critical | 0 |
| High | 8 |
| Medium | 26 |
| Low | 32 |
| **Total** | **66** |
| High | 0 |
| Medium | 2 |
| Low | 0 |
| **Total** | **2** |
## Module Status
| Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total |
|--------|---------------|--------|----------------|------|-------|
| [CLI](CLI/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/2 | 3 | 16 |
| [CentralUI](CentralUI/findings.md) | 2026-05-16 | `9c60592` | 0/1/2/3 | 6 | 25 |
| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/1 | 2 | 10 |
| [Commons](Commons/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/2 | 2 | 14 |
| [Communication](Communication/findings.md) | 2026-05-16 | `9c60592` | 0/1/1/2 | 4 | 15 |
| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-16 | `9c60592` | 0/0/2/1 | 3 | 14 |
| [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-16 | `9c60592` | 0/1/2/1 | 4 | 17 |
| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-16 | `9c60592` | 0/1/1/1 | 3 | 17 |
| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-16 | `9c60592` | 0/1/1/1 | 3 | 17 |
| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/3 | 4 | 16 |
| [Host](Host/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/3 | 4 | 15 |
| [InboundAPI](InboundAPI/findings.md) | 2026-05-16 | `9c60592` | 0/0/3/1 | 4 | 17 |
| [ManagementService](ManagementService/findings.md) | 2026-05-16 | `9c60592` | 0/1/1/2 | 4 | 17 |
| [NotificationService](NotificationService/findings.md) | 2026-05-16 | `9c60592` | 0/2/1/2 | 5 | 18 |
| [Security](Security/findings.md) | 2026-05-16 | `9c60592` | 0/0/2/2 | 4 | 15 |
| [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/2 | 3 | 14 |
| [SiteRuntime](SiteRuntime/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/2 | 3 | 19 |
| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-16 | `9c60592` | 0/0/2/1 | 3 | 17 |
| [TemplateEngine](TemplateEngine/findings.md) | 2026-05-16 | `9c60592` | 0/0/2/0 | 2 | 16 |
| [CLI](CLI/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 16 |
| [CentralUI](CentralUI/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 25 |
| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 10 |
| [Commons](Commons/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 14 |
| [Communication](Communication/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 15 |
| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/0 | 1 | 14 |
| [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 17 |
| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 17 |
| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 17 |
| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/0 | 1 | 16 |
| [Host](Host/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 15 |
| [InboundAPI](InboundAPI/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 17 |
| [ManagementService](ManagementService/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 17 |
| [NotificationService](NotificationService/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 18 |
| [Security](Security/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 15 |
| [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 14 |
| [SiteRuntime](SiteRuntime/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 19 |
| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 17 |
| [TemplateEngine](TemplateEngine/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/0 | 0 | 16 |
## Pending Findings
@@ -80,83 +80,17 @@ description, location, recommendation — lives in the module's `findings.md`.
_None open._
### High (8)
### High (0)
_None open._
### Medium (2)
| ID | Module | Title |
|----|--------|-------|
| CentralUI-020 | [CentralUI](CentralUI/findings.md) | Idle-session redirect never fires: `SessionExpiry` polls a frozen auth-state snapshot |
| Communication-012 | [Communication](Communication/findings.md) | gRPC client factory ignores the endpoint on a cache hit, breaking NodeA→NodeB stream failover |
| DataConnectionLayer-014 | [DataConnectionLayer](DataConnectionLayer/findings.md) | DCL-012 security warning is never logged in production: `RealOpcUaClient` is created without a logger |
| DeploymentManager-015 | [DeploymentManager](DeploymentManager/findings.md) | Site-query reconciliation marks a deployment `Success` but skips instance-state and snapshot updates |
| ExternalSystemGateway-015 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `MaxRetries == 0` is buffered as "retry forever", contradicting the ExternalSystemGateway-004 "never retry" claim |
| ManagementService-014 | [ManagementService](ManagementService/findings.md) | HandleQueryDeployments bypasses site-scope enforcement |
| NotificationService-014 | [NotificationService](NotificationService/findings.md) | OAuth2 token-fetch failure escapes `DeliverBufferedAsync`; a permanently-broken config is retried forever |
| NotificationService-015 | [NotificationService](NotificationService/findings.md) | Unclassified exceptions (OAuth2 token fetch, non-cancellation OCE) escape `SendAsync` to the calling script |
### Medium (26)
| ID | Module | Title |
|----|--------|-------|
| CLI-014 | [CLI](CLI/findings.md) | `update` commands require "core" fields, making partial updates impossible |
| CentralUI-021 | [CentralUI](CentralUI/findings.md) | `DebugView` stream callback mutates `Dictionary` off the render thread |
| CentralUI-022 | [CentralUI](CentralUI/findings.md) | `Deployments` push handler fires `InvokeAsync` with no disposal guard |
| ClusterInfrastructure-009 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | `DownIfAlone` is an inert configuration knob — never consumed by the HOCON builder |
| Communication-013 | [Communication](Communication/findings.md) | Site gRPC address changes are never applied; `RemoveSiteAsync` has no production caller |
| ConfigurationDatabase-012 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Inbound-API `ApiKey.KeyValue` bearer credential stored in plaintext |
| ConfigurationDatabase-013 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Secret-column encryption silently falls back to an ephemeral (throwaway) key |
| DataConnectionLayer-015 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Initial-connect failures never trigger failover; an unreachable primary at startup never tries the backup |
| DataConnectionLayer-016 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `HandleSubscribeCompleted` reports `SubscribeTagsResponse` success even on a connection-level subscribe failure |
| DeploymentManager-016 | [DeploymentManager](DeploymentManager/findings.md) | Reconciled prior record keeps its stale `RevisionHash` |
| ExternalSystemGateway-016 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `ConfigureHttpClientDefaults` applies the ESG connection cap to every `HttpClient` in the host process |
| HealthMonitoring-015 | [HealthMonitoring](HealthMonitoring/findings.md) | Heartbeat-registered site is left with a year-0001 `LastReportReceivedAt` |
| Host-012 | [Host](Host/findings.md) | `down-if-alone` hard-coded in HOCON; `ClusterOptions.DownIfAlone` is never read |
| InboundAPI-014 | [InboundAPI](InboundAPI/findings.md) | `ReturnDefinition` is loaded but never used; script return value is unshaped/unvalidated |
| InboundAPI-015 | [InboundAPI](InboundAPI/findings.md) | `ForbiddenApiChecker` is purely textual and is bypassable via reflection reachable without a forbidden namespace token |
| InboundAPI-016 | [InboundAPI](InboundAPI/findings.md) | Routed `Route.To().Call()` invocations are not bound by the method timeout |
| ManagementService-015 | [ManagementService](ManagementService/findings.md) | HandleSetInstanceOverrides applies overrides non-atomically |
| NotificationService-016 | [NotificationService](NotificationService/findings.md) | `AuthenticateAsync` silently sends unauthenticated for an unknown auth type or empty credentials |
| Security-012 | [Security](Security/findings.md) | Partial LDAP failure during login yields a roleless authenticated session |
| Security-014 | [Security](Security/findings.md) | `RefreshToken` re-issues a token without checking the idle timeout |
| SiteEventLogging-012 | [SiteEventLogging](SiteEventLogging/findings.md) | Dropped events report success: `Task` is completed, not faulted, when the event cannot be persisted |
| SiteRuntime-017 | [SiteRuntime](SiteRuntime/findings.md) | Instance Actor's live `_attributes` dictionary is shared by reference into child actor constructors |
| StoreAndForward-015 | [StoreAndForward](StoreAndForward/findings.md) | `EnqueueAsync`'s public contract never documents that `maxRetries == 0` means "retry forever" |
| StoreAndForward-016 | [StoreAndForward](StoreAndForward/findings.md) | Operator-initiated parked-message retry and discard are not replicated to the standby |
| TemplateEngine-015 | [TemplateEngine](TemplateEngine/findings.md) | `RenameCompositionAsync` does not cascade-rename nested derived templates |
| TemplateEngine-016 | [TemplateEngine](TemplateEngine/findings.md) | Composed-script `ScriptScope.ParentPath` is always empty, breaking `Parent.X` resolution for nested modules |
### Low (32)
### Low (0)
| ID | Module | Title |
|----|--------|-------|
| CLI-015 | [CLI](CLI/findings.md) | `Component-CLI.md` command surface has drifted again in two places |
| CLI-016 | [CLI](CLI/findings.md) | `WriteAsTable` derives columns from the first array element only |
| CentralUI-023 | [CentralUI](CentralUI/findings.md) | Residual bare `catch {}` blocks swallow JS interop errors |
| CentralUI-024 | [CentralUI](CentralUI/findings.md) | Claim lookups use magic strings instead of `JwtTokenService` constants |
| CentralUI-025 | [CentralUI](CentralUI/findings.md) | `SessionExpiry` polling/redirect path has no test coverage |
| ClusterInfrastructure-010 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | Validator does not enforce `DownIfAlone = true` despite the design doc requiring it |
| Commons-013 | [Commons](Commons/findings.md) | `DynamicJsonElement.TryGetIndex` rejects non-`int` index values |
| Commons-014 | [Commons](Commons/findings.md) | `OpcUaEndpointConfigSerializer.Deserialize` can mislabel a corrupt typed row as `Legacy` |
| Communication-014 | [Communication](Communication/findings.md) | Untrusted gRPC `correlation_id` flows directly into an Akka actor name |
| Communication-015 | [Communication](Communication/findings.md) | No test exercises the real gRPC client factory across a node flip |
| ConfigurationDatabase-014 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Redundant, inconsistent cast on one `HasConversion` call |
| DataConnectionLayer-017 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `WriteBatchAsync` aborts the whole batch on a mid-batch disconnect |
| DeploymentManager-017 | [DeploymentManager](DeploymentManager/findings.md) | `GetDeploymentStatusAsync` XML doc describes behaviour it does not implement |
| ExternalSystemGateway-017 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `BuildUrl` appends a bare trailing `?` when a GET method's parameters are all null |
| HealthMonitoring-013 | [HealthMonitoring](HealthMonitoring/findings.md) | Offline-check interval comment claims "shorter timeout" but only ever uses `OfflineTimeout` |
| HealthMonitoring-014 | [HealthMonitoring](HealthMonitoring/findings.md) | `HealthMonitoringOptions` intervals are unvalidated; a zero/negative value crashes the hosted service |
| HealthMonitoring-016 | [HealthMonitoring](HealthMonitoring/findings.md) | `SiteHealthCollector.CollectReport` reads `DateTimeOffset.UtcNow` directly instead of an injected `TimeProvider` |
| Host-013 | [Host](Host/findings.md) | `:F0` rounding of cluster timing values silently degrades sub-second configuration |
| Host-014 | [Host](Host/findings.md) | Serilog sinks are hard-coded in `Program.cs`, not configuration-driven (REQ-HOST-8) |
| Host-015 | [Host](Host/findings.md) | `StartupRetry` retries on every exception type, including permanent failures |
| InboundAPI-017 | [InboundAPI](InboundAPI/findings.md) | `RouteHelper` / `RouteTarget` has no test coverage |
| ManagementService-016 | [ManagementService](ManagementService/findings.md) | Unexpected exception messages returned verbatim to HTTP callers |
| ManagementService-017 | [ManagementService](ManagementService/findings.md) | QueryDeploymentsCommand has no test coverage |
| NotificationService-017 | [NotificationService](NotificationService/findings.md) | `NotificationOptions` is bound from configuration but never read (dead config) |
| NotificationService-018 | [NotificationService](NotificationService/findings.md) | Concurrency limiter: lock-free read of a non-volatile field, never resized on redeployment, never disposed |
| Security-013 | [Security](Security/findings.md) | `ExtractFirstRdnValue` mis-parses group DNs containing escaped commas |
| Security-015 | [Security](Security/findings.md) | Username is not trimmed before use in the LDAP filter, fallback DN, and JWT claims |
| SiteEventLogging-013 | [SiteEventLogging](SiteEventLogging/findings.md) | Keyword search does not escape SQL `LIKE` wildcards in user input |
| SiteEventLogging-014 | [SiteEventLogging](SiteEventLogging/findings.md) | Initial purge runs synchronously on the host startup thread |
| SiteRuntime-018 | [SiteRuntime](SiteRuntime/findings.md) | `ScriptExecutionActor` XML doc still claims a "dedicated blocking I/O dispatcher" |
| SiteRuntime-019 | [SiteRuntime](SiteRuntime/findings.md) | Dead `DisableInstanceCommand` / `EnableInstanceCommand` handlers in `InstanceActor` |
| StoreAndForward-017 | [StoreAndForward](StoreAndForward/findings.md) | Retry/Discard activity-log entries hard-code the `ExternalSystem` category |
_None open._
+49 -9
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 4 (1 deferred — Security-008) |
| Open findings | 0 (1 deferred — Security-008) |
## Summary
@@ -483,7 +483,7 @@ LDAP-timeout coverage (Security-009) plus extra no-service-account / DN-path edg
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:78-118` |
**Description**
@@ -515,7 +515,19 @@ or be surfaced.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `pending`). Confirmed: the group/attribute `Search` was
wrapped in a `catch (LdapException)` that logged a warning and returned
`new LdapAuthResult(true, …, groups: [])` — a partial LDAP outage produced an
authenticated, zero-role session — and the inner `while`-loop `catch { break; }` masked
real mid-enumeration errors as end-of-results. `AuthenticateAsync` now tracks a
`groupLookupSucceeded` flag (false when any `LdapException` is thrown by the search or
`Next()`); the inner swallowing catch was removed so such errors propagate; the new
`BuildAuthResultFromGroupLookup` helper returns a FAILED `LdapAuthResult`
("directory temporarily unavailable, please try again") on lookup failure while still
treating a genuine empty-groups result as a successful login. Regression tests
`BuildAuthResultFromGroupLookup_LookupFailed_FailsTheLogin`,
`BuildAuthResultFromGroupLookup_LookupSucceededNoGroups_IsAuthenticated`,
`BuildAuthResultFromGroupLookup_LookupSucceededWithGroups_CarriesGroups`.
### Security-013 — `ExtractFirstRdnValue` mis-parses group DNs containing escaped commas
@@ -523,7 +535,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:258-269` |
**Description**
@@ -545,7 +557,17 @@ library's DN-parsing API (`LdapDN` / RDN accessors) rather than hand-rolled `Ind
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `pending`). Confirmed: `ExtractFirstRdnValue` took the
substring between the first `=` and the first `,`, splitting on an RFC 4514
backslash-escaped comma so a CN like `Acme\, Inc Operators` became `Acme\`. Replaced the
naive `IndexOf` scan with an escape-aware character scan that ignores a backslash-escaped
`,`, unescapes single-character and `\XX` hex escape sequences, and only terminates the
value on an unescaped comma — so a comma-containing group CN is returned intact and
matches its configured `LdapGroupName`. Method was also made `public` for direct
testing. Regression tests `ExtractFirstRdnValue_EscapedComma_KeepsWholeGroupName`,
`ExtractFirstRdnValue_PlainDn_ReturnsFirstRdnValue`,
`ExtractFirstRdnValue_SingleRdn_ReturnsValue`,
`ExtractFirstRdnValue_EscapedSpecials_AreUnescaped`.
### Security-014 — `RefreshToken` re-issues a token without checking the idle timeout
@@ -553,7 +575,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Security/JwtTokenService.cs:156-169` |
**Description**
@@ -580,7 +602,16 @@ regression test covering refresh of a 31-minute-idle token.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `pending`). Confirmed: `RefreshToken` carried `LastActivity`
forward and minted a fresh 15-minute token without ever calling `IsIdleTimedOut`, so the
idle policy depended entirely on caller discipline and a background refresh could keep an
idle-expired session alive indefinitely. `RefreshToken` now calls
`IsIdleTimedOut(currentPrincipal)` after the missing-claims check and returns `null`
(its existing "cannot refresh" signal) when the session is past the idle window, so the
30-minute idle policy holds regardless of caller order; XML doc updated to state the
invariant. Regression tests `RefreshToken_IdleExpiredPrincipal_ReturnsNull`,
`RefreshToken_ActiveSessionWithinIdleWindow_StillRefreshes`,
`RefreshToken_MissingLastActivityClaim_ReturnsNull`.
### Security-015 — Username is not trimmed before use in the LDAP filter, fallback DN, and JWT claims
@@ -588,7 +619,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:20-21`, `:80`, `:122`, `:169`, `:193` |
**Description**
@@ -613,4 +644,13 @@ trail.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `pending`). Confirmed: an otherwise-valid username with
leading/trailing whitespace passed the `IsNullOrWhiteSpace` guard and flowed verbatim
into the LDAP filter, the fallback DN, and the returned `LdapAuthResult` (and thus the
JWT `Username` claim). Added a `NormalizeUsername` helper (trims whitespace) called once
at the top of `AuthenticateAsync` immediately after the guard; the local `username`
variable is reassigned to the trimmed value so the filter, fallback DN, and result all
use the single canonical identity. Regression tests
`NormalizeUsername_TrimsLeadingAndTrailingWhitespace`,
`BuildFallbackUserDn_TrimmedUsername_NoLeadingTrailingSpace`,
`AuthenticateAsync_UsernameWithSurroundingWhitespace_StillRejectedForInsecure`.
+51 -10
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 |
| Open findings | 0 |
## Summary
@@ -42,8 +42,9 @@ gap left by the SiteEventLogging-005 background-writer rework: when an event can
be persisted because the logger has been disposed, the returned `Task` is completed
*successfully* rather than faulted, so an `await`-ing caller is told a dropped audit
event was written. The other two are minor: unescaped SQL `LIKE` wildcards in the
keyword-search filter (SiteEventLogging-013) and the initial purge running
synchronously on the host startup thread (SiteEventLogging-014).
keyword-search filter (SiteEventLogging-013) and a claimed initial-purge block on the
host startup thread (SiteEventLogging-014 — later re-triaged to Won't Fix, the
premise does not hold on .NET 8+).
## Checklist coverage
@@ -54,7 +55,7 @@ synchronously on the host startup thread (SiteEventLogging-014).
| 3 | Concurrency & thread safety | ☑ | Shared `SqliteConnection` used by purge/query without the write lock (-003). |
| 4 | Error handling & resilience | ☑ | `LogEventAsync` swallows write failures silently into a log line only (-008); purge catches broadly. |
| 5 | Security | ☑ | Queries fully parameterised. No authz in module (delegated to caller) — noted, not a finding. |
| 6 | Performance & resource management | ☑ | Synchronous I/O on actor threads (-005); missing indexes for severity/source/message (-006). Re-review: initial purge blocks host startup thread (-014). |
| 6 | Performance & resource management | ☑ | Synchronous I/O on actor threads (-005); missing indexes for severity/source/message (-006). Re-review: claimed initial-purge startup-thread block (-014 — Won't Fix, premise invalid on .NET 8+). |
| 7 | Design-document adherence | ☑ | Singleton placement contradicts "active node" model (-004); cap purge does not honour "oldest first within budget" cleanly (-002). |
| 8 | Code organization & conventions | ☑ | Concrete-type downcast of `ISiteEventLogger` (-007); `internal Connection` leaks DB handle (-007). |
| 9 | Testing coverage | ☑ | No tests for purge interaction with live writes, vacuum effectiveness, the actor bridge, or query error path (-010). |
@@ -553,7 +554,7 @@ full module suite still passing.
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:160-166,193-197` |
**Description**
@@ -593,7 +594,16 @@ deliberate "drop silently on shutdown" semantics is chosen instead.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`): both disposed-drop paths now fault the
returned `Task` with `ObjectDisposedException` instead of `TrySetResult()`
`LogEventAsync` on a failed `TryWrite`, and `ProcessWriteQueueAsync` now inspects the
`written` flag from `WithConnection` and only calls `TrySetResult()` when the row was
actually persisted. An `await`-ing caller can now distinguish a dropped audit event
from a persisted one. Regression tests
`LogEventAsync_AfterDispose_FaultsTask_NotReportsSuccess` and
`LogEventAsync_EnqueuedThenDisposed_FaultsTask_WhenWriteCannotComplete` (the prior
`LogEventAsync_AfterDispose_CompletesWithoutThrowing` test asserted the now-incorrect
silent-success behaviour and was replaced).
### SiteEventLogging-013 — Keyword search does not escape SQL `LIKE` wildcards in user input
@@ -601,7 +611,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:79-83` |
**Description**
@@ -628,7 +638,13 @@ design implies.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `<pending>`): `EventLogQueryService` now escapes `\`,
`%`, and `_` in `request.KeywordFilter` via a new `EscapeLikePattern` helper before
wrapping it in `%...%`, and the `LIKE` clauses carry an explicit `ESCAPE '\'` so the
keyword is matched as a literal substring as the design intends. Regression tests
`Query_KeywordSearch_TreatsUnderscoreAsLiteral_NotWildcard`,
`Query_KeywordSearch_TreatsPercentAsLiteral_NotWildcard`, and
`Query_KeywordSearch_StillMatchesPlainSubstring`.
### SiteEventLogging-014 — Initial purge runs synchronously on the host startup thread
@@ -636,7 +652,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Status | Won't Fix |
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:34-48` |
**Description**
@@ -662,6 +678,31 @@ startup thread — e.g. `await Task.Yield();` as the first statement of `Execute
or move the initial `RunPurge()` to after the first `await timer.WaitForNextTickAsync`
(accepting a one-interval delay), or offload it with `await Task.Run(RunPurge, stoppingToken)`.
**Re-triage note (2026-05-17)**
The finding's central premise — "the host's startup pipeline does not proceed past a
`BackgroundService` until its `ExecuteAsync` yields at the first real `await`", so the
synchronous `RunPurge()` prelude blocks the startup thread — is **incorrect for the
.NET version this project targets**. The module targets `net10.0`. Since .NET 8,
`BackgroundService.StartAsync` no longer runs `ExecuteAsync` inline on the calling
(host startup) thread: `ExecuteAsync` is dispatched onto the thread pool, and
`StartAsync` returns immediately regardless of how long the synchronous prelude runs.
This was verified empirically. A standalone `BackgroundService` whose `ExecuteAsync`
prelude does a 1.5 s synchronous `Thread.Sleep` before its first `await` showed
`StartAsync` returning in **0 ms**, with the prelude running on a *different* thread
pool thread than the caller. Applied to `EventLogPurgeService`, `StartAsync` returns
promptly and the initial `RunPurge()` executes on the background scheduler — host
startup and the `/health/ready` gate are not blocked. There is therefore no defect
to fix.
**Resolution**
_Unresolved._
Won't Fix — 2026-05-17 (commit `<pending>`). Re-triaged: the asserted startup-thread
block cannot occur on .NET 8+ (this module targets `net10.0`), where
`BackgroundService` dispatches `ExecuteAsync` to the thread pool rather than running
its synchronous prelude on the host startup thread — verified empirically (see the
re-triage note). No code change made. A verification test
`StartAsync_DoesNotBlock_OnTheInitialPurge` was added to pin this behaviour
(asserts `StartAsync` returns in under 1 s and the initial purge still runs on the
background scheduler).
+36 -8
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 |
| Open findings | 0 |
## Summary
@@ -44,7 +44,8 @@ dictionary is handed by reference into child `ScriptActor`/`AlarmActor`
constructors (SiteRuntime-017, Medium); a stale `ScriptExecutionActor` XML doc
that still claims a "dedicated blocking I/O dispatcher" (SiteRuntime-018, Low);
and two dead lifecycle handlers in `InstanceActor` that the Deployment Manager
never routes to (SiteRuntime-019, Low). Open findings: 3.
never routes to (SiteRuntime-019, Low). All three were subsequently resolved on
2026-05-17. Open findings: 0.
## Checklist coverage
@@ -758,7 +759,7 @@ green.
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:625`, `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:675`, `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:83`, `src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs:93` |
**Description**
@@ -800,7 +801,18 @@ private dictionary to seed from.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (`commit pending`): root cause confirmed — `CreateChildActors`
captured the live `_attributes` field directly in every child `Props.Create`
closure. `CreateChildActors` now takes a single `new Dictionary<string,object?>(_attributes)`
snapshot on the Instance Actor thread and hands each `ScriptActor`/`AlarmActor` that
private copy, so no child constructor ever enumerates a dictionary the Instance
Actor is concurrently mutating. Regression test:
`InstanceActorChildAttributeRaceTests.ChildActors_AreSeededFromAnIsolatedCopy_NotTheLiveAttributesDictionary`
asserts every child's seed dictionary is a distinct object from the Instance
Actor's live `_attributes` (confirmed to fail — "seeded ... by reference" — against
the pre-fix code and pass after). `ScriptActor`/`AlarmActor` expose an internal
`SeedAttributesReference` for this assertion (`InternalsVisibleTo` added for the
test project).
### SiteRuntime-018 — `ScriptExecutionActor` XML doc still claims a "dedicated blocking I/O dispatcher"
@@ -808,7 +820,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs:17` |
**Description**
@@ -835,7 +847,13 @@ comment already present at `ScriptExecutionActor.cs:71-73`.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (`commit pending`): root cause confirmed — the stale
"Runs on a dedicated blocking I/O dispatcher" line in the `ScriptExecutionActor`
class summary was missed when SiteRuntime-009 was resolved. The summary now states
the actual model: the actor and its mailbox run on the default Akka dispatcher and
only the script body is dispatched onto the dedicated `ScriptExecutionScheduler`
(SiteRuntime-009). Documentation-only change with no observable behaviour, so no
regression test was added; the existing suite continues to pass.
### SiteRuntime-019 — Dead `DisableInstanceCommand` / `EnableInstanceCommand` handlers in `InstanceActor`
@@ -843,7 +861,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:106`, `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:113` |
**Description**
@@ -873,4 +891,14 @@ reserved placeholder — but removal is preferred.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (`commit pending`): re-verified as genuinely dead code — a
codebase-wide search confirms `DisableInstanceCommand`/`EnableInstanceCommand` are
only ever sent (from central) to the site `DeploymentManagerActor`, whose
`HandleDisable`/`HandleEnable` own the lifecycle entirely (`Context.Stop` /
`CreateInstanceActor`) and never `Forward`/`Tell` the command to the Instance
Actor. The two unreachable `Receive<...>` registrations and their no-op
"success" handler bodies were removed from `InstanceActor`, replaced with a comment
stating the Deployment Manager owns this lifecycle. Regression test:
`InstanceActorTests.InstanceActor_DoesNotHandleDisableOrEnableCommands` asserts the
Instance Actor produces no `InstanceLifecycleResponse` for either command
(confirmed to fail against the pre-fix dead handlers and pass after removal).
+38 -7
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 3 (3 Deferred: 002, 011, 012 — see notes) |
| Open findings | 0 (3 Deferred: 002, 011, 012 — see notes) |
## Summary
@@ -735,7 +735,7 @@ all six `SiteActorPathTests` now pass. Fixed by the commit whose message referen
|--|--|
| Severity | Medium |
| Category | Documentation & comments |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:114``:130`, `:285` |
**Description**
@@ -784,7 +784,17 @@ public `EnqueueAsync` contract must state the chosen meaning; today it states no
**Resolution**
_Unresolved._
Resolved 2026-05-17. Documentation-only fix — retry semantics confirmed correct and
left unchanged. Root cause verified against the source: `EnqueueAsync`'s `maxRetries`
parameter had no `<param>` documentation and the method/class summaries described only
the "park on max retries" path, never the `0 = no limit / retry forever` special case
that `RetryMessageAsync`'s `MaxRetries > 0` guard actually enforces. Added an explicit
`<param>` tag for every `EnqueueAsync` parameter — `maxRetries` now states in bold that
`0` means "no limit, never parked for retry exhaustion" and is **not** a "never retry"
value — extended the method summary with a retry-count lifecycle paragraph, updated the
class-level lifecycle bullet, and tightened the `StoreAndForwardMessage.MaxRetries` field
doc to the same wording. No behavioural code touched; an XML comment is not
test-observable so no regression test was added.
### StoreAndForward-016 — Operator-initiated parked-message retry and discard are not replicated to the standby
@@ -792,7 +802,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:339``:362`; `src/ScadaLink.StoreAndForward/ReplicationService.cs:131``:136` |
**Description**
@@ -842,7 +852,18 @@ paths).
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed: the two operator paths
(`RetryParkedMessageAsync`, `DiscardParkedMessageAsync`) changed local SQLite state but
never touched `_replication`, so a failover after an operator action diverged the
standby buffer. `DiscardParkedMessageAsync` now calls `_replication?.ReplicateRemove`
after a successful local delete (the existing `Remove` op deletes on the standby). A new
`ReplicationOperationType.Requeue` was added; `RetryParkedMessageAsync` re-loads the
requeued row and calls `_replication?.ReplicateRequeue`, and the standby's
`ApplyReplicatedOperationAsync` `Requeue` case resets its matching row to `Pending` with
`retry_count = 0`. Regression tests `DiscardingAParkedMessage_ReplicatesARemoveOperation`,
`RetryingAParkedMessage_ReplicatesARequeueOperation` and
`ApplyReplicatedOperation_Requeue_MovesStandbyRowBackToPending` (in
`StoreAndForwardReplicationTests`) all pass; the first two fail against the pre-fix code.
### StoreAndForward-017 — Retry/Discard activity-log entries hard-code the `ExternalSystem` category
@@ -850,7 +871,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:344`, `:358` |
**Description**
@@ -882,4 +903,14 @@ allow a nullable category for management actions rather than asserting a false o
**Resolution**
_Unresolved._
Resolved 2026-05-17. Root cause confirmed: `RetryParkedMessageAsync` and
`DiscardParkedMessageAsync` raised activity notifications with a hard-coded
`StoreAndForwardCategory.ExternalSystem`, mislabelling parked `Notification` and
`CachedDbWrite` messages in the site event log. Both methods now obtain the message's
real category — `DiscardParkedMessageAsync` loads the row via `GetMessageByIdAsync`
before the delete, `RetryParkedMessageAsync` re-loads the requeued row (also used for
the StoreAndForward-016 replication) — and pass it to `RaiseActivity` (falling back to
`ExternalSystem` only if the row is unexpectedly absent). Regression tests
`RetryParkedMessageAsync_ActivityUsesMessageRealCategory` and
`DiscardParkedMessageAsync_ActivityUsesMessageRealCategory` assert the activity carries
`Notification` / `CachedDbWrite` respectively; both fail against the pre-fix code.
+19 -5
View File
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-17 |
| Reviewer | claude-agent |
| Commit reviewed | `39d737e` |
| Open findings | 2 |
| Open findings | 0 |
## Summary
@@ -674,7 +674,7 @@ verifies all three constraint categories are surfaced together.
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:680` |
**Description**
@@ -719,7 +719,14 @@ two-level cascade rename.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `pending`): `RenameCompositionAsync` now recurses
into `derived.Compositions` via a new `CollectCascadeRenamesAsync` helper
(mirroring `CascadeDeleteDerivedAsync`), re-deriving each cascaded inner derived
template's name from its renamed parent and slot instance name, and runs the
same-name collision pre-check across every name in the cascade before any row
mutates. Regression tests:
`RenameComposition_CascadesRenameToNestedDerivedTemplates`,
`RenameComposition_NestedCascadeNameCollision_Fails`.
### TemplateEngine-016 — Composed-script `ScriptScope.ParentPath` is always empty, breaking `Parent.X` resolution for nested modules
@@ -727,7 +734,7 @@ _Unresolved._
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:750` |
**Description**
@@ -765,4 +772,11 @@ two-level composed script.
**Resolution**
_Unresolved._
Resolved 2026-05-17 (commit `pending`): threaded a `parentPath` parameter
through `ResolveComposedScriptsRecursive` — the top-level caller passes `""`
(a depth-1 composition's parent is the root template) and each nested call
passes the enclosing module's `prefix` — and the `ScriptScope` now sets
`ParentPath` to that value instead of a hard-coded `""`, so a depth-2 script's
`SelfPath = "Outer.Inner"` pairs with `ParentPath = "Outer"` and `Parent.X`
resolves against the real parent module. Regression test:
`Flatten_NestedComposedScript_ScopeCarriesCorrectParentPath`.
+19 -3
View File
@@ -181,9 +181,25 @@ internal static class CommandHelpers
return;
}
var headers = items[0].ValueKind == JsonValueKind.Object
? items[0].EnumerateObject().Select(p => p.Name).ToArray()
: new[] { "Value" };
// Derive the header set as the union of property names across *every*
// element, in first-seen order. Using only items[0] would silently drop
// columns for any later element with a different shape (CLI-016).
var objectItems = items.Where(i => i.ValueKind == JsonValueKind.Object).ToList();
string[] headers;
if (objectItems.Count > 0)
{
var seen = new List<string>();
var known = new HashSet<string>(StringComparer.Ordinal);
foreach (var item in objectItems)
foreach (var prop in item.EnumerateObject())
if (known.Add(prop.Name))
seen.Add(prop.Name);
headers = seen.ToArray();
}
else
{
headers = new[] { "Value" };
}
var rows = items.Select(item =>
{
+62 -48
View File
@@ -120,16 +120,18 @@ scadalink --url <url> template create --name <string> [--description <string>] [
#### `template update`
Update an existing template's name, description, or parent.
Update an existing template. An update **replaces** the whole entity — every required
field below must be supplied with the value it should have after the update, even if
it is unchanged.
```sh
scadalink --url <url> template update --id <int> [--name <string>] [--description <string>] [--parent-id <int>]
scadalink --url <url> template update --id <int> --name <string> [--description <string>] [--parent-id <int>]
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Template ID |
| `--name` | no | Updated template name |
| `--name` | yes | Template name |
| `--description` | no | Updated description |
| `--parent-id` | no | Updated parent template ID |
@@ -313,16 +315,15 @@ scadalink --url <url> template composition add --template-id <int> --module-temp
#### `template composition delete`
Remove a feature module composition from a template.
Remove a feature module composition from a template, identified by its own composition ID.
```sh
scadalink --url <url> template composition delete --template-id <int> --instance-name <string>
scadalink --url <url> template composition delete --id <int>
```
| Option | Required | Description |
|--------|----------|-------------|
| `--template-id` | yes | Target template ID |
| `--instance-name` | yes | Instance name of the composed module to remove |
| `--id` | yes | Composition ID to remove |
---
@@ -520,17 +521,17 @@ scadalink --url <url> site area create --site-id <int> --name <string> [--parent
#### `site area update`
Update an area's name or parent.
Update an area's name. An update **replaces** the whole entity — the required field
below must be supplied, even if unchanged.
```sh
scadalink --url <url> site area update --id <int> [--name <string>] [--parent-area-id <int>]
scadalink --url <url> site area update --id <int> --name <string>
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Area ID |
| `--name` | no | Updated area name |
| `--parent-area-id` | no | Updated parent area ID |
| `--name` | yes | Area name |
#### `site area delete`
@@ -620,7 +621,7 @@ scadalink --url <url> data-connection list [--site-id <int>]
Create a new data connection belonging to a specific site.
```sh
scadalink --url <url> data-connection create --site-id <int> --name <string> --protocol <string> [--configuration <json>]
scadalink --url <url> data-connection create --site-id <int> --name <string> --protocol <string> [--primary-config <json>] [--backup-config <json>] [--failover-retry-count <int>]
```
| Option | Required | Description |
@@ -628,22 +629,27 @@ scadalink --url <url> data-connection create --site-id <int> --name <string> --p
| `--site-id` | yes | Site ID the connection belongs to |
| `--name` | yes | Connection name |
| `--protocol` | yes | Protocol identifier (e.g. `OpcUa`) |
| `--configuration` | no | Protocol-specific configuration as a JSON string |
| `--primary-config` | no | Primary protocol-specific configuration as a JSON string (alias `--configuration`) |
| `--backup-config` | no | Backup protocol-specific configuration as a JSON string |
| `--failover-retry-count` | no | Retries before failover to the backup configuration (default: `3`) |
#### `data-connection update`
Update a data connection definition.
Update a data connection definition. An update **replaces** the whole entity — every
required field below must be supplied, even if unchanged.
```sh
scadalink --url <url> data-connection update --id <int> [--name <string>] [--protocol <string>] [--configuration <json>]
scadalink --url <url> data-connection update --id <int> --name <string> --protocol <string> [--primary-config <json>] [--backup-config <json>] [--failover-retry-count <int>]
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Data connection ID |
| `--name` | no | Updated connection name |
| `--protocol` | no | Updated protocol identifier |
| `--configuration` | no | Updated protocol-specific configuration as a JSON string |
| `--name` | yes | Connection name |
| `--protocol` | yes | Protocol identifier |
| `--primary-config` | no | Primary protocol-specific configuration as a JSON string (alias `--configuration`) |
| `--backup-config` | no | Backup protocol-specific configuration as a JSON string |
| `--failover-retry-count` | no | Retries before failover to the backup configuration (default: `3`) |
#### `data-connection delete`
@@ -698,18 +704,19 @@ scadalink --url <url> external-system create --name <string> --endpoint-url <url
#### `external-system update`
Update an external system definition.
Update an external system definition. An update **replaces** the whole entity — every
required field below must be supplied, even if unchanged.
```sh
scadalink --url <url> external-system update --id <int> [--name <string>] [--endpoint-url <url>] [--auth-type <string>] [--auth-config <json>]
scadalink --url <url> external-system update --id <int> --name <string> --endpoint-url <url> --auth-type <string> [--auth-config <json>]
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | External system ID |
| `--name` | no | Updated display name |
| `--endpoint-url` | no | Updated base URL |
| `--auth-type` | no | Updated authentication type |
| `--name` | yes | Display name |
| `--endpoint-url` | yes | Base URL |
| `--auth-type` | yes | Authentication type |
| `--auth-config` | no | Updated auth credentials as a JSON string |
#### `external-system delete`
@@ -763,17 +770,18 @@ scadalink --url <url> notification create --name <string> --emails <email1,email
#### `notification update`
Update a notification list's name or recipients.
Update a notification list. An update **replaces** the whole entity — every required
field below must be supplied, even if unchanged.
```sh
scadalink --url <url> notification update --id <int> [--name <string>] [--emails <email1,email2,...>]
scadalink --url <url> notification update --id <int> --name <string> --emails <email1,email2,...>
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Notification list ID |
| `--name` | no | Updated list name |
| `--emails` | no | Updated comma-separated list of recipient email addresses |
| `--name` | yes | List name |
| `--emails` | yes | Comma-separated list of recipient email addresses |
#### `notification delete`
@@ -885,17 +893,18 @@ scadalink --url <url> security role-mapping create --ldap-group <string> --role
#### `security role-mapping update`
Update an LDAP role mapping.
Update an LDAP role mapping. An update **replaces** the whole entity — every required
field below must be supplied, even if unchanged.
```sh
scadalink --url <url> security role-mapping update --id <int> [--ldap-group <string>] [--role <string>]
scadalink --url <url> security role-mapping update --id <int> --ldap-group <string> --role <string>
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Mapping ID |
| `--ldap-group` | no | Updated LDAP group distinguished name or CN |
| `--role` | no | Updated ScadaLink role |
| `--ldap-group` | yes | LDAP group distinguished name or CN |
| `--role` | yes | ScadaLink role |
#### `security role-mapping delete`
@@ -1099,17 +1108,20 @@ scadalink --url <url> shared-script create --name <string> --code <string>
#### `shared-script update`
Update a shared script's name or code.
Update a shared script. An update **replaces** the whole entity — every required field
below must be supplied, even if unchanged.
```sh
scadalink --url <url> shared-script update --id <int> [--name <string>] [--code <string>]
scadalink --url <url> shared-script update --id <int> --name <string> --code <string> [--parameters <json>] [--return-def <json>]
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Shared script ID |
| `--name` | no | Updated script name |
| `--code` | no | Updated script source code (or `@filepath`) |
| `--name` | yes | Shared script name |
| `--code` | yes | Script source code |
| `--parameters` | no | Parameter definitions JSON |
| `--return-def` | no | Return type definition JSON |
#### `shared-script delete`
@@ -1163,18 +1175,18 @@ scadalink --url <url> db-connection create --name <string> --connection-string <
#### `db-connection update`
Update a database connection definition.
Update a database connection definition. An update **replaces** the whole entity —
every required field below must be supplied, even if unchanged.
```sh
scadalink --url <url> db-connection update --id <int> [--name <string>] [--connection-string <string>] [--provider <string>]
scadalink --url <url> db-connection update --id <int> --name <string> --connection-string <string>
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | Database connection ID |
| `--name` | no | Updated connection name |
| `--connection-string` | no | Updated connection string |
| `--provider` | no | Updated database provider |
| `--name` | yes | Connection name |
| `--connection-string` | yes | Database connection string |
#### `db-connection delete`
@@ -1228,18 +1240,20 @@ scadalink --url <url> api-method create --name <string> --code <string> [--descr
#### `api-method update`
Update an inbound API method.
Update an inbound API method. An update **replaces** the whole entity — every required
field below must be supplied, even if unchanged.
```sh
scadalink --url <url> api-method update --id <int> [--name <string>] [--code <string>] [--description <string>]
scadalink --url <url> api-method update --id <int> --script <string> [--timeout <int>] [--parameters <json>] [--return-def <json>]
```
| Option | Required | Description |
|--------|----------|-------------|
| `--id` | yes | API method ID |
| `--name` | no | Updated method name |
| `--code` | no | Updated script source code (or `@filepath`) |
| `--description` | no | Updated description |
| Option | Required | Default | Description |
|--------|----------|---------|-------------|
| `--id` | yes | — | API method ID |
| `--script` | yes | — | Script source code implementing the method |
| `--timeout` | no | `30` | Timeout in seconds |
| `--parameters` | no | — | Parameter definitions JSON |
| `--return-def` | no | — | Return type definition JSON |
Script changes take effect immediately — the updated code is recompiled in-memory on the active central node. No restart is required.
@@ -134,9 +134,35 @@ public static class AuthEndpoints
context.Response.Redirect("/login");
});
// CentralUI-020: liveness probe for the client-side idle-logout check.
// The Blazor circuit's CookieAuthenticationStateProvider serves a frozen
// constructor-time principal (CentralUI-004), so a circuit can never
// observe a server-side cookie expiry by polling the auth state.
// SessionExpiry instead polls this endpoint via fetch(): being a normal
// HTTP request, the cookie middleware re-validates (and slides) the
// cookie on every hit. It deliberately does NOT use RequireAuthorization
// — that would make the middleware answer a lapsed request with a 302 to
// /login, which fetch() follows transparently and reads as a 200 login
// page. Allowing anonymous access and returning 200/401 ourselves gives
// the client an unambiguous expiry signal.
endpoints.MapGet("/auth/ping", HandlePing);
return endpoints;
}
/// <summary>
/// Handler for <c>GET /auth/ping</c>. Returns <c>200</c> while the caller's
/// cookie session is still valid and <c>401</c> once it has lapsed
/// server-side. See CentralUI-020.
/// </summary>
public static Task HandlePing(HttpContext context)
{
context.Response.StatusCode = context.User.Identity?.IsAuthenticated == true
? StatusCodes.Status200OK
: StatusCodes.Status401Unauthorized;
return Task.CompletedTask;
}
/// <summary>
/// Builds the <see cref="AuthenticationProperties"/> for the login sign-in.
/// CentralUI-005: deliberately does <b>not</b> set <see cref="AuthenticationProperties.ExpiresUtc"/>.
@@ -0,0 +1,43 @@
using System.Security.Claims;
using Microsoft.AspNetCore.Components.Authorization;
using ScadaLink.Security;
namespace ScadaLink.CentralUI.Auth;
/// <summary>
/// Claim-lookup helpers for the Central UI. CentralUI-024: claim types are owned
/// by <see cref="JwtTokenService"/> (the single source of truth). These helpers
/// resolve them through the <c>JwtTokenService</c> constants so a rename there
/// propagates here instead of silently breaking ten copy-pasted call sites.
/// </summary>
public static class ClaimsPrincipalExtensions
{
/// <summary>Fallback returned when no username claim is present.</summary>
public const string UnknownUser = "unknown";
/// <summary>
/// The audit username for <paramref name="principal"/>, or
/// <see cref="UnknownUser"/> when the claim is absent.
/// </summary>
public static string GetUsername(this ClaimsPrincipal principal)
=> principal.FindFirst(JwtTokenService.UsernameClaimType)?.Value ?? UnknownUser;
/// <summary>
/// The display name for <paramref name="principal"/>, or <c>null</c> when
/// the claim is absent.
/// </summary>
public static string? GetDisplayName(this ClaimsPrincipal principal)
=> principal.FindFirst(JwtTokenService.DisplayNameClaimType)?.Value;
/// <summary>
/// Resolves the current user's audit username from the auth state provider.
/// Replaces the <c>GetCurrentUserAsync</c> helper that was copy-pasted into
/// ten components (CentralUI-024).
/// </summary>
public static async Task<string> GetCurrentUsernameAsync(
this AuthenticationStateProvider authStateProvider)
{
var authState = await authStateProvider.GetAuthenticationStateAsync();
return authState.User.GetUsername();
}
}
@@ -99,7 +99,8 @@
<Authorized>
<div class="border-top border-secondary px-3 py-2">
<div class="d-flex justify-content-between align-items-center">
<span class="text-light small">@context.User.FindFirst("DisplayName")?.Value</span>
@* CentralUI-024: claim type resolved via JwtTokenService. *@
<span class="text-light small">@context.User.GetDisplayName()</span>
<form method="post" action="/auth/logout" data-enhance="false">
@* CentralUI-017: logout is a state-changing POST and is
CSRF-protected — the antiforgery token is required. *@
@@ -160,11 +160,10 @@
</div>
@code {
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
private List<Site> _sites = new();
private Dictionary<int, List<DataConnection>> _siteConnections = new();
@@ -11,7 +11,8 @@
<AuthorizeView>
<Authorized>
<span class="text-muted small">
Signed in as <strong>@context.User.FindFirst("DisplayName")?.Value</strong>
@* CentralUI-024: claim type resolved via JwtTokenService. *@
Signed in as <strong>@context.User.GetDisplayName()</strong>
</span>
</Authorized>
</AuthorizeView>
@@ -401,23 +401,7 @@
{
var session = await DebugStreamService.StartStreamAsync(
_selectedInstanceId,
onEvent: evt =>
{
// CentralUI-009: the component may have been disposed while
// this event was in flight on the Akka/gRPC thread.
if (_disposed) return;
switch (evt)
{
case AttributeValueChanged av:
UpsertWithCap(_attributeValues, av.AttributeName, av);
SafeInvokeStateHasChanged();
break;
case AlarmStateChanged al:
UpsertWithCap(_alarmStates, al.AlarmName, al);
SafeInvokeStateHasChanged();
break;
}
},
onEvent: HandleStreamEvent,
onTerminated: () =>
{
_connected = false;
@@ -503,10 +487,51 @@
_alarmStates.Clear();
}
/// <summary>
/// Handles one debug-stream event. The callback is invoked on an Akka/gRPC
/// thread, but <see cref="_attributeValues"/>/<see cref="_alarmStates"/> are
/// <see cref="Dictionary{TKey,TValue}"/> instances also enumerated by the
/// render thread via <see cref="FilteredAttributeValues"/>/
/// <see cref="FilteredAlarmStates"/>. <c>Dictionary</c> is not thread-safe
/// (CentralUI-021): a write racing an enumeration can throw or corrupt the
/// buckets. The mutation (<see cref="UpsertWithCap"/>) is therefore
/// marshalled onto the renderer's dispatcher via <see cref="SafeInvokeAsync"/>
/// so every access to the dictionaries — read and write — happens on the
/// render thread.
/// </summary>
private void HandleStreamEvent(object evt)
{
// CentralUI-009: the component may have been disposed while this event
// was in flight on the Akka/gRPC thread.
if (_disposed) return;
_ = SafeInvokeAsync(() =>
{
if (_disposed) return;
switch (evt)
{
case AttributeValueChanged av:
UpsertWithCap(_attributeValues, av.AttributeName, av);
break;
case AlarmStateChanged al:
UpsertWithCap(_alarmStates, al.AlarmName, al);
break;
default:
return;
}
StateHasChanged();
});
}
/// <summary>
/// Replace or insert a value keyed by name, then trim the oldest entries
/// (queue-style) so the table size never exceeds MaxRows. Dictionary
/// preserves insertion order, so the first key is always the oldest.
/// <para>
/// Must be called on the render thread only (CentralUI-021) — see
/// <see cref="HandleStreamEvent"/>. The cap-trim loop is in the same
/// critical section as the upsert so the dictionary is never observed
/// over-capacity.
/// </para>
/// </summary>
private static void UpsertWithCap<T>(Dictionary<string, T> map, string key, T value)
{
@@ -577,8 +602,6 @@
}
}
private void SafeInvokeStateHasChanged() => _ = SafeInvokeAsync(StateHasChanged);
public void Dispose()
{
// CentralUI-009: mark disposed first so any in-flight stream callback
@@ -204,6 +204,17 @@
private int _totalPages;
private const int PageSize = 25;
// CentralUI-022: IDeploymentStatusNotifier is a process singleton that
// raises StatusChanged on the DeploymentManager service thread. Dispose()
// unsubscribes, but the notifier can read its subscriber list and begin
// invoking OnDeploymentStatusChanged just before this component is disposed.
// The handler then runs against a disposed component and InvokeAsync throws
// ObjectDisposedException as an unobserved fire-and-forget task exception.
// This flag (set first in Dispose()) makes a racing callback no-op, and the
// dispatch swallows the residual ObjectDisposedException — mirroring the
// DebugView (CentralUI-009) and ToastNotification (CentralUI-010) guards.
private volatile bool _disposed;
// CentralUI-006: deployment status updates are push-based, not polled.
// DeploymentManager raises IDeploymentStatusNotifier.StatusChanged on every
// deployment-record status write; this page subscribes to it and reloads,
@@ -220,12 +231,34 @@
private void OnDeploymentStatusChanged(ScadaLink.DeploymentManager.DeploymentStatusChange change)
{
if (!_autoRefresh) return;
_ = InvokeAsync(async () =>
// CentralUI-022: a callback racing disposal must not touch the component.
if (_disposed || !_autoRefresh) return;
_ = DispatchReloadAsync();
}
/// <summary>
/// Reloads the deployment table on the renderer's dispatcher, guarded
/// against the component being disposed mid-flight (CentralUI-022):
/// <c>InvokeAsync</c> throws <see cref="ObjectDisposedException"/> once the
/// circuit is gone, and this handler runs fire-and-forget so that exception
/// would otherwise go unobserved on the DeploymentManager thread.
/// </summary>
private async Task DispatchReloadAsync()
{
if (_disposed) return;
try
{
await LoadDataAsync();
StateHasChanged();
});
await InvokeAsync(async () =>
{
if (_disposed) return;
await LoadDataAsync();
StateHasChanged();
});
}
catch (ObjectDisposedException)
{
// Component disposed between the guard and the dispatch — ignore.
}
}
private void ToggleAutoRefresh()
@@ -316,8 +349,10 @@
public void Dispose()
{
// Unsubscribe so a status change after the circuit is gone does not
// touch a disposed component (the notifier is a process singleton).
// CentralUI-022: set the guard first so a callback already in flight on
// the DeploymentManager thread no-ops, then unsubscribe so no further
// status change reaches this disposed component.
_disposed = true;
DeploymentStatusNotifier.StatusChanged -= OnDeploymentStatusChanged;
}
}
@@ -438,11 +438,10 @@
private void GoBack() => NavigationManager.NavigateTo("/deployment/topology");
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
// ── Bindings ────────────────────────────────────────────
@@ -157,9 +157,8 @@
private void GoBack() => NavigationManager.NavigateTo("/deployment/topology");
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
}
@@ -921,9 +921,8 @@
}
}
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
}
@@ -172,11 +172,10 @@
private ScriptAnalysis.SandboxRunResult? _runResult;
private CancellationTokenSource? _runCts;
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
protected override async Task OnInitializedAsync()
{
@@ -101,11 +101,10 @@
</div>
@code {
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
private List<SharedScript> _scripts = new();
private bool _loading = true;
@@ -119,9 +119,8 @@
NavigationManager.NavigateTo("/design/templates");
}
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
}
@@ -218,11 +218,10 @@
NavigationManager.NavigateTo("/design/templates");
}
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
private RenderFragment RenderTemplateDetail() => __builder =>
{
@@ -99,11 +99,10 @@
</div>
@code {
private async Task<string> GetCurrentUserAsync()
{
var authState = await AuthStateProvider.GetAuthenticationStateAsync();
return authState.User.FindFirst("Username")?.Value ?? "unknown";
}
// CentralUI-024: delegates to the shared helper so the claim type stays
// resolved through JwtTokenService rather than a duplicated magic string.
private Task<string> GetCurrentUserAsync()
=> AuthStateProvider.GetCurrentUsernameAsync();
private List<Template> _templates = new();
private List<TemplateFolder> _folders = new();
@@ -10,6 +10,7 @@
@inject CommunicationService CommunicationService
@inject IJSRuntime JS
@inject IDialogService Dialog
@inject ILogger<ParkedMessages> Logger
<div class="container-fluid mt-3">
<ToastNotification @ref="_toast" />
@@ -694,7 +695,18 @@
await JS.InvokeVoidAsync("navigator.clipboard.writeText", text);
_toast.ShowSuccess("Copied to clipboard.");
}
catch { _toast.ShowError("Copy failed."); }
catch (JSDisconnectedException)
{
// Circuit gone — the page is being torn down; nothing to surface.
// CentralUI-023: distinguished from a genuine interop failure.
}
catch (JSException ex)
{
// A real clipboard failure (e.g. permission denied) — surface it to
// the user and log it so it is not invisible in production.
Logger.LogWarning(ex, "Clipboard copy failed.");
_toast.ShowError("Copy failed.");
}
}
// ── Helpers ──
@@ -3,6 +3,7 @@
via @ref to display a side-by-side or simple before/after comparison.
z-index ladder follows ConfirmDialog: modal 1055 > backdrop 1040 (toasts at 1090). *@
@inject IJSRuntime JS
@inject ILogger<DiffDialog> Logger
@implements IAsyncDisposable
@if (_visible)
@@ -101,7 +102,20 @@
_bodyLocked = true;
await TryLockBodyAsync();
try { await _modalRef.FocusAsync(); }
catch { /* prerender or detached: ignore */ }
catch (InvalidOperationException)
{
// Prerender: the element reference is not attached yet — the
// next interactive render focuses it. Expected, not logged.
}
catch (JSDisconnectedException)
{
// Circuit gone before focus could run — nothing to do.
}
catch (JSException ex)
{
// A genuine focus interop failure (CentralUI-023) — log it.
Logger.LogWarning(ex, "DiffDialog: failed to focus the modal.");
}
}
}
@@ -127,10 +141,15 @@
{
await JS.InvokeVoidAsync("document.body.classList.add", "modal-open");
}
catch
catch (JSDisconnectedException)
{
try { await JS.InvokeVoidAsync("console.debug", "DiffDialog: JS interop unavailable for body lock."); }
catch { /* swallow */ }
// Circuit gone — the body scroll lock is moot. Expected, silent.
}
catch (JSException ex)
{
// CentralUI-023: a genuine interop failure — log instead of doing
// another (also-failing) JS call inside a bare catch.
Logger.LogWarning(ex, "DiffDialog: failed to apply body scroll lock.");
}
}
@@ -141,10 +160,13 @@
{
await JS.InvokeVoidAsync("document.body.classList.remove", "modal-open");
}
catch
catch (JSDisconnectedException)
{
try { await JS.InvokeVoidAsync("console.debug", "DiffDialog: JS interop unavailable for body unlock."); }
catch { /* swallow */ }
// Circuit gone — the body scroll lock is moot. Expected, silent.
}
catch (JSException ex)
{
Logger.LogWarning(ex, "DiffDialog: failed to remove body scroll lock.");
}
}
@@ -1,41 +1,58 @@
@implements IDisposable
@inject AuthenticationStateProvider AuthStateProvider
@inject NavigationManager Navigation
@inject IJSRuntime JS
@code {
// CentralUI-005: session expiry is a sliding window owned by the cookie
// authentication middleware (ScadaLink.Security AddCookie:
// CentralUI-005 / CentralUI-020: session expiry is a sliding window owned by
// the cookie authentication middleware (ScadaLink.Security AddCookie:
// ExpireTimeSpan = idle timeout, SlidingExpiration = true). An active user's
// cookie is continually renewed; an idle user's cookie lapses after the idle
// timeout. There is therefore no fixed login-time deadline to redirect at
// the old code read an "expires_at" claim and scheduled a single hard
// redirect, which both contradicted the sliding policy and logged active
// users out mid-session.
// timeout. There is no fixed login-time deadline to redirect at.
//
// Instead this component polls the authentication state on a recurring
// interval. While the session is still valid it does nothing; once the
// sliding cookie has expired (the server-side idle cutoff has been reached)
// the next poll observes an unauthenticated principal and redirects to the
// login page. Re-checking the state is itself circuit activity, so this poll
// alone never keeps a truly idle session alive — only genuine user activity
// refreshes the cookie before it lapses.
// This component must NOT poll the Blazor AuthenticationStateProvider:
// CookieAuthenticationStateProvider serves a frozen constructor-time
// principal for the whole circuit (CentralUI-004), so the polled auth state
// can never transition to "expired" and the redirect would never fire
// (CentralUI-020).
//
// Instead it polls the server endpoint GET /auth/ping via fetch(). Being a
// normal HTTP request, the cookie middleware re-validates — and slides — the
// cookie on every hit, and answers 200 while the session is live or 401 once
// it has lapsed. A genuine idle user's circuit produces no other HTTP
// traffic, so once the cookie lapses the next ping returns 401 and this
// component redirects to /login. (The ping itself slides the cookie, but the
// poll interval is well under the idle timeout, so an idle session still
// lapses on schedule once the poll catches the lapsed state — the ping only
// ever observes expiry, it does not keep a dead session alive.)
/// <summary>Server endpoint that reports live session validity.</summary>
internal const string PingUrl = "/auth/ping";
/// <summary>HTTP status returned by <see cref="PingUrl"/> once the cookie has lapsed.</summary>
private const int Unauthorized = 401;
private const string ModulePath = "./_content/ScadaLink.CentralUI/js/session-expiry.js";
/// <summary>How often the session validity is re-checked.</summary>
internal static readonly TimeSpan PollInterval = TimeSpan.FromMinutes(1);
private CancellationTokenSource? _cts;
private IJSObjectReference? _module;
protected override void OnInitialized()
{
// The login page uses the same layout, so this component renders there
// too. Polling/redirecting on /login → /login would loop.
var path = Navigation.ToBaseRelativePath(Navigation.Uri);
if (path.StartsWith("login", StringComparison.OrdinalIgnoreCase)) return;
if (IsOnLoginPage) return;
_cts = new CancellationTokenSource();
_ = PollSessionAsync(_cts.Token);
}
private bool IsOnLoginPage =>
Navigation.ToBaseRelativePath(Navigation.Uri)
.StartsWith("login", StringComparison.OrdinalIgnoreCase);
private async Task PollSessionAsync(CancellationToken token)
{
while (!token.IsCancellationRequested)
@@ -43,21 +60,43 @@
try { await Task.Delay(PollInterval, token); }
catch (TaskCanceledException) { return; }
AuthenticationState auth;
try
{
auth = await AuthStateProvider.GetAuthenticationStateAsync();
}
catch (ObjectDisposedException)
{
return;
}
if (token.IsCancellationRequested) return;
await CheckSessionAsync();
}
}
if (auth.User.Identity?.IsAuthenticated != true)
{
await InvokeAsync(() => Navigation.NavigateTo("/login", forceLoad: true));
return;
}
/// <summary>
/// Runs one liveness check: pings the server and, if the session has lapsed
/// server-side (HTTP 401), redirects to the login page. Exposed for tests
/// (CentralUI-025) so the redirect path can be exercised without waiting on
/// the poll interval.
/// </summary>
internal async Task CheckSessionAsync()
{
if (IsOnLoginPage) return;
int status;
try
{
_module ??= await JS.InvokeAsync<IJSObjectReference>("import", ModulePath);
status = await _module.InvokeAsync<int>("ping", PingUrl);
}
catch (JSDisconnectedException)
{
// Circuit gone — nothing to redirect.
return;
}
catch (JSException)
{
// Network blip or fetch failure: treat as inconclusive and retry on
// the next poll rather than logging an authenticated user out on a
// transient error.
return;
}
if (status == Unauthorized)
{
await InvokeAsync(() => Navigation.NavigateTo("/login", forceLoad: true));
}
}
@@ -65,5 +104,18 @@
{
_cts?.Cancel();
_cts?.Dispose();
// The module reference is owned by the circuit's JS runtime; once the
// circuit is disposed disposing it would throw — fire-and-forget and
// swallow the expected disconnect.
if (_module is not null)
{
_ = DisposeModuleAsync(_module);
}
}
private static async Task DisposeModuleAsync(IJSObjectReference module)
{
try { await module.DisposeAsync(); }
catch (JSDisconnectedException) { /* circuit already gone */ }
}
}
+1
View File
@@ -8,5 +8,6 @@
@using Microsoft.JSInterop
@using static Microsoft.AspNetCore.Components.Web.RenderMode
@using ScadaLink.CentralUI
@using ScadaLink.CentralUI.Auth
@using ScadaLink.CentralUI.Components.Layout
@using ScadaLink.CentralUI.Components.Shared
@@ -0,0 +1,23 @@
// CentralUI-020: client-side helper for the SessionExpiry component's
// idle-logout check. Pings the given URL and reports the HTTP status code so
// the Blazor component can redirect to /login once the server reports 401.
//
// `redirect: "manual"` ensures a 302 (should the endpoint ever start
// redirecting) is reported as an opaque status rather than being followed
// transparently — the component only ever wants to see the real outcome.
export async function ping(url) {
try {
const resp = await fetch(url, {
method: "GET",
credentials: "same-origin",
cache: "no-store",
redirect: "manual",
headers: { "X-Requested-With": "XMLHttpRequest" }
});
return resp.status;
} catch {
// Network failure: report 0 so the caller treats it as inconclusive
// and retries on the next poll rather than logging the user out.
return 0;
}
}
@@ -65,6 +65,15 @@ public sealed class ClusterOptionsValidator : IValidateOptions<ClusterOptions>
"declared unreachable before a heartbeat can arrive.");
}
if (!options.DownIfAlone)
{
failures.Add(
"ClusterOptions.DownIfAlone must be true for the keep-oldest resolver "
+ "(Component-ClusterInfrastructure.md → Split-Brain Resolution); with it false the "
+ "oldest node can run as an isolated single-node cluster during a partition while the "
+ "younger node forms its own, producing two live clusters.");
}
return failures.Count > 0
? ValidateOptionsResult.Fail(failures)
: ValidateOptionsResult.Success;
@@ -68,9 +68,11 @@ public readonly record struct OpcUaConfigParseResult
/// <summary>
/// Serializes <see cref="OpcUaEndpointConfig"/> to/from the typed nested JSON
/// shape stored in <c>DataConnection.PrimaryConfiguration</c> / <c>BackupConfiguration</c>.
/// On read, falls back to the legacy flat string-dict shape for pre-refactor rows
/// and reports <see cref="OpcUaConfigParseStatus.Legacy"/> so the form can prompt the
/// user to re-save.
/// On read, falls back to the legacy flat string-dict shape only for rows that are not
/// the current typed shape (no <c>endpointUrl</c> property), reporting
/// <see cref="OpcUaConfigParseStatus.Legacy"/> so the form can prompt the user to
/// re-save. A row that <em>is</em> the typed shape but fails to deserialize is reported
/// <see cref="OpcUaConfigParseStatus.Malformed"/>, never <see cref="OpcUaConfigParseStatus.Legacy"/>.
/// </summary>
public static class OpcUaEndpointConfigSerializer
{
@@ -94,9 +96,13 @@ public static class OpcUaEndpointConfigSerializer
/// <item><see cref="OpcUaConfigParseStatus.Legacy"/> — parsed as a legacy flat object;
/// the config is usable and the caller may prompt a re-save.</item>
/// <item><see cref="OpcUaConfigParseStatus.Malformed"/> — the input is genuinely
/// unparseable JSON. The config is an empty default and the original string is lost;
/// the caller should surface an error rather than treating the empty config as the
/// user's saved data.</item>
/// unparseable JSON, <em>or</em> it is the current typed shape (it has an
/// <c>endpointUrl</c> property) but typed deserialization failed — e.g. an
/// enum-valued field holding an unrecognised string or a wrong-typed field. Such a
/// corrupt typed row is reported <see cref="OpcUaConfigParseStatus.Malformed"/>
/// rather than being mislabelled <see cref="OpcUaConfigParseStatus.Legacy"/>, so the
/// offending field is not silently dropped. The config is an empty default and the
/// caller should surface an error rather than treating it as the user's saved data.</item>
/// </list>
/// </summary>
public static OpcUaConfigParseResult Deserialize(string? json)
@@ -104,22 +110,42 @@ public static class OpcUaEndpointConfigSerializer
if (string.IsNullOrWhiteSpace(json))
return new OpcUaConfigParseResult(new OpcUaEndpointConfig(), OpcUaConfigParseStatus.Typed);
// First decide which shape the row is — without yet trying to materialize it.
// A root JSON object carrying "endpointUrl" IS the current typed shape; anything
// else (no endpointUrl) is treated as a candidate legacy flat-dict row.
bool isTypedShape;
try
{
using var doc = JsonDocument.Parse(json);
if (doc.RootElement.ValueKind == JsonValueKind.Object
&& doc.RootElement.TryGetProperty("endpointUrl", out _))
isTypedShape = doc.RootElement.ValueKind == JsonValueKind.Object
&& doc.RootElement.TryGetProperty("endpointUrl", out _);
}
catch (JsonException)
{
// Could not even parse the document: genuinely malformed input.
return new OpcUaConfigParseResult(new OpcUaEndpointConfig(), OpcUaConfigParseStatus.Malformed);
}
if (isTypedShape)
{
// The row is the current typed shape. If typed deserialization fails the row
// is a corrupt current-shape row (e.g. an invalid enum or wrong-typed field) —
// it must NOT fall through to the legacy path and be mislabelled Legacy, which
// would silently drop the offending field. Report Malformed instead.
try
{
var typed = JsonSerializer.Deserialize<OpcUaEndpointConfig>(json, JsonOpts);
if (typed != null)
return new OpcUaConfigParseResult(typed, OpcUaConfigParseStatus.Typed);
}
catch (JsonException) { /* corrupt typed row — classified Malformed below */ }
return new OpcUaConfigParseResult(new OpcUaEndpointConfig(), OpcUaConfigParseStatus.Malformed);
}
catch (JsonException) { /* fall through to legacy */ }
try
{
return new OpcUaConfigParseResult(LoadLegacy(json!), OpcUaConfigParseStatus.Legacy);
return new OpcUaConfigParseResult(LoadLegacy(json), OpcUaConfigParseStatus.Legacy);
}
catch (JsonException)
{
@@ -6,6 +6,8 @@ namespace ScadaLink.Commons.Types;
/// <summary>
/// Wraps a JsonElement as a dynamic object for convenient property access in scripts.
/// Supports property access (obj.name), indexing (obj.items[0]), and ToString().
/// Array indexing accepts any integral index type (int, long, short, byte, ...), so an
/// index derived from another wrapped JSON number — which Wrap surfaces as long — works.
/// </summary>
/// <remarks>
/// The element passed to the constructor is <see cref="JsonElement.Clone()">cloned</see>
@@ -39,13 +41,16 @@ public class DynamicJsonElement : DynamicObject
public override bool TryGetIndex(GetIndexBinder binder, object[] indexes, out object? result)
{
// Accept any integral index, not just int. DynamicJsonElement surfaces JSON
// numbers as long (see Wrap), so an index computed from another wrapped value
// (e.g. obj.items[obj.count - 1]) arrives as a long; byte/short widen too.
if (_element.ValueKind == JsonValueKind.Array &&
indexes.Length == 1 && indexes[0] is int index)
indexes.Length == 1 && TryGetIntegralIndex(indexes[0], out var index))
{
var arrayLength = _element.GetArrayLength();
if (index >= 0 && index < arrayLength)
{
result = Wrap(_element[index]);
result = Wrap(_element[(int)index]);
return true;
}
}
@@ -53,6 +58,28 @@ public class DynamicJsonElement : DynamicObject
return false;
}
private static bool TryGetIntegralIndex(object? value, out long index)
{
switch (value)
{
case int i: index = i; return true;
case long l: index = l; return true;
case short s: index = s; return true;
case byte b: index = b; return true;
case sbyte sb: index = sb; return true;
case ushort us: index = us; return true;
case uint ui: index = ui; return true;
case ulong ul when ul <= long.MaxValue: index = (long)ul; return true;
// Whole-valued floating-point indices are accepted too: arithmetic on
// wrapped JSON numbers can yield a double/decimal even for an integer result.
case double d when d >= long.MinValue && d <= long.MaxValue && d == Math.Floor(d):
index = (long)d; return true;
case decimal m when m == Math.Floor(m):
index = (long)m; return true;
default: index = 0; return false;
}
}
public override bool TryConvert(ConvertBinder binder, out object? result)
{
// Conversion to object (or dynamic): never null out a present value. Return the
@@ -102,11 +129,23 @@ public class DynamicJsonElement : DynamicObject
JsonValueKind.Object => new DynamicJsonElement(element),
JsonValueKind.Array => new DynamicJsonElement(element),
JsonValueKind.String => element.GetString(),
JsonValueKind.Number => element.TryGetInt64(out var l) ? l : element.GetDouble(),
JsonValueKind.Number => WrapNumber(element),
JsonValueKind.True => true,
JsonValueKind.False => false,
JsonValueKind.Null => null,
_ => element.GetRawText()
};
}
private static object WrapNumber(JsonElement element)
{
// An integral JSON number must box as long, not double. A ternary
// (TryGetInt64 ? long : double) would unify both branches to double and
// silently widen the long, so an integral index read back out of the wrapper
// (e.g. obj.items[obj.count - 1]) would arrive as a double. Box each case
// with its own typed return so the runtime type is preserved.
if (element.TryGetInt64(out var l))
return l;
return element.GetDouble();
}
}
@@ -20,6 +20,14 @@ public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
private readonly ILogger? _logger;
private readonly ConcurrentDictionary<string, CancellationTokenSource> _subscriptions = new();
/// <summary>
/// The gRPC endpoint (site node address) this client is bound to. The
/// <see cref="SiteStreamGrpcClientFactory"/> compares this against the requested
/// endpoint so a NodeA→NodeB failover flip (or a site address edit) is honoured
/// rather than served stale from cache.
/// </summary>
public virtual string Endpoint { get; } = string.Empty;
/// <summary>
/// The HTTP/2 keepalive ping delay actually applied to this client's channel.
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
@@ -44,6 +52,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
/// </summary>
public SiteStreamGrpcClient(string endpoint, ILogger logger, CommunicationOptions options)
{
Endpoint = endpoint;
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay;
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout;
_channel = GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
@@ -67,6 +76,16 @@ public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
{
}
/// <summary>
/// Protected constructor for unit testing — records the endpoint without
/// opening a real gRPC channel, so endpoint-aware factory behaviour can be
/// exercised by test doubles.
/// </summary>
protected SiteStreamGrpcClient(string endpoint)
{
Endpoint = endpoint;
}
/// <summary>
/// Creates a test-only instance that has no gRPC channel. Used to test
/// Unsubscribe and Dispose behavior without needing a real endpoint.
@@ -32,13 +32,40 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
}
/// <summary>
/// Returns an existing client for the site or creates a new one. The new
/// client is created via <see cref="CreateClient"/> and tracked so the
/// factory's <see cref="Dispose"/> / <see cref="DisposeAsync"/> release it.
/// Returns the cached client for the site, or creates a new one. If a client is
/// already cached but bound to a *different* <paramref name="grpcEndpoint"/> — the
/// NodeA→NodeB failover flip, or a site whose gRPC address was edited — the stale
/// client is disposed and replaced with one bound to the requested endpoint.
/// Communication-012/013: keying purely by site identifier and ignoring the
/// endpoint on a cache hit defeated debug-stream node failover and meant a
/// corrected gRPC address never took effect without a central restart.
/// </summary>
public virtual SiteStreamGrpcClient GetOrCreate(string siteIdentifier, string grpcEndpoint)
{
return _clients.GetOrAdd(siteIdentifier, _ => CreateClient(grpcEndpoint));
// Fast path: a client is cached and already bound to the requested endpoint.
if (_clients.TryGetValue(siteIdentifier, out var existing) &&
string.Equals(existing.Endpoint, grpcEndpoint, StringComparison.Ordinal))
{
return existing;
}
// Either no client is cached, or the cached one is bound to a different
// endpoint. AddOrUpdate atomically installs a client for the requested
// endpoint; the prior (stale) client, if any, is disposed afterwards.
SiteStreamGrpcClient? stale = null;
var client = _clients.AddOrUpdate(
siteIdentifier,
_ => CreateClient(grpcEndpoint),
(_, current) =>
{
if (string.Equals(current.Endpoint, grpcEndpoint, StringComparison.Ordinal))
return current;
stale = current;
return CreateClient(grpcEndpoint);
});
stale?.Dispose();
return client;
}
/// <summary>
@@ -53,7 +80,11 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
}
/// <summary>
/// Removes and disposes the client for the given site.
/// Removes and disposes the client for the given site. Site *address changes* are
/// now handled transparently by <see cref="GetOrCreate"/> (it disposes and recreates
/// a client whose endpoint no longer matches). This method remains the disposal
/// path for full site *removal* — call it when a site record is deleted so its
/// cached gRPC client does not linger for the life of the process.
/// </summary>
public async Task RemoveSiteAsync(string siteIdentifier)
{
@@ -95,6 +95,18 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
if (!_ready)
throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server not ready"));
// Communication-014: correlation_id arrives off the wire on a public gRPC
// endpoint and is used (below) to compose an Akka actor name. Akka actor names
// have a restricted character set — a id containing '/', whitespace, or other
// disallowed characters would make ActorOf throw InvalidActorNameException,
// escaping as an unhandled RPC fault. Reject unsafe ids cleanly up front.
if (string.IsNullOrEmpty(request.CorrelationId) ||
!ActorPath.IsValidPathElement(request.CorrelationId))
{
throw new RpcException(new GrpcStatus(
StatusCode.InvalidArgument, "correlation_id is missing or not a valid identifier"));
}
// Duplicate prevention -- cancel existing stream for this correlationId
if (_activeStreams.TryRemove(request.CorrelationId, out var existingEntry))
{
@@ -1,6 +1,8 @@
using Microsoft.AspNetCore.DataProtection;
using Microsoft.AspNetCore.DataProtection.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.ExternalSystems;
@@ -85,6 +87,19 @@ public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
// Data Protection Keys (for shared ASP.NET Data Protection across nodes)
public DbSet<DataProtectionKey> DataProtectionKeys => Set<DataProtectionKey>();
protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
{
base.OnConfiguring(optionsBuilder);
// The secret-column converter built in OnModelCreating differs depending on whether
// a real Data Protection provider was supplied (encrypting converter) or not
// (schema-only converter). EF Core's default model cache keys only on context type,
// so a provider-bearing and a schema-only context sharing the same options type
// would otherwise share one cached model — and whichever was built first would win.
// Distinguish them so each gets its own model.
optionsBuilder.ReplaceService<IModelCacheKeyFactory, SecretAwareModelCacheKeyFactory>();
}
protected override void OnModelCreating(ModelBuilder modelBuilder)
{
modelBuilder.ApplyConfigurationsFromAssembly(typeof(ScadaLinkDbContext).Assembly);
@@ -92,23 +107,106 @@ public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
ApplySecretColumnEncryption(modelBuilder);
}
public override int SaveChanges(bool acceptAllChangesOnSuccess)
{
GuardSecretWritesHaveAKeyRing();
return base.SaveChanges(acceptAllChangesOnSuccess);
}
public override Task<int> SaveChangesAsync(
bool acceptAllChangesOnSuccess, CancellationToken cancellationToken = default)
{
GuardSecretWritesHaveAKeyRing();
return base.SaveChangesAsync(acceptAllChangesOnSuccess, cancellationToken);
}
/// <summary>True when this context was constructed with a real Data Protection provider.</summary>
internal bool HasSecretEncryptionProvider => _dataProtectionProvider is not null;
/// <summary>
/// Fails fast — before any database round-trip — if a context built without a real
/// Data Protection key ring (the schema-only single-argument constructor) is about to
/// persist a non-null secret-bearing column. Without this guard the schema-only
/// protector would still throw, but only deep inside EF's update pipeline wrapped in a
/// <c>DbUpdateException</c>/<c>CryptographicException</c>; surfacing a clear
/// <see cref="InvalidOperationException"/> here makes the misconfiguration obvious.
/// </summary>
private void GuardSecretWritesHaveAKeyRing()
{
if (_dataProtectionProvider is not null)
return;
foreach (var entry in ChangeTracker.Entries())
{
if (entry.State is not (EntityState.Added or EntityState.Modified))
continue;
string? secretProperty = entry.Entity switch
{
SmtpConfiguration => nameof(SmtpConfiguration.Credentials),
ExternalSystemDefinition => nameof(ExternalSystemDefinition.AuthConfiguration),
DatabaseConnectionDefinition => nameof(DatabaseConnectionDefinition.ConnectionString),
_ => null
};
if (secretProperty is null)
continue;
if (entry.Property(secretProperty).CurrentValue is not null)
{
throw new InvalidOperationException(
"This ScadaLinkDbContext was constructed without a Data Protection key " +
"ring (the single-argument, schema-only constructor). It cannot persist " +
$"the secret-bearing column '{entry.Entity.GetType().Name}.{secretProperty}'. " +
"Construct the context with the DI-registered IDataProtectionProvider " +
"(AddConfigurationDatabase wires this up).");
}
}
}
/// <summary>
/// Model cache key factory that folds <see cref="HasSecretEncryptionProvider"/> into the
/// cache key, so a write-capable (provider-bearing) context and a schema-only context do
/// not share a cached model.
/// </summary>
private sealed class SecretAwareModelCacheKeyFactory : IModelCacheKeyFactory
{
public object Create(DbContext context, bool designTime)
=> (context.GetType(),
designTime,
(context as ScadaLinkDbContext)?.HasSecretEncryptionProvider ?? false);
public object Create(DbContext context) => Create(context, false);
}
/// <summary>
/// Applies encryption-at-rest to columns that hold authentication secrets
/// (SMTP credentials, external-system auth config, database connection strings)
/// so they are never persisted as plaintext.
/// </summary>
/// <remarks>
/// When no Data Protection provider is supplied (design-time <c>dotnet ef</c> tooling,
/// which only emits schema and never reads or writes secret data), an ephemeral provider
/// is used. The encrypted-column type is <c>nvarchar</c> either way, so the generated
/// schema is identical regardless of which provider is in effect. The runtime path always
/// receives the DI-registered provider whose keys are persisted to this database.
/// When no Data Protection provider is supplied design-time <c>dotnet ef</c> tooling,
/// which only emits schema and never reads or writes secret data — a schema-only
/// protector is used. It produces an identical <c>nvarchar</c> schema, but throws a
/// clear <see cref="InvalidOperationException"/> if a secret column is ever read or
/// written through it. This deliberately does NOT silently substitute an ephemeral
/// (in-memory, process-lifetime) key: encrypting a runtime write with a throwaway key
/// would persist ciphertext that becomes permanently undecryptable on the next process
/// restart, with no error. A write-capable context must be constructed with the
/// DI-registered provider whose keys are persisted to this database.
/// </remarks>
private void ApplySecretColumnEncryption(ModelBuilder modelBuilder)
{
IDataProtectionProvider provider = _dataProtectionProvider ?? new EphemeralDataProtectionProvider();
var converter = new EncryptedStringConverter(
provider.CreateProtector(EncryptedStringConverter.ProtectorPurpose));
IDataProtector protector = _dataProtectionProvider is { } provider
? provider.CreateProtector(EncryptedStringConverter.ProtectorPurpose)
: SchemaOnlyDataProtector.Instance;
// Held as the non-generic ValueConverter base so all three HasConversion calls
// read identically. The converter's CLR type is string?->string?; binding it to
// the non-nullable DatabaseConnectionDefinition.ConnectionString property would
// otherwise raise a CS8620 nullability mismatch — the non-generic reference is
// the supported way to apply one converter uniformly across nullable and
// non-nullable string columns.
ValueConverter converter = new EncryptedStringConverter(protector);
modelBuilder.Entity<SmtpConfiguration>()
.Property(s => s.Credentials)
@@ -120,6 +218,29 @@ public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
modelBuilder.Entity<DatabaseConnectionDefinition>()
.Property(d => d.ConnectionString)
.HasConversion((Microsoft.EntityFrameworkCore.Storage.ValueConversion.ValueConverter)converter);
.HasConversion(converter);
}
/// <summary>
/// An <see cref="IDataProtector"/> for contexts built without a real Data Protection
/// provider (design-time / schema-only). It satisfies model building but fails fast
/// with a clear message if a secret column is actually read or written, rather than
/// silently producing throwaway ciphertext that cannot be decrypted after a restart.
/// </summary>
private sealed class SchemaOnlyDataProtector : IDataProtector
{
internal static readonly SchemaOnlyDataProtector Instance = new();
private const string Message =
"This ScadaLinkDbContext was constructed without a Data Protection key ring " +
"(the single-argument, schema-only constructor). Secret-bearing configuration " +
"columns cannot be read or written through it. Construct the context with the " +
"DI-registered IDataProtectionProvider (AddConfigurationDatabase wires this up).";
public IDataProtector CreateProtector(string purpose) => this;
public byte[] Protect(byte[] plaintext) => throw new InvalidOperationException(Message);
public byte[] Unprotect(byte[] protectedData) => throw new InvalidOperationException(Message);
}
}
@@ -410,8 +410,14 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
}
else
{
_log.Warning("[{0}] Connection failed: {1}. Retrying in {2}s",
_connectionName, result.Error, _options.ReconnectInterval.TotalSeconds);
// DataConnectionLayer-015: the INITIAL connect must participate in the
// failover counter exactly like a reconnect. Without this a primary that is
// unreachable when the actor first starts (fresh deployment, site restart, or
// a primary simply down) is retried forever and the configured backup is
// never tried. Count the failure and switch endpoint once the retry count is
// exhausted, then re-arm the timer.
_consecutiveFailures++;
CountFailureAndMaybeFailover(result.Error);
Timers.StartSingleTimer("reconnect", new AttemptConnect(), _options.ReconnectInterval);
}
}
@@ -439,59 +445,69 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
else
{
_consecutiveFailures++;
// Failover: switch endpoint after exhausting retry count (only if backup is configured)
if (_backupConfig != null && _consecutiveFailures >= _failoverRetryCount)
{
var previousEndpoint = _activeEndpoint;
_activeEndpoint = _activeEndpoint == ActiveEndpoint.Primary
? ActiveEndpoint.Backup
: ActiveEndpoint.Primary;
_consecutiveFailures = 0;
var newConfig = _activeEndpoint == ActiveEndpoint.Primary
? _primaryConfig
: _backupConfig;
// Dispose old adapter (fire-and-forget — don't await in actor context)
_adapter.Disconnected -= OnAdapterDisconnected;
_ = _adapter.DisposeAsync().AsTask();
// Create new adapter for the target endpoint
_adapter = _factory.Create(_protocolType, newConfig);
_connectionDetails = newConfig;
// Wire disconnect handler on new adapter
_adapter.Disconnected += OnAdapterDisconnected;
// DataConnectionLayer-011: new adapter — bump the generation so callbacks
// from the disposed adapter are recognised as stale and dropped.
_adapterGeneration++;
_log.Warning("[{0}] Failing over from {1} to {2}",
_connectionName, previousEndpoint, _activeEndpoint);
// Log failover event to site event log
if (_siteEventLogger != null)
{
_ = _siteEventLogger.LogEventAsync(
"connection", "Warning", null, _connectionName,
$"Failover from {previousEndpoint} to {_activeEndpoint}",
$"After {_failoverRetryCount} consecutive failures");
}
}
else
{
var retryLimit = _backupConfig != null ? _failoverRetryCount.ToString() : "∞";
_log.Warning("[{0}] Reconnect failed: {1}. Retrying in {2}s (attempt {3}/{4})",
_connectionName, result.Error, _options.ReconnectInterval.TotalSeconds,
_consecutiveFailures, retryLimit);
}
CountFailureAndMaybeFailover(result.Error);
Timers.StartSingleTimer("reconnect", new AttemptConnect(), _options.ReconnectInterval);
}
}
/// <summary>
/// Shared connect-failure handling for both the initial connect (Connecting state)
/// and reconnect (Reconnecting state). Assumes <see cref="_consecutiveFailures"/> has
/// already been incremented for the current failure. Switches to the other endpoint
/// once the retry count is exhausted and a backup is configured
/// (DataConnectionLayer-015 brought the initial connect onto this path).
/// </summary>
private void CountFailureAndMaybeFailover(string? error)
{
// Failover: switch endpoint after exhausting retry count (only if backup is configured)
if (_backupConfig != null && _consecutiveFailures >= _failoverRetryCount)
{
var previousEndpoint = _activeEndpoint;
_activeEndpoint = _activeEndpoint == ActiveEndpoint.Primary
? ActiveEndpoint.Backup
: ActiveEndpoint.Primary;
_consecutiveFailures = 0;
var newConfig = _activeEndpoint == ActiveEndpoint.Primary
? _primaryConfig
: _backupConfig;
// Dispose old adapter (fire-and-forget — don't await in actor context)
_adapter.Disconnected -= OnAdapterDisconnected;
_ = _adapter.DisposeAsync().AsTask();
// Create new adapter for the target endpoint
_adapter = _factory.Create(_protocolType, newConfig);
_connectionDetails = newConfig;
// Wire disconnect handler on new adapter
_adapter.Disconnected += OnAdapterDisconnected;
// DataConnectionLayer-011: new adapter — bump the generation so callbacks
// from the disposed adapter are recognised as stale and dropped.
_adapterGeneration++;
_log.Warning("[{0}] Failing over from {1} to {2}",
_connectionName, previousEndpoint, _activeEndpoint);
// Log failover event to site event log
if (_siteEventLogger != null)
{
_ = _siteEventLogger.LogEventAsync(
"connection", "Warning", null, _connectionName,
$"Failover from {previousEndpoint} to {_activeEndpoint}",
$"After {_failoverRetryCount} consecutive failures");
}
}
else
{
var retryLimit = _backupConfig != null ? _failoverRetryCount.ToString() : "∞";
_log.Warning("[{0}] Connect failed: {1}. Retrying in {2}s (attempt {3}/{4})",
_connectionName, error, _options.ReconnectInterval.TotalSeconds,
_consecutiveFailures, retryLimit);
}
}
private void HandleDisconnect()
{
_log.Warning("[{0}] AdapterDisconnected message received — transitioning to Reconnecting", _connectionName);
@@ -663,8 +679,18 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_options.TagResolutionRetryInterval);
}
msg.ReplyTo.Tell(new SubscribeTagsResponse(
msg.Request.CorrelationId, instanceName, true, null, DateTimeOffset.UtcNow));
// DataConnectionLayer-016: the response must match the actor's own assessment.
// When a connection-level failure is driving the actor into Reconnecting, the
// tags were never subscribed at the adapter — replying Success: true would tell
// the Instance Actor the subscribe succeeded when it did not. Genuine
// tag-resolution failures stay Success: true (they are a runtime quality concern
// tracked via _unresolvedTags, with a Bad-quality TagValueUpdate already pushed).
msg.ReplyTo.Tell(connectionLevelFailure
? new SubscribeTagsResponse(
msg.Request.CorrelationId, instanceName, false,
"connection unavailable — will re-subscribe on reconnect", DateTimeOffset.UtcNow)
: new SubscribeTagsResponse(
msg.Request.CorrelationId, instanceName, true, null, DateTimeOffset.UtcNow));
// The caller (Connected state only) decides whether to enter Reconnecting.
// In Connecting/Reconnecting the connection is not established anyway, so the
@@ -228,10 +228,28 @@ public class OpcUaDataConnection : IDataConnection
public async Task<IReadOnlyDictionary<string, WriteResult>> WriteBatchAsync(IDictionary<string, object?> values, CancellationToken cancellationToken = default)
{
// DataConnectionLayer-017: a mid-batch fault must not abort the whole batch.
// WriteAsync calls EnsureConnected(), which throws InvalidOperationException when
// the connection drops partway through; catch per-tag exceptions and record a
// failed WriteResult so the caller (including WriteBatchAndWaitAsync) receives a
// complete result map. OperationCanceledException is still propagated so a
// cancelled batch aborts as a whole — mirrors the DCL-007 fix for ReadBatchAsync.
var results = new Dictionary<string, WriteResult>();
foreach (var (tagPath, value) in values)
{
results[tagPath] = await WriteAsync(tagPath, value, cancellationToken);
try
{
results[tagPath] = await WriteAsync(tagPath, value, cancellationToken);
}
catch (OperationCanceledException)
{
// Cancellation aborts the whole batch — propagate it.
throw;
}
catch (Exception ex)
{
results[tagPath] = new WriteResult(false, ex.Message);
}
}
return results;
}
@@ -316,11 +316,24 @@ public class RealOpcUaClientFactory : IOpcUaClientFactory
{
private readonly OpcUaGlobalOptions _globalOptions;
// DataConnectionLayer-014: a real logger must be threaded through to every
// RealOpcUaClient this factory builds, otherwise the DCL-012 auto-accept-certificate
// warning emitted in RealOpcUaClient.ConnectAsync sinks into NullLogger and is never
// seen in production. The factory is constructed by DataConnectionFactory, which has
// an ILoggerFactory available.
private readonly ILoggerFactory _loggerFactory;
public RealOpcUaClientFactory() : this(new OpcUaGlobalOptions()) { }
public RealOpcUaClientFactory(OpcUaGlobalOptions globalOptions)
: this(globalOptions, NullLoggerFactory.Instance) { }
public RealOpcUaClientFactory(OpcUaGlobalOptions globalOptions, ILoggerFactory loggerFactory)
{
_globalOptions = globalOptions;
_loggerFactory = loggerFactory;
}
public IOpcUaClient Create() => new RealOpcUaClient(_globalOptions);
public IOpcUaClient Create() =>
new RealOpcUaClient(_globalOptions, _loggerFactory.CreateLogger<RealOpcUaClient>());
}
@@ -22,9 +22,12 @@ public class DataConnectionFactory : IDataConnectionFactory
_loggerFactory = loggerFactory;
var globalOptions = opcUaGlobalOptions.Value;
// Register built-in protocols
// Register built-in protocols.
// DataConnectionLayer-014: pass the ILoggerFactory into RealOpcUaClientFactory so
// the RealOpcUaClient it builds gets a real logger — without it the DCL-012
// auto-accept-certificate security warning is silently discarded by NullLogger.
RegisterAdapter("OpcUa", details => new OpcUaDataConnection(
new RealOpcUaClientFactory(globalOptions),
new RealOpcUaClientFactory(globalOptions, _loggerFactory),
_loggerFactory.CreateLogger<OpcUaDataConnection>()));
}
@@ -142,6 +142,10 @@ public class DeploymentService
return Result<DeploymentRecord>.Failure($"Pre-deployment validation failed: {errors}");
}
// Serialize for transmission (also the payload stored in the deployed
// snapshot on success / reconciliation).
var configJson = JsonSerializer.Serialize(flattenedConfig);
// DeploymentManager-006: query-the-site-before-redeploy idempotency.
// If a prior deployment for this instance is stuck InProgress or Failed
// due to a timeout, the site may have actually applied the config. Query
@@ -150,13 +154,10 @@ public class DeploymentService
// Idempotency"). A clean prior Success or a fresh first-time deploy
// skips this extra round-trip.
var reconciled = await TryReconcileWithSiteAsync(
instance, revisionHash, cancellationToken);
instance, revisionHash, configJson, cancellationToken);
if (reconciled != null)
return Result<DeploymentRecord>.Success(reconciled);
// Serialize for transmission
var configJson = JsonSerializer.Serialize(flattenedConfig);
// WP-4: Create deployment record with Pending status
var record = new DeploymentRecord(deploymentId, user)
{
@@ -210,25 +211,8 @@ public class DeploymentService
// persistence below is best-effort: a failure here must be
// logged loudly for operator reconciliation but must not flip
// the already-committed Success record back to Failed.
try
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
await ApplyPostSuccessSideEffectsAsync(
instance, deploymentId, revisionHash, configJson, cancellationToken);
}
// Audit log
@@ -560,7 +544,12 @@ public class DeploymentService
}
/// <summary>
/// WP-2: After failover/timeout, query site for current deployment state before re-deploying.
/// WP-2: Returns the current persisted <see cref="DeploymentRecord"/> for
/// the given deployment ID from the configuration database. This is a pure
/// local DB read — it does not contact the site. The query-the-site-before-
/// redeploy reconciliation (design: "Deployment Identity &amp; Idempotency")
/// lives in <see cref="TryReconcileWithSiteAsync"/>, which
/// <see cref="DeployInstanceAsync"/> invokes on the deploy path.
/// </summary>
public async Task<DeploymentRecord?> GetDeploymentStatusAsync(
string deploymentId,
@@ -580,9 +569,14 @@ public class DeploymentService
/// prior <see cref="DeploymentStatus.Success"/> skip the extra round-trip.
///
/// Reconciliation: if the site already has the TARGET revision hash, the
/// prior record is marked <see cref="DeploymentStatus.Success"/> and
/// returned (the caller must NOT re-send the deploy). Otherwise <c>null</c>
/// is returned and the normal deploy proceeds.
/// prior record is marked <see cref="DeploymentStatus.Success"/> (with its
/// <see cref="DeploymentRecord.RevisionHash"/> corrected to the target —
/// DeploymentManager-016) and returned (the caller must NOT re-send the
/// deploy). The same post-success side effects as the normal deploy path
/// are applied — instance <see cref="InstanceState.Enabled"/> and a stored
/// <see cref="DeployedConfigSnapshot"/> (DeploymentManager-015) — so central
/// and site state do not diverge. Otherwise <c>null</c> is returned and the
/// normal deploy proceeds.
///
/// Query failure: if the site is unreachable or the query times out, this
/// returns <c>null</c> (fall through to a normal deploy) — site-side
@@ -592,6 +586,7 @@ public class DeploymentService
private async Task<DeploymentRecord?> TryReconcileWithSiteAsync(
Instance instance,
string targetRevisionHash,
string configJson,
CancellationToken cancellationToken)
{
var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
@@ -639,10 +634,23 @@ public class DeploymentService
prior.Status = DeploymentStatus.Success;
prior.ErrorMessage = null;
prior.CompletedAt = DateTimeOffset.UtcNow;
// DeploymentManager-016: the prior record can legitimately carry a
// different (stale) revision hash than the current target. The site
// confirmed it is running the target revision, so the persisted
// record, the audit entry below, and the site must all agree.
prior.RevisionHash = targetRevisionHash;
await _repository.UpdateDeploymentRecordAsync(prior, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
NotifyStatusChange(prior);
// DeploymentManager-015: a reconciled deployment must perform the
// SAME post-success side effects as the normal deploy path — set
// the instance State to Enabled and store/refresh the deployed
// config snapshot — otherwise the central state machine and the
// deployed-snapshot invariant diverge from what the site is running.
await ApplyPostSuccessSideEffectsAsync(
instance, prior.DeploymentId, targetRevisionHash, configJson, cancellationToken);
await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
instance.Id.ToString(), instance.UniqueName,
new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
@@ -669,6 +677,47 @@ public class DeploymentService
&& prior.ErrorMessage != null
&& prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));
/// <summary>
/// Post-success side effects shared by the normal deploy path and the
/// DeploymentManager-006 reconciliation path: set the instance
/// <see cref="InstanceState.Enabled"/> (WP-4) and store/refresh the
/// deployed config snapshot (WP-8). Factored into one helper so the two
/// paths cannot drift (DeploymentManager-015).
///
/// Best-effort: the deployment record's terminal <see cref="DeploymentStatus.Success"/>
/// status is already committed by the caller before this runs. A failure
/// here is logged loudly for operator reconciliation but is NOT propagated —
/// it must not flip the already-committed Success record back to Failed.
/// </summary>
private async Task ApplyPostSuccessSideEffectsAsync(
Instance instance,
string deploymentId,
string revisionHash,
string configJson,
CancellationToken cancellationToken)
{
try
{
// WP-4: Update instance state to Enabled on successful deployment
instance.State = InstanceState.Enabled;
await _repository.UpdateInstanceAsync(instance, cancellationToken);
// WP-8: Store deployed config snapshot
await StoreDeployedSnapshotAsync(
instance.Id, deploymentId, revisionHash, configJson, cancellationToken);
await _repository.SaveChangesAsync(cancellationToken);
}
catch (Exception postEx)
{
_logger.LogError(postEx,
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
"recorded Success, but post-success persistence (instance state / config snapshot) " +
"failed -- central and site state may diverge until reconciled",
deploymentId, instance.UniqueName);
}
}
private async Task StoreDeployedSnapshotAsync(
int instanceId,
string deploymentId,
@@ -96,15 +96,20 @@ public class DatabaseGateway : IDatabaseGateway
Parameters = parameters
});
// The per-connection retry settings are passed through verbatim — a
// configured MaxRetries of 0 means "never retry" and must NOT be
// collapsed to the S&F default (ExternalSystemGateway-004).
// ExternalSystemGateway-015: the entity's MaxRetries is a non-nullable int
// whose default is 0, and the Store-and-Forward engine interprets a stored
// MaxRetries of 0 as "no limit" (retry forever) — see
// StoreAndForwardMessage.MaxRetries ("0 = no limit") and the retry-sweep
// guard `MaxRetries > 0 && ...`. Passing 0 verbatim would turn every
// unconfigured cached write into an unbounded retry loop. A 0 is treated as
// "unset" and passed as null so the bounded S&F default applies; the
// RetryDelay default of TimeSpan.Zero is likewise unset.
await _storeAndForward.EnqueueAsync(
StoreAndForwardCategory.CachedDbWrite,
connectionName,
payload,
originInstanceName,
definition.MaxRetries,
definition.MaxRetries > 0 ? definition.MaxRetries : null,
definition.RetryDelay > TimeSpan.Zero ? definition.RetryDelay : null);
}
@@ -114,15 +114,20 @@ public class ExternalSystemClient : IExternalSystemClient
// attempt above; letting EnqueueAsync re-invoke the handler would
// dispatch the same request a second time.
//
// The per-system retry settings are passed through verbatim — a
// configured MaxRetries of 0 means "never retry" and must NOT be
// collapsed to the S&F default (ExternalSystemGateway-004).
// ExternalSystemGateway-015: the entity's MaxRetries is a non-nullable
// int whose default is 0, and the Store-and-Forward engine interprets a
// stored MaxRetries of 0 as "no limit" (retry forever) — see
// StoreAndForwardMessage.MaxRetries ("0 = no limit") and the retry-sweep
// guard `MaxRetries > 0 && ...`. Passing 0 verbatim would therefore turn
// every unconfigured cached call into an unbounded retry loop. A 0 is
// treated as "unset" and passed as null so the bounded S&F default
// applies; the RetryDelay default of TimeSpan.Zero is likewise unset.
await _storeAndForward.EnqueueAsync(
StoreAndForwardCategory.ExternalSystem,
systemName,
payload,
originInstanceName,
system.MaxRetries,
system.MaxRetries > 0 ? system.MaxRetries : null,
system.RetryDelay > TimeSpan.Zero ? system.RetryDelay : null,
attemptImmediateDelivery: false);
@@ -329,7 +334,15 @@ public class ExternalSystemClient : IExternalSystemClient
var queryString = string.Join("&",
parameters.Where(p => p.Value != null)
.Select(p => $"{Uri.EscapeDataString(p.Key)}={Uri.EscapeDataString(p.Value?.ToString() ?? "")}"));
url += "?" + queryString;
// Only append "?" when the effective query string is non-empty — a method
// whose parameter values are all null produces no query string, and the
// URL must then be identical to the no-parameters case rather than ending
// in a bare "?" (ExternalSystemGateway-017).
if (queryString.Length > 0)
{
url += "?" + queryString;
}
}
return url;
@@ -1,4 +1,5 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Http;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Interfaces.Services;
@@ -6,6 +7,12 @@ namespace ScadaLink.ExternalSystemGateway;
public static class ServiceCollectionExtensions
{
/// <summary>
/// Name prefix of the per-system <see cref="System.Net.Http.HttpClient"/> clients
/// created by <see cref="ExternalSystemClient"/> (<c>ExternalSystem_{systemName}</c>).
/// </summary>
internal const string GatewayClientNamePrefix = "ExternalSystem_";
public static IServiceCollection AddExternalSystemGateway(this IServiceCollection services)
{
services.AddOptions<ExternalSystemGatewayOptions>()
@@ -13,20 +20,18 @@ public static class ServiceCollectionExtensions
services.AddHttpClient();
// ExternalSystemGateway-013: wire MaxConcurrentConnectionsPerSystem into the
// primary handler of every per-system named client ("ExternalSystem_{name}"),
// so the option an operator configures actually bounds concurrent connections
// instead of being silently ignored. ConfigureHttpClientDefaults applies to
// the dynamically-named clients created by ExternalSystemClient.
services.ConfigureHttpClientDefaults(builder =>
builder.ConfigurePrimaryHttpMessageHandler(sp =>
{
var options = sp.GetRequiredService<IOptions<ExternalSystemGatewayOptions>>().Value;
return new SocketsHttpHandler
{
MaxConnectionsPerServer = options.MaxConcurrentConnectionsPerSystem,
};
}));
// ExternalSystemGateway-013 / -016: wire MaxConcurrentConnectionsPerSystem
// into the primary handler of the gateway's per-system named clients
// ("ExternalSystem_{name}") only. The names are created dynamically, so a
// static AddHttpClient("name") registration is not possible; instead a
// post-configure on HttpClientFactoryOptions is applied, filtered by the
// client-name prefix. ConfigureHttpClientDefaults is deliberately NOT used —
// it is process-global and would replace the primary handler of every
// HttpClient in the host (e.g. the Notification Service's OAuth2 token
// client), silently capping and overriding unrelated components.
services.AddSingleton<IConfigureOptions<HttpClientFactoryOptions>>(sp =>
new GatewayHttpClientConfigurator(
sp.GetRequiredService<IOptionsMonitor<ExternalSystemGatewayOptions>>()));
services.AddScoped<ExternalSystemClient>();
services.AddScoped<IExternalSystemClient>(sp => sp.GetRequiredService<ExternalSystemClient>());
@@ -42,4 +47,41 @@ public static class ServiceCollectionExtensions
// Script Execution Actors run on dedicated blocking I/O dispatcher.
return services;
}
/// <summary>
/// ExternalSystemGateway-016: configures the primary HTTP message handler with the
/// gateway's <see cref="ExternalSystemGatewayOptions.MaxConcurrentConnectionsPerSystem"/>
/// cap, but only for the gateway's own named clients
/// (<see cref="GatewayClientNamePrefix"/>). Clients owned by other host components
/// are left untouched, so the cap does not leak process-wide.
/// </summary>
private sealed class GatewayHttpClientConfigurator
: IConfigureNamedOptions<HttpClientFactoryOptions>
{
private readonly IOptionsMonitor<ExternalSystemGatewayOptions> _options;
public GatewayHttpClientConfigurator(IOptionsMonitor<ExternalSystemGatewayOptions> options)
{
_options = options;
}
public void Configure(HttpClientFactoryOptions options)
{
// The default (unnamed) client is not a gateway client — do nothing.
}
public void Configure(string? name, HttpClientFactoryOptions options)
{
if (name == null || !name.StartsWith(GatewayClientNamePrefix, StringComparison.Ordinal))
{
return;
}
options.HttpMessageHandlerBuilderActions.Add(builder =>
builder.PrimaryHandler = new SocketsHttpHandler
{
MaxConnectionsPerServer = _options.CurrentValue.MaxConcurrentConnectionsPerSystem,
});
}
}
}
@@ -191,9 +191,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
"Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
_options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);
// Check at half the (shorter) offline timeout interval for timely detection
var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
using var timer = new PeriodicTimer(checkInterval);
// Check at half the shorter of the two offline timeouts so detection is
// timely for whichever site class (real or "central") has the tighter
// window — see ComputeCheckInterval.
using var timer = new PeriodicTimer(ComputeCheckInterval(_options));
while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
{
@@ -201,6 +202,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
}
}
/// <summary>
/// Computes the offline-check timer cadence: half of the <em>shorter</em> of
/// <see cref="HealthMonitoringOptions.OfflineTimeout"/> and
/// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/>. Deriving it
/// from the shorter timeout guarantees that whichever site class has the
/// tighter window is still polled at least twice within it — so if an
/// operator configures <c>CentralOfflineTimeout</c> smaller than
/// <c>OfflineTimeout</c>, central offline detection is not delayed by up to a
/// full <c>OfflineTimeout / 2</c>.
/// </summary>
internal static TimeSpan ComputeCheckInterval(HealthMonitoringOptions options)
{
var shorter = options.OfflineTimeout < options.CentralOfflineTimeout
? options.OfflineTimeout
: options.CentralOfflineTimeout;
return TimeSpan.FromMilliseconds(shorter.TotalMilliseconds / 2);
}
internal void CheckForOfflineSites()
{
var now = _timeProvider.GetUtcNow();
@@ -0,0 +1,59 @@
using Microsoft.Extensions.Options;
namespace ScadaLink.HealthMonitoring;
/// <summary>
/// HealthMonitoring-014: validates <see cref="HealthMonitoringOptions"/> at
/// startup. The interval values are fed straight into <c>new PeriodicTimer(...)</c>
/// (and into a division for the offline-check cadence); a zero or negative value
/// makes <see cref="PeriodicTimer"/>'s constructor throw
/// <see cref="ArgumentOutOfRangeException"/>, crashing the
/// <see cref="HealthReportSender"/> / <see cref="CentralHealthReportLoop"/> /
/// <see cref="CentralHealthAggregator"/> hosted service with an opaque exception
/// that does not name the offending config key. Registered with
/// <c>ValidateOnStart()</c> so a bad <c>ScadaLink:HealthMonitoring</c> section
/// fails fast at boot with a clear, key-naming message.
/// </summary>
public sealed class HealthMonitoringOptionsValidator : IValidateOptions<HealthMonitoringOptions>
{
public ValidateOptionsResult Validate(string? name, HealthMonitoringOptions options)
{
var failures = new List<string>();
if (options.ReportInterval <= TimeSpan.Zero)
{
failures.Add(
$"ScadaLink:HealthMonitoring:ReportInterval must be a positive duration " +
$"(was {options.ReportInterval}); it is used directly as a PeriodicTimer period.");
}
if (options.OfflineTimeout <= TimeSpan.Zero)
{
failures.Add(
$"ScadaLink:HealthMonitoring:OfflineTimeout must be a positive duration " +
$"(was {options.OfflineTimeout}); it drives the offline-check PeriodicTimer cadence.");
}
if (options.CentralOfflineTimeout <= TimeSpan.Zero)
{
failures.Add(
$"ScadaLink:HealthMonitoring:CentralOfflineTimeout must be a positive duration " +
$"(was {options.CentralOfflineTimeout}).");
}
if (options.OfflineTimeout > TimeSpan.Zero
&& options.CentralOfflineTimeout > TimeSpan.Zero
&& options.CentralOfflineTimeout < options.OfflineTimeout)
{
failures.Add(
$"ScadaLink:HealthMonitoring:CentralOfflineTimeout ({options.CentralOfflineTimeout}) " +
$"must be >= OfflineTimeout ({options.OfflineTimeout}): the synthetic 'central' site has " +
"no heartbeat source and is fed only by the slower self-report loop, so it needs at " +
"least as much offline grace as a real site.");
}
return failures.Count > 0
? ValidateOptionsResult.Fail(failures)
: ValidateOptionsResult.Success;
}
}
@@ -1,4 +1,6 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Options;
namespace ScadaLink.HealthMonitoring;
@@ -10,6 +12,7 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddSiteHealthMonitoring(this IServiceCollection services)
{
AddOptionsValidation(services);
services.AddSingleton<ISiteHealthCollector, SiteHealthCollector>();
services.AddHostedService<HealthReportSender>();
return services;
@@ -21,6 +24,7 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddHealthMonitoring(this IServiceCollection services)
{
AddOptionsValidation(services);
services.AddSingleton<ISiteHealthCollector, SiteHealthCollector>();
return services;
}
@@ -32,10 +36,27 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddCentralHealthAggregation(this IServiceCollection services)
{
AddOptionsValidation(services);
services.AddSingleton<CentralHealthAggregator>();
services.AddSingleton<ICentralHealthAggregator>(sp => sp.GetRequiredService<CentralHealthAggregator>());
services.AddHostedService(sp => sp.GetRequiredService<CentralHealthAggregator>());
services.AddHostedService<CentralHealthReportLoop>();
return services;
}
/// <summary>
/// HealthMonitoring-014: register the <see cref="HealthMonitoringOptionsValidator"/>
/// so a misconfigured <c>ScadaLink:HealthMonitoring</c> section (zero/negative
/// intervals, or a <c>CentralOfflineTimeout</c> shorter than
/// <c>OfflineTimeout</c>) is rejected with a clear, key-naming message when the
/// hosted services resolve their options at startup — rather than crashing
/// later inside a <see cref="PeriodicTimer"/> constructor with an opaque
/// <see cref="ArgumentOutOfRangeException"/>. Idempotent so it is safe when
/// more than one of the registration methods above is called.
/// </summary>
private static void AddOptionsValidation(IServiceCollection services)
{
services.TryAddEnumerable(
ServiceDescriptor.Singleton<IValidateOptions<HealthMonitoringOptions>, HealthMonitoringOptionsValidator>());
}
}
@@ -23,6 +23,18 @@ public class SiteHealthCollector : ISiteHealthCollector
private volatile string _nodeHostname = "";
private volatile IReadOnlyList<Commons.Messages.Health.NodeStatus>? _clusterNodes;
private volatile bool _isActiveNode;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Creates a collector. The <paramref name="timeProvider"/> stamps each
/// report's timestamp; it defaults to <see cref="TimeProvider.System"/> and
/// is injectable so the report timestamp is deterministically testable —
/// consistent with the rest of the module's time-dependent classes.
/// </summary>
public SiteHealthCollector(TimeProvider? timeProvider = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
}
/// <summary>
/// Increment the script error counter. Covers unhandled exceptions,
@@ -148,7 +160,7 @@ public class SiteHealthCollector : ISiteHealthCollector
return new SiteHealthReport(
SiteId: siteId,
SequenceNumber: 0, // Caller (HealthReportSender) assigns the sequence number
ReportTimestamp: DateTimeOffset.UtcNow,
ReportTimestamp: _timeProvider.GetUtcNow(),
DataConnectionStatuses: connectionStatuses,
TagResolutionCounts: tagResolution,
ScriptErrorCount: scriptErrors,
+30 -9
View File
@@ -67,7 +67,8 @@ public class AkkaHostedService : IHostedService
// interpolated value, so a hostname, seed node or strategy containing a quote,
// backslash or whitespace cannot corrupt the configuration document.
var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
transportHeartbeatSec, transportFailureSec);
_communicationOptions.TransportHeartbeatInterval,
_communicationOptions.TransportFailureThreshold);
var config = ConfigurationFactory.ParseString(hocon);
_actorSystem = ActorSystem.Create("scadalink", config);
@@ -106,13 +107,21 @@ public class AkkaHostedService : IHostedService
/// routed through <see cref="QuoteHocon"/> (string values) so a hostname,
/// seed-node URI, role or split-brain strategy containing a quote, backslash or
/// whitespace cannot corrupt the document or be silently misparsed (Host-006).
///
/// Host-012: the <c>keep-oldest down-if-alone</c> flag is emitted from
/// <see cref="ClusterOptions.DownIfAlone"/> rather than hard-coded, so the bound
/// configuration value is actually consumed.
///
/// Host-013: every duration is rendered via <see cref="DurationHocon"/> in
/// milliseconds, so sub-second cluster timing values (e.g. a 750ms heartbeat) are
/// preserved exactly instead of being rounded to whole seconds.
/// </summary>
public static string BuildHocon(
NodeOptions nodeOptions,
ClusterOptions clusterOptions,
IEnumerable<string> roles,
double transportHeartbeatSec,
double transportFailureSec)
TimeSpan transportHeartbeat,
TimeSpan transportFailure)
{
var seedNodesStr = string.Join(",",
clusterOptions.SeedNodes.Select(QuoteHocon));
@@ -132,8 +141,8 @@ akka {{
port = {nodeOptions.RemotingPort}
}}
transport-failure-detector {{
heartbeat-interval = {transportHeartbeatSec:F0}s
acceptable-heartbeat-pause = {transportFailureSec:F0}s
heartbeat-interval = {DurationHocon(transportHeartbeat)}
acceptable-heartbeat-pause = {DurationHocon(transportFailure)}
}}
}}
cluster {{
@@ -142,14 +151,14 @@ akka {{
min-nr-of-members = {clusterOptions.MinNrOfMembers}
split-brain-resolver {{
active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
stable-after = {clusterOptions.StableAfter.TotalSeconds:F0}s
stable-after = {DurationHocon(clusterOptions.StableAfter)}
keep-oldest {{
down-if-alone = on
down-if-alone = {(clusterOptions.DownIfAlone ? "on" : "off")}
}}
}}
failure-detector {{
heartbeat-interval = {clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
acceptable-heartbeat-pause = {clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
heartbeat-interval = {DurationHocon(clusterOptions.HeartbeatInterval)}
acceptable-heartbeat-pause = {DurationHocon(clusterOptions.FailureDetectionThreshold)}
}}
run-coordinated-shutdown-when-down = on
}}
@@ -159,6 +168,18 @@ akka {{
}}";
}
/// <summary>
/// Renders a <see cref="TimeSpan"/> as a HOCON duration in milliseconds. Akka's
/// HOCON parser accepts a <c>ms</c> suffix, so emitting whole milliseconds
/// preserves sub-second configuration exactly — a 750ms heartbeat stays 750ms
/// rather than being rounded to <c>1s</c> (or, for sub-half-second values,
/// silently collapsing to a degenerate <c>0s</c>) — Host-013.
/// </summary>
private static string DurationHocon(TimeSpan duration)
{
return $"{(long)Math.Round(duration.TotalMilliseconds)}ms";
}
/// <summary>
/// Renders a value as a HOCON double-quoted string, escaping backslashes and
/// double quotes so the resulting token cannot break out of its string literal.
@@ -8,11 +8,13 @@ namespace ScadaLink.Host;
///
/// REQ-HOST-8 / Host-011: the configured minimum level comes from
/// <c>ScadaLink:Logging:MinimumLevel</c> (bound to <see cref="LoggingOptions"/>) so an
/// operator editing that key changes the effective log level. The standard
/// <c>Serilog</c> configuration section is still read (via
/// <see cref="Serilog.Configuration.ConfigurationLoggerConfigurationExtensions"/>)
/// for sink/override customisation; the explicit <c>MinimumLevel.Is</c> below pins
/// the floor from <see cref="LoggingOptions"/>.
/// operator editing that key changes the effective log level.
///
/// REQ-HOST-8 / Host-014: the console and file sinks are read from the standard
/// <c>Serilog</c> configuration section via <c>ReadFrom.Configuration</c> — the sink
/// set, console output template, file path and rolling interval are all
/// configuration-driven (defined in <c>appsettings.json</c>), not hard-coded. The
/// explicit <c>MinimumLevel.Is</c> below pins the floor from <see cref="LoggingOptions"/>.
/// </summary>
public static class LoggerConfigurationFactory
{
+8 -4
View File
@@ -40,11 +40,12 @@ var siteId = configuration["ScadaLink:Node:SiteId"] ?? "central";
// WP-14: Serilog structured logging.
// Host-011: minimum level is driven by ScadaLink:Logging:MinimumLevel (LoggingOptions).
// Host-014: console and file sinks are defined in the `Serilog` configuration
// section (appsettings.json) and applied via ReadFrom.Configuration inside the
// factory — the sink set, output template, file path and rolling interval are all
// configuration-driven per REQ-HOST-8, not hard-coded here.
Log.Logger = ScadaLink.Host.LoggerConfigurationFactory
.Build(configuration, nodeRole, siteId, nodeHostname)
.WriteTo.Console(outputTemplate:
"[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}")
.WriteTo.File("logs/scadalink-.log", rollingInterval: Serilog.RollingInterval.Day)
.CreateLogger();
try
@@ -121,6 +122,8 @@ try
// Host-010: tolerate a database that is briefly unreachable at boot
// (e.g. app and DB containers starting together) with a bounded
// exponential backoff before failing fatally.
// Host-015: only connection-class (transient) faults are retried — a
// schema-version mismatch is permanent and must fail fast on attempt 1.
await StartupRetry.ExecuteWithRetryAsync(
"database-migration",
async () =>
@@ -131,7 +134,8 @@ try
},
maxAttempts: 8,
initialDelay: TimeSpan.FromSeconds(2),
migrationLogger);
migrationLogger,
isTransient: StartupRetry.IsTransientDatabaseFault);
}
// Middleware pipeline
+48 -2
View File
@@ -9,6 +9,12 @@ namespace ScadaLink.Host;
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
///
/// Host-015: only <em>transient</em> faults are retried. The optional
/// <c>isTransient</c> predicate classifies each exception; a permanent failure
/// (e.g. a database schema-version mismatch — which no amount of waiting can fix)
/// is rethrown immediately rather than being retried for minutes before the
/// inevitable fatal exit.
/// </summary>
public static class StartupRetry
{
@@ -18,8 +24,13 @@ public static class StartupRetry
int maxAttempts,
TimeSpan initialDelay,
ILogger logger,
Func<Exception, bool>? isTransient = null,
CancellationToken cancellationToken = default)
{
// Default: treat every exception as transient (preserves the pre-Host-015
// behaviour for callers that do not classify faults).
isTransient ??= static _ => true;
var delay = initialDelay;
for (var attempt = 1; ; attempt++)
{
@@ -33,10 +44,10 @@ public static class StartupRetry
operationName, attempt);
return;
}
catch (Exception ex) when (attempt < maxAttempts)
catch (Exception ex) when (attempt < maxAttempts && isTransient(ex))
{
logger.LogWarning(ex,
"Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " +
"Startup operation '{Operation}' failed (transient) on attempt {Attempt}/{MaxAttempts}; " +
"retrying in {Delay}.",
operationName, attempt, maxAttempts, delay);
await Task.Delay(delay, cancellationToken);
@@ -45,4 +56,39 @@ public static class StartupRetry
}
}
}
/// <summary>
/// Transient-fault classifier for the database-migration startup step (Host-015).
/// Returns <c>true</c> only for connection-class faults that a brief wait can
/// resolve — a SQL connection/transport error or a timeout — and <c>false</c>
/// for everything else (notably schema-validation <see cref="InvalidOperationException"/>s
/// raised by <c>MigrationHelper.ApplyOrValidateMigrationsAsync</c>, which are
/// permanent and must fail fast).
/// </summary>
public static bool IsTransientDatabaseFault(Exception ex)
{
// Unwrap a single layer of aggregation so a faulted Task surfaces correctly.
if (ex is AggregateException agg && agg.InnerException != null)
ex = agg.InnerException;
if (ex is TimeoutException)
return true;
// Socket / network errors raised while opening the connection.
if (ex is System.Net.Sockets.SocketException)
return true;
// Microsoft.Data.SqlClient throws SqlException; matching by type name keeps
// the Host free of a direct SqlClient package reference. A SqlException at
// the migration stage is, in practice, a connection failure (the server is
// not yet reachable) rather than a schema fault — schema mismatches surface
// as InvalidOperationException from the migration helper.
var typeName = ex.GetType().FullName;
if (typeName != null &&
(typeName.EndsWith("SqlException", StringComparison.Ordinal) ||
typeName.EndsWith("DbException", StringComparison.Ordinal)))
return true;
return false;
}
}
+21
View File
@@ -3,5 +3,26 @@
"LogLevel": {
"Default": "Information"
}
},
"Serilog": {
"Using": [
"Serilog.Sinks.Console",
"Serilog.Sinks.File"
],
"WriteTo": [
{
"Name": "Console",
"Args": {
"outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}"
}
},
{
"Name": "File",
"Args": {
"path": "logs/scadalink-.log",
"rollingInterval": "Day"
}
}
]
}
}
@@ -0,0 +1,31 @@
using ScadaLink.Commons.Messages.InboundApi;
using ScadaLink.Communication;
namespace ScadaLink.InboundAPI;
/// <summary>
/// Default <see cref="IInstanceRouter"/> implementation. Delegates every routed
/// call to <see cref="CommunicationService"/>, which dispatches to the target
/// site cluster via the central communication actor.
/// </summary>
public sealed class CommunicationServiceInstanceRouter : IInstanceRouter
{
private readonly CommunicationService _communicationService;
public CommunicationServiceInstanceRouter(CommunicationService communicationService)
{
_communicationService = communicationService;
}
public Task<RouteToCallResponse> RouteToCallAsync(
string siteId, RouteToCallRequest request, CancellationToken cancellationToken) =>
_communicationService.RouteToCallAsync(siteId, request, cancellationToken);
public Task<RouteToGetAttributesResponse> RouteToGetAttributesAsync(
string siteId, RouteToGetAttributesRequest request, CancellationToken cancellationToken) =>
_communicationService.RouteToGetAttributesAsync(siteId, request, cancellationToken);
public Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken) =>
_communicationService.RouteToSetAttributesAsync(siteId, request, cancellationToken);
}
@@ -15,6 +15,21 @@ namespace ScadaLink.InboundAPI;
/// sandbox — so a script can fully-qualify any referenced type. This static check
/// walks the script syntax tree and rejects any reference to a forbidden namespace,
/// whether reached through a <c>using</c> directive or a fully-qualified name.
///
/// <para>
/// InboundAPI-015: a purely namespace-textual deny-list is bypassable because
/// reflection is reachable through members of <em>permitted</em> types that never
/// spell a forbidden namespace, e.g.
/// <c>typeof(string).Assembly.GetType("System.IO.File")</c>. The walker therefore
/// also rejects a curated set of reflection-gateway member names (<c>GetType</c>,
/// <c>Assembly</c>, <c>GetMethod</c>, <c>InvokeMember</c>, <c>CreateInstance</c>, …)
/// and the <c>dynamic</c> keyword. This is hardening of a best-effort static check,
/// <strong>not</strong> a true sandbox — a determined script author may still find
/// a vector the syntax walker cannot see (see the security notes in
/// <c>code-reviews/InboundAPI/findings.md</c>, InboundAPI-015). The check is
/// defence-in-depth; genuine containment needs a runtime boundary (restricted
/// <c>AssemblyLoadContext</c> / curated reference set / out-of-process sandbox).
/// </para>
/// </summary>
public static class ForbiddenApiChecker
{
@@ -42,6 +57,40 @@ public static class ForbiddenApiChecker
"System.Threading.Tasks",
};
/// <summary>
/// InboundAPI-015: member names that are reflection gateways. Reaching any of
/// these — even off a permitted type such as <c>typeof(string)</c> or a plain
/// <c>string</c> — lets a script escape the namespace deny-list (obtain an
/// arbitrary <c>Type</c>, load an assembly, late-bind a method). They are
/// rejected regardless of the receiver expression. <c>Invoke</c> is deliberately
/// excluded because <c>Action</c>/<c>Func</c> delegate invocation is legitimate;
/// the reflection <c>MethodInfo.Invoke</c> path is already cut off by rejecting
/// the <c>GetMethod</c>/<c>GetConstructor</c> that produces the <c>MethodInfo</c>.
/// </summary>
private static readonly HashSet<string> ForbiddenMemberNames = new(StringComparer.Ordinal)
{
"GetType", // object.GetType() / Type.GetType(string) — yields a System.Type
"GetTypeInfo", // -> TypeInfo (reflection)
"Assembly", // Type.Assembly — yields a System.Reflection.Assembly
"Module", // Type.Module / MethodBase.Module
"CreateInstance", // Activator.CreateInstance / Assembly.CreateInstance
"InvokeMember", // Type.InvokeMember — late-bound dispatch
"GetMethod",
"GetMethods",
"GetConstructor",
"GetConstructors",
"GetField",
"GetFields",
"GetProperty",
"GetProperties",
"GetMember",
"GetMembers",
"GetRuntimeMethod",
"GetRuntimeMethods",
"MethodHandle", // RuntimeMethodHandle escape
"TypeHandle",
};
/// <summary>
/// Analyses the script source and returns the list of trust-model violations.
/// An empty list means the script is acceptable.
@@ -115,7 +164,42 @@ public static class ForbiddenApiChecker
return;
}
// InboundAPI-015: reject reflection-gateway members regardless of the
// receiver. typeof(string).Assembly.GetType("System.IO.File") never
// spells a forbidden namespace, but '.Assembly' and '.GetType' do
// appear here as the accessed member name.
var memberName = node.Name.Identifier.ValueText;
if (ForbiddenMemberNames.Contains(memberName))
{
_violations.Add($"forbidden reflection member access '.{memberName}'");
// Still descend: the receiver may contain a further violation.
}
base.VisitMemberAccessExpression(node);
}
public override void VisitIdentifierName(IdentifierNameSyntax node)
{
// InboundAPI-015: 'dynamic' widens late-bound member access that the
// static walker cannot see through — reject its use outright. The
// 'dynamic' contextual keyword surfaces as an identifier name.
if (node.Identifier.ValueText == "dynamic")
{
_violations.Add("forbidden use of the 'dynamic' keyword");
return;
}
// InboundAPI-015: a bare reference to the reflection entry-point types
// (e.g. 'Activator', 'Type') as an identifier. 'Activator' has no
// non-reflection use; flag it. ('Type' as an identifier is too broad
// to flag here — the gateway members above already cut off its use.)
if (node.Identifier.ValueText == "Activator")
{
_violations.Add("forbidden reflection type reference 'Activator'");
return;
}
base.VisitIdentifierName(node);
}
}
}
@@ -0,0 +1,22 @@
using ScadaLink.Commons.Messages.InboundApi;
namespace ScadaLink.InboundAPI;
/// <summary>
/// Seam over the cross-site routing transport used by <see cref="RouteHelper"/>.
/// The production implementation (<see cref="CommunicationServiceInstanceRouter"/>)
/// delegates to <c>ScadaLink.Communication.CommunicationService</c>; the interface
/// exists so <see cref="RouteHelper"/>/<see cref="RouteTarget"/> can be unit tested
/// without a live actor system (InboundAPI-017).
/// </summary>
public interface IInstanceRouter
{
Task<RouteToCallResponse> RouteToCallAsync(
string siteId, RouteToCallRequest request, CancellationToken cancellationToken);
Task<RouteToGetAttributesResponse> RouteToGetAttributesAsync(
string siteId, RouteToGetAttributesRequest request, CancellationToken cancellationToken);
Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken);
}
@@ -174,7 +174,10 @@ public class InboundScriptExecutor
try
{
var context = new InboundScriptContext(parameters, route, cts.Token);
// InboundAPI-016: bind the route helper to the method deadline so a
// routed Route.To(...).Call(...) inherits the method-level timeout
// without the script having to thread the context token by hand.
var context = new InboundScriptContext(parameters, route.WithDeadline(cts.Token), cts.Token);
if (!_scriptHandlers.TryGetValue(method.Name, out var handler))
{
@@ -202,6 +205,19 @@ public class InboundScriptExecutor
? JsonSerializer.Serialize(result)
: null;
// InboundAPI-014: validate the script's return value against the
// method's declared ReturnDefinition. A method whose script returns a
// shape inconsistent with its definition must not silently emit a
// malformed 200 — surface it as a script failure (500) and log.
var returnValidation = ReturnValueValidator.Validate(resultJson, method.ReturnDefinition);
if (!returnValidation.IsValid)
{
_logger.LogWarning(
"API method {Method} return value rejected: {Error}",
method.Name, returnValidation.ErrorMessage);
return new InboundScriptResult(false, null, "Method return value did not match its return definition");
}
return new InboundScriptResult(true, resultJson, null);
}
catch (OperationCanceledException)
@@ -0,0 +1,144 @@
using System.Text.Json;
namespace ScadaLink.InboundAPI;
/// <summary>
/// InboundAPI-014: validates a method script's return value against the method's
/// declared <c>ReturnDefinition</c>. <c>Component-InboundAPI.md</c> ("Return Value
/// Definition" / "Response Format") states the success body has "fields matching
/// the return value definition"; this is the response-side mirror of
/// <see cref="ParameterValidator"/>.
///
/// <para>
/// The return definition is a JSON array of <see cref="ReturnFieldDefinition"/>
/// (the same <c>{name,type}</c> shape as a parameter definition). A method whose
/// <c>ReturnDefinition</c> is null/empty is unconstrained — its return value is
/// serialized as-is (backward compatible). Primitive fields (Boolean / Integer /
/// Float / String) are type-checked; the extended <c>Object</c>/<c>List</c> types
/// are shape-checked only (object vs. array), consistent with how
/// <see cref="ParameterValidator"/> treats inbound extended types.
/// </para>
/// </summary>
public static class ReturnValueValidator
{
/// <summary>
/// Validates the serialized script result JSON against the method's return
/// definition. Returns <see cref="ReturnValidationResult.Valid"/> when no
/// definition is configured or the result conforms to it.
/// </summary>
public static ReturnValidationResult Validate(string? resultJson, string? returnDefinition)
{
if (string.IsNullOrWhiteSpace(returnDefinition))
{
// No declared return shape — the script's return value is unconstrained.
return ReturnValidationResult.Valid();
}
List<ReturnFieldDefinition> fields;
try
{
fields = JsonSerializer.Deserialize<List<ReturnFieldDefinition>>(
returnDefinition,
new JsonSerializerOptions { PropertyNameCaseInsensitive = true })
?? [];
}
catch (JsonException)
{
return ReturnValidationResult.Invalid(
"Invalid return definition in method configuration");
}
if (fields.Count == 0)
{
return ReturnValidationResult.Valid();
}
if (string.IsNullOrWhiteSpace(resultJson))
{
return ReturnValidationResult.Invalid(
"Method declares a return structure but the script returned no value");
}
JsonElement root;
try
{
using var doc = JsonDocument.Parse(resultJson);
root = doc.RootElement.Clone();
}
catch (JsonException)
{
return ReturnValidationResult.Invalid("Script return value is not valid JSON");
}
if (root.ValueKind != JsonValueKind.Object)
{
return ReturnValidationResult.Invalid(
"Method declares a return structure but the script did not return an object");
}
var errors = new List<string>();
foreach (var field in fields)
{
if (!root.TryGetProperty(field.Name, out var value))
{
errors.Add($"missing return field '{field.Name}'");
continue;
}
var typeError = CheckFieldType(value, field.Type, field.Name);
if (typeError != null)
errors.Add(typeError);
}
return errors.Count > 0
? ReturnValidationResult.Invalid(
$"Return value does not match the declared return definition: {string.Join("; ", errors)}")
: ReturnValidationResult.Valid();
}
private static string? CheckFieldType(JsonElement value, string declaredType, string fieldName)
{
// A null value satisfies any field type — the script may legitimately omit
// optional data; only a missing field (handled by the caller) is an error.
if (value.ValueKind == JsonValueKind.Null)
return null;
var ok = declaredType.ToLowerInvariant() switch
{
"boolean" => value.ValueKind is JsonValueKind.True or JsonValueKind.False,
"integer" => value.ValueKind == JsonValueKind.Number && value.TryGetInt64(out _),
"float" => value.ValueKind == JsonValueKind.Number,
"string" => value.ValueKind == JsonValueKind.String,
"object" => value.ValueKind == JsonValueKind.Object,
"list" => value.ValueKind == JsonValueKind.Array,
_ => true, // unknown declared type — do not block the response
};
return ok ? null : $"return field '{fieldName}' must be {declaredType}";
}
}
/// <summary>
/// InboundAPI-014: one field of a method's declared return structure — the
/// deserialized form of an entry in <c>ApiMethod.ReturnDefinition</c>. Defined in
/// this module (not Commons) because the inbound API is currently its only consumer.
/// </summary>
public sealed class ReturnFieldDefinition
{
public string Name { get; set; } = string.Empty;
public string Type { get; set; } = "String";
}
/// <summary>
/// Result of validating a script return value against a method's return definition.
/// </summary>
public sealed class ReturnValidationResult
{
public bool IsValid { get; private init; }
public string ErrorMessage { get; private init; } = string.Empty;
public static ReturnValidationResult Valid() => new() { IsValid = true };
public static ReturnValidationResult Invalid(string message) =>
new() { IsValid = false, ErrorMessage = message };
}
+57 -19
View File
@@ -1,34 +1,58 @@
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Messages.InboundApi;
using ScadaLink.Commons.Types;
using ScadaLink.Communication;
namespace ScadaLink.InboundAPI;
/// <summary>
/// WP-4: Route.To() helper for cross-site calls from inbound API scripts.
/// Resolves instance to site, routes via CommunicationService, blocks until response or timeout.
/// Site unreachable returns error (no store-and-forward).
/// Resolves instance to site, routes via <see cref="IInstanceRouter"/>, blocks until
/// response or timeout. Site unreachable returns error (no store-and-forward).
///
/// InboundAPI-016: the helper carries the executing method's <see cref="CancellationToken"/>
/// (the method-level timeout). Routed calls inherit that deadline by default, so a
/// natural script — <c>Route.To("inst").Call("doWork", p)</c> — is timeout-bounded
/// without the script having to thread a token explicitly.
/// </summary>
public class RouteHelper
{
private readonly IInstanceLocator _instanceLocator;
private readonly CommunicationService _communicationService;
private readonly IInstanceRouter _instanceRouter;
private readonly CancellationToken _deadlineToken;
public RouteHelper(
IInstanceLocator instanceLocator,
CommunicationService communicationService)
IInstanceRouter instanceRouter)
: this(instanceLocator, instanceRouter, CancellationToken.None)
{
}
private RouteHelper(
IInstanceLocator instanceLocator,
IInstanceRouter instanceRouter,
CancellationToken deadlineToken)
{
_instanceLocator = instanceLocator;
_communicationService = communicationService;
_instanceRouter = instanceRouter;
_deadlineToken = deadlineToken;
}
/// <summary>
/// InboundAPI-016: returns a <see cref="RouteHelper"/> whose routed calls inherit
/// <paramref name="deadlineToken"/> (the executing method's timeout) by default.
/// <see cref="InboundScriptExecutor"/> calls this when it builds the script
/// context so the method timeout actually covers routed calls, as the design doc
/// requires.
/// </summary>
public RouteHelper WithDeadline(CancellationToken deadlineToken) =>
new(_instanceLocator, _instanceRouter, deadlineToken);
/// <summary>
/// Creates a route target for the specified instance.
/// </summary>
public RouteTarget To(string instanceCode)
{
return new RouteTarget(instanceCode, _instanceLocator, _communicationService);
return new RouteTarget(instanceCode, _instanceLocator, _instanceRouter, _deadlineToken);
}
}
@@ -39,36 +63,43 @@ public class RouteTarget
{
private readonly string _instanceCode;
private readonly IInstanceLocator _instanceLocator;
private readonly CommunicationService _communicationService;
private readonly IInstanceRouter _instanceRouter;
private readonly CancellationToken _deadlineToken;
internal RouteTarget(
string instanceCode,
IInstanceLocator instanceLocator,
CommunicationService communicationService)
IInstanceRouter instanceRouter,
CancellationToken deadlineToken)
{
_instanceCode = instanceCode;
_instanceLocator = instanceLocator;
_communicationService = communicationService;
_instanceRouter = instanceRouter;
_deadlineToken = deadlineToken;
}
/// <summary>
/// Calls a script on the remote instance. Synchronous from API caller's
/// perspective. <paramref name="parameters"/> may be a dictionary or an
/// anonymous object (<c>new { name = "Bob" }</c>) — see <see cref="ScriptArgs"/>.
///
/// InboundAPI-016: when <paramref name="cancellationToken"/> is not supplied the
/// routed call inherits the executing method's timeout, so the call is bounded by
/// the method-level deadline with no token argument.
/// </summary>
public async Task<object?> Call(
string scriptName,
object? parameters = null,
CancellationToken cancellationToken = default)
{
var siteId = await ResolveSiteAsync(cancellationToken);
var token = Effective(cancellationToken);
var siteId = await ResolveSiteAsync(token);
var correlationId = Guid.NewGuid().ToString();
var request = new RouteToCallRequest(
correlationId, _instanceCode, scriptName, ScriptArgs.Normalize(parameters), DateTimeOffset.UtcNow);
var response = await _communicationService.RouteToCallAsync(
siteId, request, cancellationToken);
var response = await _instanceRouter.RouteToCallAsync(siteId, request, token);
if (!response.Success)
{
@@ -97,14 +128,14 @@ public class RouteTarget
IEnumerable<string> attributeNames,
CancellationToken cancellationToken = default)
{
var siteId = await ResolveSiteAsync(cancellationToken);
var token = Effective(cancellationToken);
var siteId = await ResolveSiteAsync(token);
var correlationId = Guid.NewGuid().ToString();
var request = new RouteToGetAttributesRequest(
correlationId, _instanceCode, attributeNames.ToList(), DateTimeOffset.UtcNow);
var response = await _communicationService.RouteToGetAttributesAsync(
siteId, request, cancellationToken);
var response = await _instanceRouter.RouteToGetAttributesAsync(siteId, request, token);
if (!response.Success)
{
@@ -135,14 +166,14 @@ public class RouteTarget
IReadOnlyDictionary<string, string> attributeValues,
CancellationToken cancellationToken = default)
{
var siteId = await ResolveSiteAsync(cancellationToken);
var token = Effective(cancellationToken);
var siteId = await ResolveSiteAsync(token);
var correlationId = Guid.NewGuid().ToString();
var request = new RouteToSetAttributesRequest(
correlationId, _instanceCode, attributeValues, DateTimeOffset.UtcNow);
var response = await _communicationService.RouteToSetAttributesAsync(
siteId, request, cancellationToken);
var response = await _instanceRouter.RouteToSetAttributesAsync(siteId, request, token);
if (!response.Success)
{
@@ -151,6 +182,13 @@ public class RouteTarget
}
}
/// <summary>
/// InboundAPI-016: a routed call with no explicit token inherits the executing
/// method's deadline. An explicitly supplied token (for a tighter bound) wins.
/// </summary>
private CancellationToken Effective(CancellationToken explicitToken) =>
explicitToken.CanBeCanceled ? explicitToken : _deadlineToken;
private async Task<string> ResolveSiteAsync(CancellationToken cancellationToken)
{
var siteId = await _instanceLocator.GetSiteIdForInstanceAsync(_instanceCode, cancellationToken);
@@ -10,6 +10,10 @@ public static class ServiceCollectionExtensions
services.AddSingleton<InboundScriptExecutor>();
services.AddScoped<RouteHelper>();
// InboundAPI-017: routed calls go through the IInstanceRouter seam; the
// production implementation delegates to CommunicationService.
services.AddScoped<IInstanceRouter, CommunicationServiceInstanceRouter>();
// InboundAPI-006 / InboundAPI-008: endpoint filter enforcing the request
// body size cap and active-node gating for POST /api/{methodName}.
services.AddSingleton<InboundApiEndpointFilter>();
@@ -4,6 +4,7 @@ using System.Text.Json.Serialization;
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.ExternalSystems;
using ScadaLink.Commons.Entities.InboundApi;
using ScadaLink.Commons.Entities.Instances;
@@ -128,7 +129,17 @@ public class ManagementActor : ReceiveActor
_logger.LogError(cause, "Management command {Command} failed (CorrelationId={CorrelationId})",
command.GetType().Name, correlationId);
return new ManagementError(correlationId, cause.Message, "COMMAND_FAILED");
// Curated handler failures (ManagementCommandException) carry a message
// that is safe to surface to the caller. Any other exception is an
// unanticipated fault whose raw .Message can disclose internal detail
// (server/database names, constraint names, stack context) — return a
// generic message and let the operator correlate via the server log
// using the correlation ID (finding ManagementService-016).
var clientMessage = cause is ManagementCommandException
? cause.Message
: $"An internal error occurred (CorrelationId={correlationId}).";
return new ManagementError(correlationId, clientMessage, "COMMAND_FAILED");
}
private static string? GetRequiredRole(object command) => command switch
@@ -173,6 +184,7 @@ public class ManagementActor : ReceiveActor
or SetInstanceAlarmOverrideCommand or DeleteInstanceAlarmOverrideCommand
or GetDeploymentDiffCommand
or MgmtDeployArtifactsCommand
or QueryDeploymentsCommand
or RetryParkedMessageCommand or DiscardParkedMessageCommand
or DebugSnapshotCommand => "Deployment",
@@ -303,7 +315,7 @@ public class ManagementActor : ReceiveActor
// Deployments
MgmtDeployArtifactsCommand cmd => await HandleDeployArtifacts(sp, cmd, user.Username),
QueryDeploymentsCommand cmd => await HandleQueryDeployments(sp, cmd),
QueryDeploymentsCommand cmd => await HandleQueryDeployments(sp, cmd, user),
GetDeploymentDiffCommand cmd => await HandleGetDeploymentDiff(sp, cmd, user),
// Audit Log
@@ -428,7 +440,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.CreateTemplateAsync(cmd.Name, cmd.Description, cmd.ParentTemplateId, user);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleUpdateTemplate(IServiceProvider sp, UpdateTemplateCommand cmd, string user)
@@ -437,7 +449,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.UpdateTemplateAsync(cmd.TemplateId, cmd.Name, cmd.Description, cmd.ParentTemplateId, user);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteTemplate(IServiceProvider sp, DeleteTemplateCommand cmd, string user)
@@ -446,7 +458,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.DeleteTemplateAsync(cmd.TemplateId, user);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleValidateTemplate(IServiceProvider sp, ValidateTemplateCommand cmd)
@@ -455,7 +467,7 @@ public class ManagementActor : ReceiveActor
// Load the template with all members
var template = await repo.GetTemplateWithChildrenAsync(cmd.TemplateId)
?? throw new InvalidOperationException($"Template with ID {cmd.TemplateId} not found.");
?? throw new ManagementCommandException($"Template with ID {cmd.TemplateId} not found.");
var attributes = await repo.GetAttributesByTemplateIdAsync(cmd.TemplateId);
var alarms = await repo.GetAlarmsByTemplateIdAsync(cmd.TemplateId);
@@ -527,35 +539,35 @@ public class ManagementActor : ReceiveActor
{
var svc = sp.GetRequiredService<TemplateFolderService>();
var result = await svc.CreateFolderAsync(cmd.Name, cmd.ParentFolderId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleRenameTemplateFolder(IServiceProvider sp, RenameTemplateFolderCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateFolderService>();
var result = await svc.RenameFolderAsync(cmd.FolderId, cmd.NewName, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleMoveTemplateFolder(IServiceProvider sp, MoveTemplateFolderCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateFolderService>();
var result = await svc.MoveFolderAsync(cmd.FolderId, cmd.NewParentFolderId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteTemplateFolder(IServiceProvider sp, DeleteTemplateFolderCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateFolderService>();
var result = await svc.DeleteFolderAsync(cmd.FolderId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleMoveTemplateToFolder(IServiceProvider sp, MoveTemplateToFolderCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateService>();
var result = await svc.MoveTemplateAsync(cmd.TemplateId, cmd.NewFolderId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
// ========================================================================
@@ -589,7 +601,7 @@ public class ManagementActor : ReceiveActor
EnforceSiteScope(user, cmd.SiteId);
var svc = sp.GetRequiredService<InstanceService>();
var result = await svc.CreateInstanceAsync(cmd.UniqueName, cmd.TemplateId, cmd.SiteId, cmd.AreaId, user.Username);
if (!result.IsSuccess) throw new InvalidOperationException(result.Error);
if (!result.IsSuccess) throw new ManagementCommandException(result.Error);
await AuditAsync(sp, user.Username, "Create", "Instance", result.Value.Id.ToString(), result.Value.UniqueName, result.Value);
return result.Value;
}
@@ -601,7 +613,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.DeployInstanceAsync(cmd.InstanceId, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleEnableInstance(IServiceProvider sp, MgmtEnableInstanceCommand cmd, AuthenticatedUser user)
@@ -611,7 +623,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.EnableInstanceAsync(cmd.InstanceId, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDisableInstance(IServiceProvider sp, MgmtDisableInstanceCommand cmd, AuthenticatedUser user)
@@ -621,7 +633,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.DisableInstanceAsync(cmd.InstanceId, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteInstance(IServiceProvider sp, MgmtDeleteInstanceCommand cmd, AuthenticatedUser user)
@@ -631,7 +643,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.DeleteInstanceAsync(cmd.InstanceId, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleSetConnectionBindings(IServiceProvider sp, SetConnectionBindingsCommand cmd, AuthenticatedUser user)
@@ -641,18 +653,45 @@ public class ManagementActor : ReceiveActor
var result = await svc.SetConnectionBindingsAsync(cmd.InstanceId, cmd.Bindings, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleSetInstanceOverrides(IServiceProvider sp, SetInstanceOverridesCommand cmd, AuthenticatedUser user)
{
await EnforceSiteScopeForInstance(sp, user, cmd.InstanceId);
// Multi-override apply is all-or-nothing (finding ManagementService-015).
// InstanceService.SetAttributeOverrideAsync commits each override
// independently, so a mid-batch failure on an invalid attribute would
// otherwise leave the instance partially mutated. Validate every
// requested attribute up front against the instance's template; only
// apply once the whole batch is known to be valid. (A genuine database
// fault mid-apply remains theoretically possible without a shared
// transaction, but the realistic failure modes — unknown or locked
// attribute — are now rejected before any write.)
var repo = sp.GetRequiredService<ITemplateEngineRepository>();
var instance = await repo.GetInstanceByIdAsync(cmd.InstanceId)
?? throw new ManagementCommandException($"Instance with ID {cmd.InstanceId} not found.");
var templateAttrs = await repo.GetAttributesByTemplateIdAsync(instance.TemplateId);
var attrsByName = templateAttrs.ToDictionary(a => a.Name);
foreach (var attrName in cmd.Overrides.Keys)
{
if (!attrsByName.TryGetValue(attrName, out var templateAttr))
throw new ManagementCommandException(
$"Attribute '{attrName}' does not exist in template {instance.TemplateId}. " +
"No overrides were applied.");
if (templateAttr.IsLocked)
throw new ManagementCommandException(
$"Attribute '{attrName}' is locked and cannot be overridden. No overrides were applied.");
}
var svc = sp.GetRequiredService<InstanceService>();
var results = new List<InstanceAttributeOverride>();
foreach (var (attrName, overrideValue) in cmd.Overrides)
{
var result = await svc.SetAttributeOverrideAsync(cmd.InstanceId, attrName, overrideValue, user.Username);
if (!result.IsSuccess) throw new InvalidOperationException(result.Error);
if (!result.IsSuccess) throw new ManagementCommandException(result.Error);
results.Add(result.Value);
}
return results;
@@ -665,7 +704,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.AssignToAreaAsync(cmd.InstanceId, cmd.AreaId, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleSetInstanceAlarmOverride(IServiceProvider sp, SetInstanceAlarmOverrideCommand cmd, AuthenticatedUser user)
@@ -678,7 +717,7 @@ public class ManagementActor : ReceiveActor
user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteInstanceAlarmOverride(IServiceProvider sp, DeleteInstanceAlarmOverrideCommand cmd, AuthenticatedUser user)
@@ -689,7 +728,7 @@ public class ManagementActor : ReceiveActor
cmd.InstanceId, cmd.AlarmCanonicalName, user.Username);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleListInstanceAlarmOverrides(IServiceProvider sp, ListInstanceAlarmOverridesCommand cmd, AuthenticatedUser user)
@@ -706,7 +745,7 @@ public class ManagementActor : ReceiveActor
var result = await svc.GetDeploymentComparisonAsync(cmd.InstanceId);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleRetryParkedMessage(IServiceProvider sp, RetryParkedMessageCommand cmd, AuthenticatedUser user)
@@ -775,7 +814,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<ISiteRepository>();
var site = await repo.GetSiteByIdAsync(cmd.SiteId)
?? throw new InvalidOperationException($"Site with ID {cmd.SiteId} not found.");
?? throw new ManagementCommandException($"Site with ID {cmd.SiteId} not found.");
site.Name = cmd.Name;
site.Description = cmd.Description;
site.NodeAAddress = cmd.NodeAAddress;
@@ -796,7 +835,7 @@ public class ManagementActor : ReceiveActor
var site = await repo.GetSiteByIdAsync(cmd.SiteId);
var instances = await repo.GetInstancesBySiteIdAsync(cmd.SiteId);
if (instances.Count > 0)
throw new InvalidOperationException(
throw new ManagementCommandException(
$"Cannot delete site {cmd.SiteId}: it has {instances.Count} instance(s).");
await repo.DeleteSiteAsync(cmd.SiteId);
await repo.SaveChangesAsync();
@@ -876,7 +915,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<ISiteRepository>();
var conn = await repo.GetDataConnectionByIdAsync(cmd.DataConnectionId)
?? throw new InvalidOperationException($"DataConnection with ID {cmd.DataConnectionId} not found.");
?? throw new ManagementCommandException($"DataConnection with ID {cmd.DataConnectionId} not found.");
conn.Name = cmd.Name;
conn.Protocol = cmd.Protocol;
conn.PrimaryConfiguration = cmd.PrimaryConfiguration;
@@ -931,7 +970,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<IExternalSystemRepository>();
var def = await repo.GetExternalSystemByIdAsync(cmd.ExternalSystemId)
?? throw new InvalidOperationException($"ExternalSystem with ID {cmd.ExternalSystemId} not found.");
?? throw new ManagementCommandException($"ExternalSystem with ID {cmd.ExternalSystemId} not found.");
def.Name = cmd.Name;
def.EndpointUrl = cmd.EndpointUrl;
def.AuthType = cmd.AuthType;
@@ -982,7 +1021,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<IExternalSystemRepository>();
var method = await repo.GetExternalSystemMethodByIdAsync(cmd.MethodId)
?? throw new InvalidOperationException($"ExternalSystemMethod with ID {cmd.MethodId} not found.");
?? throw new ManagementCommandException($"ExternalSystemMethod with ID {cmd.MethodId} not found.");
if (cmd.Name != null) method.Name = cmd.Name;
if (cmd.HttpMethod != null) method.HttpMethod = cmd.HttpMethod;
if (cmd.Path != null) method.Path = cmd.Path;
@@ -1037,7 +1076,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<INotificationRepository>();
var list = await repo.GetNotificationListByIdAsync(cmd.NotificationListId)
?? throw new InvalidOperationException($"NotificationList with ID {cmd.NotificationListId} not found.");
?? throw new ManagementCommandException($"NotificationList with ID {cmd.NotificationListId} not found.");
list.Name = cmd.Name;
var existingRecipients = await repo.GetRecipientsByListIdAsync(cmd.NotificationListId);
@@ -1079,7 +1118,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<INotificationRepository>();
var config = await repo.GetSmtpConfigurationByIdAsync(cmd.SmtpConfigId)
?? throw new InvalidOperationException($"SmtpConfiguration with ID {cmd.SmtpConfigId} not found.");
?? throw new ManagementCommandException($"SmtpConfiguration with ID {cmd.SmtpConfigId} not found.");
config.Host = cmd.Server;
config.Port = cmd.Port;
config.AuthType = cmd.AuthMode;
@@ -1114,7 +1153,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<ISecurityRepository>();
var mapping = await repo.GetMappingByIdAsync(cmd.MappingId)
?? throw new InvalidOperationException($"RoleMapping with ID {cmd.MappingId} not found.");
?? throw new ManagementCommandException($"RoleMapping with ID {cmd.MappingId} not found.");
mapping.LdapGroupName = cmd.LdapGroupName;
mapping.Role = cmd.Role;
await repo.UpdateMappingAsync(mapping);
@@ -1168,15 +1207,45 @@ public class ManagementActor : ReceiveActor
var result = await svc.DeployToAllSitesAsync(user);
return result.IsSuccess
? result.Value
: throw new InvalidOperationException(result.Error);
: throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleQueryDeployments(IServiceProvider sp, QueryDeploymentsCommand cmd)
private static async Task<object?> HandleQueryDeployments(IServiceProvider sp, QueryDeploymentsCommand cmd, AuthenticatedUser user)
{
var repo = sp.GetRequiredService<IDeploymentManagerRepository>();
// Instance-scoped query: enforce against the target instance's site
// before reading its deployment history (finding ManagementService-014).
if (cmd.InstanceId.HasValue)
{
await EnforceSiteScopeForInstance(sp, user, cmd.InstanceId.Value);
return await repo.GetDeploymentsByInstanceIdAsync(cmd.InstanceId.Value);
return await repo.GetAllDeploymentRecordsAsync();
}
// Unfiltered query: a site-scoped Deployment user must only see records
// for instances at sites within their scope. DeploymentRecord has no
// SiteId, so resolve each record's instance to its site and filter
// (mirrors the HandleListInstances / HandleListSites filter pattern).
var records = await repo.GetAllDeploymentRecordsAsync();
if (user.PermittedSiteIds.Length == 0 || user.Roles.Contains("Admin", StringComparer.OrdinalIgnoreCase))
return records;
var permittedIds = new HashSet<string>(user.PermittedSiteIds);
var templateRepo = sp.GetRequiredService<ITemplateEngineRepository>();
var instanceSiteCache = new Dictionary<int, int?>();
var scoped = new List<DeploymentRecord>();
foreach (var record in records)
{
if (!instanceSiteCache.TryGetValue(record.InstanceId, out var siteId))
{
var instance = await templateRepo.GetInstanceByIdAsync(record.InstanceId);
siteId = instance?.SiteId;
instanceSiteCache[record.InstanceId] = siteId;
}
if (siteId.HasValue && permittedIds.Contains(siteId.Value.ToString()))
scoped.Add(record);
}
return scoped;
}
// ========================================================================
@@ -1223,7 +1292,7 @@ public class ManagementActor : ReceiveActor
IsLocked = cmd.IsLocked
};
var result = await svc.AddAttributeAsync(cmd.TemplateId, attr, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleUpdateAttribute(IServiceProvider sp, UpdateTemplateAttributeCommand cmd, string user)
@@ -1238,14 +1307,14 @@ public class ManagementActor : ReceiveActor
IsLocked = cmd.IsLocked
};
var result = await svc.UpdateAttributeAsync(cmd.AttributeId, attr, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteAttribute(IServiceProvider sp, DeleteTemplateAttributeCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateService>();
var result = await svc.DeleteAttributeAsync(cmd.AttributeId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleAddAlarm(IServiceProvider sp, AddTemplateAlarmCommand cmd, string user)
@@ -1260,7 +1329,7 @@ public class ManagementActor : ReceiveActor
IsLocked = cmd.IsLocked
};
var result = await svc.AddAlarmAsync(cmd.TemplateId, alarm, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleUpdateAlarm(IServiceProvider sp, UpdateTemplateAlarmCommand cmd, string user)
@@ -1275,14 +1344,14 @@ public class ManagementActor : ReceiveActor
IsLocked = cmd.IsLocked
};
var result = await svc.UpdateAlarmAsync(cmd.AlarmId, alarm, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteAlarm(IServiceProvider sp, DeleteTemplateAlarmCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateService>();
var result = await svc.DeleteAlarmAsync(cmd.AlarmId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleAddScript(IServiceProvider sp, AddTemplateScriptCommand cmd, string user)
@@ -1297,7 +1366,7 @@ public class ManagementActor : ReceiveActor
ReturnDefinition = cmd.ReturnDefinition
};
var result = await svc.AddScriptAsync(cmd.TemplateId, script, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleUpdateScript(IServiceProvider sp, UpdateTemplateScriptCommand cmd, string user)
@@ -1312,28 +1381,28 @@ public class ManagementActor : ReceiveActor
ReturnDefinition = cmd.ReturnDefinition
};
var result = await svc.UpdateScriptAsync(cmd.ScriptId, script, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteScript(IServiceProvider sp, DeleteTemplateScriptCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateService>();
var result = await svc.DeleteScriptAsync(cmd.ScriptId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleAddComposition(IServiceProvider sp, AddTemplateCompositionCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateService>();
var result = await svc.AddCompositionAsync(cmd.TemplateId, cmd.ComposedTemplateId, cmd.InstanceName, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteComposition(IServiceProvider sp, DeleteTemplateCompositionCommand cmd, string user)
{
var svc = sp.GetRequiredService<TemplateService>();
var result = await svc.DeleteCompositionAsync(cmd.CompositionId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
// ========================================================================
@@ -1356,21 +1425,21 @@ public class ManagementActor : ReceiveActor
{
var svc = sp.GetRequiredService<SharedScriptService>();
var result = await svc.CreateSharedScriptAsync(cmd.Name, cmd.Code, cmd.ParameterDefinitions, cmd.ReturnDefinition, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleUpdateSharedScript(IServiceProvider sp, UpdateSharedScriptCommand cmd, string user)
{
var svc = sp.GetRequiredService<SharedScriptService>();
var result = await svc.UpdateSharedScriptAsync(cmd.SharedScriptId, cmd.Code, cmd.ParameterDefinitions, cmd.ReturnDefinition, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
private static async Task<object?> HandleDeleteSharedScript(IServiceProvider sp, DeleteSharedScriptCommand cmd, string user)
{
var svc = sp.GetRequiredService<SharedScriptService>();
var result = await svc.DeleteSharedScriptAsync(cmd.SharedScriptId, user);
return result.IsSuccess ? result.Value : throw new InvalidOperationException(result.Error);
return result.IsSuccess ? result.Value : throw new ManagementCommandException(result.Error);
}
// ========================================================================
@@ -1403,7 +1472,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<IExternalSystemRepository>();
var def = await repo.GetDatabaseConnectionByIdAsync(cmd.DatabaseConnectionId)
?? throw new InvalidOperationException($"DatabaseConnection with ID {cmd.DatabaseConnectionId} not found.");
?? throw new ManagementCommandException($"DatabaseConnection with ID {cmd.DatabaseConnectionId} not found.");
def.Name = cmd.Name;
def.ConnectionString = cmd.ConnectionString;
await repo.UpdateDatabaseConnectionAsync(def);
@@ -1457,7 +1526,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<IInboundApiRepository>();
var method = await repo.GetApiMethodByIdAsync(cmd.ApiMethodId)
?? throw new InvalidOperationException($"ApiMethod with ID {cmd.ApiMethodId} not found.");
?? throw new ManagementCommandException($"ApiMethod with ID {cmd.ApiMethodId} not found.");
method.Script = cmd.Script;
method.TimeoutSeconds = cmd.TimeoutSeconds;
method.ParameterDefinitions = cmd.ParameterDefinitions;
@@ -1489,7 +1558,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<IInboundApiRepository>();
var key = await repo.GetApiKeyByIdAsync(cmd.ApiKeyId)
?? throw new InvalidOperationException($"ApiKey with ID {cmd.ApiKeyId} not found.");
?? throw new ManagementCommandException($"ApiKey with ID {cmd.ApiKeyId} not found.");
key.IsEnabled = cmd.IsEnabled;
await repo.UpdateApiKeyAsync(key);
await repo.SaveChangesAsync();
@@ -1530,7 +1599,7 @@ public class ManagementActor : ReceiveActor
{
var repo = sp.GetRequiredService<ITemplateEngineRepository>();
var area = await repo.GetAreaByIdAsync(cmd.AreaId)
?? throw new InvalidOperationException($"Area with ID {cmd.AreaId} not found.");
?? throw new ManagementCommandException($"Area with ID {cmd.AreaId} not found.");
area.Name = cmd.Name;
await repo.UpdateAreaAsync(area);
await repo.SaveChangesAsync();
@@ -1576,13 +1645,13 @@ public class ManagementActor : ReceiveActor
{
var instanceRepo = sp.GetRequiredService<ITemplateEngineRepository>();
var instance = await instanceRepo.GetInstanceByIdAsync(cmd.InstanceId)
?? throw new InvalidOperationException($"Instance {cmd.InstanceId} not found.");
?? throw new ManagementCommandException($"Instance {cmd.InstanceId} not found.");
EnforceSiteScope(user, instance.SiteId);
var siteRepo = sp.GetRequiredService<ISiteRepository>();
var site = await siteRepo.GetSiteByIdAsync(instance.SiteId)
?? throw new InvalidOperationException($"Site {instance.SiteId} not found.");
?? throw new ManagementCommandException($"Site {instance.SiteId} not found.");
var commService = sp.GetRequiredService<CommunicationService>();
var request = new DebugSnapshotRequest(instance.UniqueName, Guid.NewGuid().ToString("N"));
@@ -1597,3 +1666,16 @@ public class SiteScopeViolationException : Exception
{
public SiteScopeViolationException(string message) : base(message) { }
}
/// <summary>
/// Thrown by a command handler to signal a curated, caller-safe failure (finding
/// ManagementService-016). Its <see cref="Exception.Message"/> is intended to be
/// surfaced verbatim to the HTTP/CLI caller — e.g. a validation result or a
/// "not found" message. Unanticipated exceptions (database faults, parse errors,
/// null-reference, etc.) must NOT be this type, so that <c>MapFault</c> can return
/// a generic message for them and avoid leaking internal detail.
/// </summary>
public class ManagementCommandException : Exception
{
public ManagementCommandException(string message) : base(message) { }
}
@@ -45,17 +45,30 @@ public class MailKitSmtpClientWrapper : ISmtpClientWrapper, IDisposable
public async Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default)
{
// NS-016: missing/unparseable credentials and an unrecognised auth type used
// to make this method silently return and the connection then sent mail
// unauthenticated — masking a misconfiguration against an open relay and, at
// worst, sending where authentication was required. Authentication being
// skipped must never be silent: each of these is a permanent configuration
// fault, surfaced as SmtpPermanentException so SendAsync returns a clean
// failure and DeliverBufferedAsync parks the buffered message.
if (string.IsNullOrEmpty(credentials))
return;
{
throw new SmtpPermanentException(
$"SMTP auth type '{authType}' requires credentials, but none are configured.");
}
switch (authType.ToLowerInvariant())
{
case "basic":
var parts = credentials.Split(':', 2);
if (parts.Length == 2)
if (parts.Length != 2)
{
await _client.AuthenticateAsync(parts[0], parts[1], cancellationToken);
throw new SmtpPermanentException(
"Basic SMTP credentials must be in 'username:password' form.");
}
await _client.AuthenticateAsync(parts[0], parts[1], cancellationToken);
break;
case "oauth2":
@@ -63,6 +76,10 @@ public class MailKitSmtpClientWrapper : ISmtpClientWrapper, IDisposable
var oauth2 = new SaslMechanismOAuth2("", credentials);
await _client.AuthenticateAsync(oauth2, cancellationToken);
break;
default:
throw new SmtpPermanentException(
$"Unsupported SMTP auth type '{authType}'. Expected one of: basic, oauth2.");
}
}
@@ -3,6 +3,7 @@ using System.Text.Json;
using MailKit;
using MailKit.Net.Smtp;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MimeKit;
using ScadaLink.Commons.Entities.Notifications;
using ScadaLink.Commons.Interfaces.Repositories;
@@ -18,26 +19,31 @@ namespace ScadaLink.NotificationService;
/// Transient: connection refused, timeout, SMTP 4xx → hand to S&amp;F.
/// Permanent: SMTP 5xx → returned to script.
/// </summary>
public class NotificationDeliveryService : INotificationDeliveryService
public class NotificationDeliveryService : INotificationDeliveryService, IDisposable
{
private readonly INotificationRepository _repository;
private readonly Func<ISmtpClientWrapper> _smtpClientFactory;
private readonly OAuth2TokenService? _tokenService;
private readonly StoreAndForwardService? _storeAndForward;
private readonly ILogger<NotificationDeliveryService> _logger;
private readonly NotificationOptions _options;
public NotificationDeliveryService(
INotificationRepository repository,
Func<ISmtpClientWrapper> smtpClientFactory,
ILogger<NotificationDeliveryService> logger,
OAuth2TokenService? tokenService = null,
StoreAndForwardService? storeAndForward = null)
StoreAndForwardService? storeAndForward = null,
IOptions<NotificationOptions>? options = null)
{
_repository = repository;
_smtpClientFactory = smtpClientFactory;
_logger = logger;
_tokenService = tokenService;
_storeAndForward = storeAndForward;
// NS-017: NotificationOptions supplies the documented fallback values used
// when a deployed SmtpConfiguration row leaves a field unset (non-positive).
_options = options?.Value ?? new NotificationOptions();
}
/// <summary>
@@ -50,6 +56,8 @@ public class NotificationDeliveryService : INotificationDeliveryService
string? originInstanceName = null,
CancellationToken cancellationToken = default)
{
ObjectDisposedException.ThrowIf(_disposed, this);
var list = await _repository.GetListByNameAsync(listName, cancellationToken);
if (list == null)
{
@@ -146,6 +154,24 @@ public class NotificationDeliveryService : INotificationDeliveryService
return new NotificationResult(true, null, WasBuffered: true);
}
catch (Exception ex)
{
// NS-015: a failure that ClassifySmtpError does not recognise (Unknown) —
// most importantly an OAuth2 token-fetch failure (HttpRequestException
// from EnsureSuccessStatusCode, or InvalidOperationException from a
// malformed credential triple) — used to fall through all the catch
// clauses above and escape SendAsync as a raw exception to the calling
// script, which the INotificationDeliveryService contract never
// advertises. Convert any otherwise-unhandled exception into a clean,
// credential-scrubbed permanent NotificationResult: returning control to
// the script is the safe default. (A caller-requested cancellation is
// already re-thrown by the filter above and never reaches here.)
var detail = CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials);
_logger.LogError(
"Unclassified failure sending to list {List} ({ExceptionType}): {Detail}",
listName, ex.GetType().Name, detail);
return new NotificationResult(false, $"Notification delivery failed: {detail}");
}
}
/// <summary>
@@ -224,36 +250,103 @@ public class NotificationDeliveryService : INotificationDeliveryService
payload.ListName, CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
return false;
}
// Transient SMTP errors propagate out of DeliverAsync — the S&F engine retries.
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// A handler shutdown cancellation is neither a delivery success nor a
// permanent failure — let it propagate so the engine does not park.
throw;
}
catch (Exception ex) when (IsTransientSmtpError(ex, cancellationToken))
{
// A typed transient SMTP error: re-throw so the S&F engine retries.
throw;
}
catch (Exception ex)
{
// NS-014: an exception ClassifySmtpError does not recognise (Unknown) —
// chiefly an OAuth2 token-fetch failure — used to escape this handler.
// The S&F engine treats ANY thrown exception as transient, so a
// permanently-broken config (bad client secret, malformed credential
// triple) was retried on every sweep until MaxRetries, burning token
// endpoint calls. Decide deliberately rather than letting it leak:
// - an HttpRequestException with a 5xx token-endpoint status is a
// transient outage → re-throw so the engine retries;
// - everything else (a 4xx/401 token rejection, a malformed credential
// InvalidOperationException, any other unclassified fault) is not
// fixable by retrying → return false so the message is parked.
if (ex is HttpRequestException { StatusCode: { } status } && (int)status is >= 500 and < 600)
{
_logger.LogWarning(
"Buffered notification to list '{List}' hit a transient OAuth2 token-endpoint error ({Status}); will retry.",
payload.ListName, (int)status);
throw;
}
_logger.LogError(
"Buffered notification to list '{List}' failed with a non-retryable error ({ExceptionType}: {Detail}); parking.",
payload.ListName, ex.GetType().Name,
CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
return false;
}
}
private sealed record BufferedNotification(string ListName, string Subject, string Message);
/// <summary>
/// NS-007: throttles concurrent SMTP deliveries to the configured
/// <c>MaxConcurrentConnections</c>. Created lazily from the first SMTP config
/// seen (one SMTP config is deployed per site, so the limit is stable).
/// <c>MaxConcurrentConnections</c>. One SMTP config is deployed per site, so the
/// limit is a stable per-site invariant; it is captured lazily on first use.
/// NS-018: a <see cref="Lazy{T}"/> replaces the hand-rolled double-checked
/// init — its publication is correctly synchronised (no lock-free read of a
/// non-volatile field) and it is disposed in <see cref="Dispose"/>.
/// </summary>
private SemaphoreSlim? _concurrencyLimiter;
private Lazy<SemaphoreSlim>? _concurrencyLimiter;
private readonly object _limiterLock = new();
private bool _disposed;
private SemaphoreSlim GetConcurrencyLimiter(SmtpConfiguration config)
{
if (_concurrencyLimiter != null)
{
return _concurrencyLimiter;
}
// NS-018: the limiter is sized once; capture the size now so the Lazy
// factory does not close over a value that could change between calls.
var configured = config.MaxConcurrentConnections > 0
? config.MaxConcurrentConnections
// NS-017: fall back to the NotificationOptions value, then the
// design-doc default of 5, when the deployed row leaves it unset.
: _options.MaxConcurrentConnections > 0 ? _options.MaxConcurrentConnections : 5;
lock (_limiterLock)
{
// NS-007: a non-positive configured value would make SemaphoreSlim
// throw; fall back to the design-doc default of 5.
var max = config.MaxConcurrentConnections > 0 ? config.MaxConcurrentConnections : 5;
_concurrencyLimiter ??= new SemaphoreSlim(max, max);
return _concurrencyLimiter;
ObjectDisposedException.ThrowIf(_disposed, this);
_concurrencyLimiter ??= new Lazy<SemaphoreSlim>(
() => new SemaphoreSlim(configured, configured));
return _concurrencyLimiter.Value;
}
}
/// <summary>
/// NS-018: disposes the lazily-created concurrency limiter. The service is a
/// scoped DI service; without this the <see cref="SemaphoreSlim"/> leaked a
/// handle per scope.
/// </summary>
public void Dispose()
{
lock (_limiterLock)
{
if (_disposed)
{
return;
}
_disposed = true;
if (_concurrencyLimiter is { IsValueCreated: true } limiter)
{
limiter.Value.Dispose();
}
}
GC.SuppressFinalize(this);
}
/// <summary>
/// NS-008: Validates the sender and recipient email addresses, returning a
/// human-readable error string if any is malformed, or null if all parse.
@@ -300,8 +393,13 @@ public class NotificationDeliveryService : INotificationDeliveryService
try
{
// NS-005/NS-007: explicit TLS mode and the configured connection timeout.
// NS-017: when the deployed SmtpConfiguration row leaves the timeout
// unset (non-positive), fall back to the NotificationOptions value.
var timeoutSeconds = config.ConnectionTimeoutSeconds > 0
? config.ConnectionTimeoutSeconds
: _options.ConnectionTimeoutSeconds;
await smtp.ConnectAsync(
config.Host, config.Port, tlsMode, config.ConnectionTimeoutSeconds, cancellationToken);
config.Host, config.Port, tlsMode, timeoutSeconds, cancellationToken);
// Resolve credentials (OAuth2 token fetched/cached by the token service).
var credentials = config.Credentials;
@@ -1,15 +1,26 @@
namespace ScadaLink.NotificationService;
/// <summary>
/// Configuration options for the Notification Service.
/// Most SMTP configuration is stored in the database (SmtpConfiguration entity).
/// This provides fallback defaults and operational limits.
/// Configuration options for the Notification Service, bound from the
/// <c>ScadaLink:Notification</c> configuration section.
///
/// SMTP settings are primarily carried by the deployed <c>SmtpConfiguration</c>
/// entity. NS-017: these values are the fallback used by
/// <see cref="NotificationDeliveryService"/> when the corresponding
/// <c>SmtpConfiguration</c> field is left unset (non-positive) on a partially
/// deployed row — a value present on the row always takes precedence.
/// </summary>
public class NotificationOptions
{
/// <summary>Default connection timeout for SMTP connections.</summary>
/// <summary>
/// Connection timeout (seconds) used when <c>SmtpConfiguration.ConnectionTimeoutSeconds</c>
/// is unset. Default 30s.
/// </summary>
public int ConnectionTimeoutSeconds { get; set; } = 30;
/// <summary>Maximum concurrent SMTP connections.</summary>
/// <summary>
/// Maximum concurrent SMTP connections used when
/// <c>SmtpConfiguration.MaxConcurrentConnections</c> is unset. Default 5.
/// </summary>
public int MaxConcurrentConnections { get; set; } = 5;
}
+16
View File
@@ -152,6 +152,14 @@ public class JwtTokenService
/// otherwise the documented 30-minute idle timeout could never fire for a client
/// that polls in the background. Call <see cref="RecordActivity"/> to advance the
/// anchor when handling a genuine user request.
/// <para>
/// A principal that is already past the idle timeout cannot be refreshed: this
/// method returns <c>null</c> (the same "cannot refresh" signal it uses for missing
/// claims). Enforcing the idle check here — rather than relying on the caller to
/// invoke <see cref="IsIdleTimedOut"/> first — guarantees the documented 30-minute
/// idle policy holds regardless of caller discipline; otherwise an idle-expired
/// session could be kept alive indefinitely by background refreshes (Security-014).
/// </para>
/// </summary>
public string? RefreshToken(ClaimsPrincipal currentPrincipal, IReadOnlyList<string> currentRoles, IReadOnlyList<string>? permittedSiteIds)
{
@@ -164,6 +172,14 @@ public class JwtTokenService
return null;
}
// An idle-expired session must not be renewed — the user must re-login.
if (IsIdleTimedOut(currentPrincipal))
{
_logger.LogInformation(
"Cannot refresh token for {Username}: session is past the idle timeout", username);
return null;
}
return GenerateToken(displayName, username, currentRoles, permittedSiteIds,
ReadLastActivity(currentPrincipal));
}
+116 -26
View File
@@ -23,6 +23,14 @@ public class LdapAuthService
if (string.IsNullOrWhiteSpace(password))
return new LdapAuthResult(false, null, null, null, "Password is required.");
// Trim once, up front: a username with leading/trailing whitespace (copy-paste
// artefacts, mobile keyboards) is otherwise passed verbatim into the LDAP filter,
// the fallback bind DN, and — most consequentially — the JWT Username claim and
// audit trail, producing two distinct identities for the same person
// (Security-015). The IsNullOrWhiteSpace guard above already rejects an
// all-whitespace value, so the trimmed result here is always non-empty.
username = NormalizeUsername(username);
// Enforce TLS unless explicitly allowed for dev/test
if (_options.LdapTransport == LdapTransport.None && !_options.AllowInsecureLdap)
{
@@ -74,6 +82,7 @@ public class LdapAuthService
// Query for user attributes and group memberships
var displayName = username;
var groups = new List<string>();
var groupLookupSucceeded = true;
try
{
@@ -86,40 +95,45 @@ public class LdapAuthService
new[] { _options.LdapDisplayNameAttribute, _options.LdapGroupAttribute },
false), ct);
// `HasMore()` is the loop guard for end-of-results; it returns false
// when the enumeration is exhausted. An LdapException thrown by
// `Next()` inside a HasMore()-guarded loop is therefore NOT a benign
// "no more results" sentinel — it is a genuine error (referral failure,
// server-side limit, transport drop mid-enumeration). The previous
// `catch (LdapException) { break; }` silently truncated the group list
// and masked a partial outage (Security-012); such an exception now
// propagates to the outer catch and fails the login.
while (searchResults.HasMore())
{
try
{
var entry = searchResults.Next();
var dnAttr = entry.GetAttribute(_options.LdapDisplayNameAttribute);
if (dnAttr != null)
displayName = dnAttr.StringValue;
var entry = searchResults.Next();
var groupAttr = entry.GetAttribute(_options.LdapGroupAttribute);
if (groupAttr != null)
{
foreach (var groupDn in groupAttr.StringValueArray)
{
groups.Add(ExtractFirstRdnValue(groupDn));
}
}
}
catch (LdapException)
var dnAttr = entry.GetAttribute(_options.LdapDisplayNameAttribute);
if (dnAttr != null)
displayName = dnAttr.StringValue;
var groupAttr = entry.GetAttribute(_options.LdapGroupAttribute);
if (groupAttr != null)
{
// No more results
break;
foreach (var groupDn in groupAttr.StringValueArray)
{
groups.Add(ExtractFirstRdnValue(groupDn));
}
}
}
}
catch (LdapException ex)
{
_logger.LogWarning(ex, "Failed to query LDAP attributes for user {Username}; authentication succeeded but group lookup failed", username);
// Auth succeeded even if attribute lookup failed
// A failed group/attribute lookup on initial login means the directory
// is partially unavailable. The design's LDAP-failure rule requires new
// logins to FAIL when LDAP is unavailable — admitting the user here
// would yield an authenticated session with zero roles (Security-012).
_logger.LogWarning(ex, "LDAP group/attribute lookup failed for user {Username}; failing the login per the LDAP-failure rule", username);
groupLookupSucceeded = false;
}
connection.Disconnect();
return new LdapAuthResult(true, displayName, username, groups, null);
return BuildAuthResultFromGroupLookup(username, displayName, groups, groupLookupSucceeded);
}
catch (LdapException ex)
{
@@ -255,16 +269,92 @@ public class LdapAuthService
.Replace("\0", "\\00");
}
private static string ExtractFirstRdnValue(string dn)
/// <summary>
/// Normalises a username by trimming leading and trailing whitespace. Applied once
/// at the top of <see cref="AuthenticateAsync"/> so the same canonical value flows
/// into the LDAP filter, the fallback bind DN, and the JWT <c>Username</c> claim —
/// avoiding two distinct identities for the same person (Security-015).
/// </summary>
public static string NormalizeUsername(string username)
=> username?.Trim() ?? string.Empty;
/// <summary>
/// Builds the final <see cref="LdapAuthResult"/> for a login attempt once the user
/// bind has succeeded. When the group/attribute lookup failed
/// (<paramref name="groupLookupSucceeded"/> is false) the directory is partially
/// unavailable, so the login is FAILED per the design's LDAP-failure rule rather
/// than returning an authenticated session with zero roles (Security-012). When the
/// lookup succeeded, an empty <paramref name="groups"/> list is a genuine
/// "no mapped groups" outcome and the login succeeds.
/// </summary>
public static LdapAuthResult BuildAuthResultFromGroupLookup(
string username,
string displayName,
IReadOnlyList<string> groups,
bool groupLookupSucceeded)
{
// Extract the value of the first RDN from a DN.
// Handles cn=, ou=, or any attribute: "ou=SCADA-Admins,ou=groups,dc=..." → "SCADA-Admins"
if (!groupLookupSucceeded)
{
return new LdapAuthResult(false, null, username, null,
"The directory is temporarily unavailable. Please try again.");
}
return new LdapAuthResult(true, displayName, username, groups, null);
}
/// <summary>
/// Extracts the value of the first RDN from a DN, e.g.
/// <c>ou=SCADA-Admins,ou=groups,dc=...</c> → <c>SCADA-Admins</c>. The scan is
/// RFC 4514 escape-aware: a backslash-escaped <c>,</c> inside the RDN value does
/// not terminate it, and recognised escape sequences are unescaped, so a group CN
/// that legitimately contains a comma is returned intact (Security-013).
/// </summary>
public static string ExtractFirstRdnValue(string dn)
{
if (string.IsNullOrEmpty(dn))
return dn;
var equalsIndex = dn.IndexOf('=');
if (equalsIndex < 0)
return dn;
var valueStart = equalsIndex + 1;
var commaIndex = dn.IndexOf(',', valueStart);
return commaIndex > valueStart ? dn[valueStart..commaIndex] : dn[valueStart..];
var sb = new System.Text.StringBuilder(dn.Length - valueStart);
for (var i = valueStart; i < dn.Length; i++)
{
var c = dn[i];
if (c == '\\' && i + 1 < dn.Length)
{
var next = dn[i + 1];
// RFC 4514 hex escape: \XX (two hex digits).
if (i + 2 < dn.Length && IsHexDigit(next) && IsHexDigit(dn[i + 2]))
{
sb.Append((char)Convert.ToInt32(dn.Substring(i + 1, 2), 16));
i += 2;
}
else
{
// Single-character escape (e.g. \, \+ \\ \" \; etc.) — emit the
// escaped character literally and skip the backslash.
sb.Append(next);
i += 1;
}
continue;
}
if (c == ',')
{
// Unescaped comma terminates the first RDN.
break;
}
sb.Append(c);
}
return sb.ToString();
}
private static bool IsHexDigit(char c)
=> (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
@@ -30,6 +30,19 @@ public class EventLogQueryService : IEventLogQueryService
_logger = logger;
}
/// <summary>
/// Escapes the SQL <c>LIKE</c> metacharacters (<c>\</c>, <c>%</c>, <c>_</c>) in a
/// user-supplied keyword so it is matched as a literal substring. Used together
/// with a <c>LIKE ... ESCAPE '\'</c> clause.
/// </summary>
private static string EscapeLikePattern(string input)
{
return input
.Replace("\\", "\\\\")
.Replace("%", "\\%")
.Replace("_", "\\_");
}
public EventLogQueryResponse ExecuteQuery(EventLogQueryRequest request)
{
try
@@ -78,8 +91,14 @@ public class EventLogQueryService : IEventLogQueryService
if (!string.IsNullOrWhiteSpace(request.KeywordFilter))
{
whereClauses.Add("(message LIKE $keyword OR source LIKE $keyword)");
parameters.Add(new SqliteParameter("$keyword", $"%{request.KeywordFilter}%"));
// Keyword search is a literal substring match. The LIKE
// metacharacters % and _ (and the escape char itself) must be
// escaped so identifiers such as "store_and_forward" or a literal
// "%" are not misinterpreted as wildcards (SiteEventLogging-013).
var escaped = EscapeLikePattern(request.KeywordFilter);
whereClauses.Add(
"(message LIKE $keyword ESCAPE '\\' OR source LIKE $keyword ESCAPE '\\')");
parameters.Add(new SqliteParameter("$keyword", $"%{escaped}%"));
}
var whereClause = whereClauses.Count > 0
@@ -160,9 +160,13 @@ public class SiteEventLogger : ISiteEventLogger, IDisposable
if (!_writeQueue.Writer.TryWrite(pending))
{
// The channel is unbounded, so the only way TryWrite fails is that the
// writer has been completed (logger disposed). Drop silently — there is
// nowhere to persist the event.
pending.Completion.TrySetResult();
// writer has been completed (logger disposed). The event cannot be
// persisted — fault the Task (SiteEventLogging-012) rather than
// reporting false success, so a caller that awaits a critical audit
// event can tell it was dropped.
pending.Completion.TrySetException(
new ObjectDisposedException(nameof(SiteEventLogger),
"Event could not be recorded: the event logger has been disposed."));
}
return pending.Completion.Task;
@@ -191,9 +195,20 @@ public class SiteEventLogger : ISiteEventLogger, IDisposable
cmd.ExecuteNonQuery();
});
// WithConnection returns false only when the logger has been
// disposed; the event simply cannot be persisted in that case.
pending.Completion.TrySetResult();
if (written)
{
pending.Completion.TrySetResult();
}
else
{
// WithConnection returns false only when the logger has been
// disposed mid-drain; the event was not persisted. Fault the
// Task (SiteEventLogging-012) instead of reporting false
// success for a dropped audit event.
pending.Completion.TrySetException(
new ObjectDisposedException(nameof(SiteEventLogger),
"Event could not be recorded: the event logger was disposed before the write completed."));
}
}
catch (Exception ex)
{
@@ -56,6 +56,14 @@ public class AlarmActor : ReceiveActor
private readonly Script<object?>? _compiledTriggerExpression;
private readonly Dictionary<string, object?> _attributeSnapshot = new();
/// <summary>
/// SiteRuntime-017: the exact dictionary instance this actor was seeded from
/// at construction. The Instance Actor must pass a private snapshot here, not
/// its live <c>_attributes</c> field. Exposed for regression coverage of that
/// isolation contract.
/// </summary>
internal IReadOnlyDictionary<string, object?>? SeedAttributesReference { get; }
// Rate of change tracking
private readonly Queue<(DateTimeOffset Timestamp, double Value)> _rateOfChangeWindow = new();
private readonly TimeSpan _rateOfChangeWindowDuration;
@@ -90,6 +98,7 @@ public class AlarmActor : ReceiveActor
// Seed the trigger-expression attribute snapshot from the instance's
// initial attribute set so static attributes (which never re-emit an
// AttributeValueChanged after deploy) evaluate correctly at startup.
SeedAttributesReference = initialAttributes;
if (initialAttributes != null)
{
foreach (var kvp in initialAttributes)
@@ -4,7 +4,6 @@ using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Messages.DataConnection;
using ScadaLink.Commons.Messages.DebugView;
using ScadaLink.Commons.Messages.Instance;
using ScadaLink.Commons.Messages.Lifecycle;
using ScadaLink.Commons.Messages.ScriptExecution;
using ScadaLink.Commons.Messages.Streaming;
using ScadaLink.Commons.Types.Enums;
@@ -102,20 +101,13 @@ public class InstanceActor : ReceiveActor
// Handle static attribute writes
Receive<SetStaticAttributeCommand>(HandleSetStaticAttribute);
// Handle lifecycle messages
Receive<DisableInstanceCommand>(_ =>
{
_logger.LogInformation("Instance {Instance} received disable command", _instanceUniqueName);
Sender.Tell(new InstanceLifecycleResponse(
_.CommandId, _instanceUniqueName, true, null, DateTimeOffset.UtcNow));
});
Receive<EnableInstanceCommand>(_ =>
{
_logger.LogInformation("Instance {Instance} received enable command", _instanceUniqueName);
Sender.Tell(new InstanceLifecycleResponse(
_.CommandId, _instanceUniqueName, true, null, DateTimeOffset.UtcNow));
});
// SiteRuntime-019: the disable/enable lifecycle is owned entirely by the
// Deployment Manager — DeploymentManagerActor.HandleDisable/HandleEnable
// stop or re-create the Instance Actor directly and reply to the caller.
// DisableInstanceCommand / EnableInstanceCommand are never routed to the
// Instance Actor, so no handlers are registered here. (The previous no-op
// handlers were dead code that implied a non-existent instance-side
// acknowledgement contract.)
// WP-15: Handle script call requests — route to appropriate Script Actor (Ask pattern)
Receive<ScriptCallRequest>(HandleScriptCallRequest);
@@ -590,11 +582,25 @@ public class InstanceActor : ReceiveActor
/// WP-15: Script Actors spawned per script definition.
/// WP-16: Alarm Actors spawned per alarm definition, as peers to Script Actors.
/// WP-32: Compilation errors reject entire instance deployment (logged but actor still starts).
///
/// SiteRuntime-017: each child is seeded from a private point-in-time snapshot
/// of <c>_attributes</c>, NOT the live dictionary. The snapshot is taken here on
/// the Instance Actor thread, so it is race-free; handing the live mutable
/// <see cref="System.Collections.Generic.Dictionary{TKey,TValue}"/> by reference
/// would let a child constructor enumerate it on the child's mailbox thread while
/// this actor mutates it in <c>HandleAttributeValueChanged</c>.
/// </summary>
private void CreateChildActors()
{
if (_configuration == null) return;
// SiteRuntime-017: snapshot the live attribute dictionary once, on the
// Instance Actor thread, before any child is constructed. Each child
// Props closure captures this immutable copy instead of the mutable
// _attributes field, so no child constructor ever enumerates a
// dictionary this actor is concurrently mutating.
var attributeSnapshot = new Dictionary<string, object?>(_attributes);
// Create Script Actors
foreach (var script in _configuration.Scripts)
{
@@ -622,7 +628,7 @@ public class InstanceActor : ReceiveActor
_options,
_logger,
triggerExpression,
_attributes,
attributeSnapshot,
_healthCollector,
_serviceProvider));
@@ -672,7 +678,7 @@ public class InstanceActor : ReceiveActor
_options,
_logger,
triggerExpression,
_attributes,
attributeSnapshot,
_healthCollector));
var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}");
@@ -48,6 +48,15 @@ public class ScriptActor : ReceiveActor, IWithTimers
private bool _lastExpressionResult;
private readonly Dictionary<string, object?> _attributeSnapshot = new();
/// <summary>
/// SiteRuntime-017: the exact dictionary instance this actor was seeded from
/// at construction. The Instance Actor must pass a private snapshot here, not
/// its live <c>_attributes</c> field — sharing the live dictionary lets this
/// constructor enumerate it while the Instance Actor mutates it on another
/// thread. Exposed for regression coverage of that isolation contract.
/// </summary>
internal IReadOnlyDictionary<string, object?>? SeedAttributesReference { get; }
public ITimerScheduler Timers { get; set; } = null!;
public ScriptActor(
@@ -80,6 +89,7 @@ public class ScriptActor : ReceiveActor, IWithTimers
// Seed the trigger-expression attribute snapshot from the instance's
// initial attribute set so static attributes (which never re-emit an
// AttributeValueChanged after deploy) evaluate correctly at startup.
SeedAttributesReference = initialAttributes;
if (initialAttributes != null)
{
foreach (var kvp in initialAttributes)
@@ -14,9 +14,14 @@ namespace ScadaLink.SiteRuntime.Actors;
/// <summary>
/// WP-15: Script Execution Actor -- short-lived child of Script Actor.
/// Receives compiled code, params, Instance Actor ref, and call depth.
/// Runs on a dedicated blocking I/O dispatcher.
/// Executes the script via Script Runtime API, returns result, then stops.
///
/// The actor itself and its mailbox run on the default Akka dispatcher; only the
/// script body is dispatched off the actor thread, onto the dedicated
/// <see cref="ScadaLink.SiteRuntime.Scripts.ScriptExecutionScheduler"/>
/// (SiteRuntime-009), so blocking script I/O cannot starve the shared thread pool
/// or stall other Akka dispatchers.
///
/// WP-32: Script failures are logged but do not disable the script.
/// Supervision: Stop on unhandled exception (parent ScriptActor decides).
/// </summary>
@@ -20,6 +20,10 @@
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
<ItemGroup>
<InternalsVisibleTo Include="ScadaLink.SiteRuntime.Tests" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
<ProjectReference Include="../ScadaLink.Communication/ScadaLink.Communication.csproj" />
@@ -73,6 +73,22 @@ public class ReplicationService
message));
}
/// <summary>
/// WP-11 / StoreAndForward-016: Replicates an operator-initiated requeue (a parked
/// message moved back to the pending queue) to standby (fire-and-forget). The
/// carried message reflects the active node's post-requeue state (Pending,
/// retry_count = 0) so the standby's copy can be brought into sync.
/// </summary>
public void ReplicateRequeue(StoreAndForwardMessage message)
{
if (!_options.ReplicationEnabled || _replicationHandler == null) return;
FireAndForget(new ReplicationOperation(
ReplicationOperationType.Requeue,
message.Id,
message));
}
/// <summary>
/// WP-11: Applies a replicated operation received from the active node.
/// Used by the standby node to keep its SQLite in sync.
@@ -95,6 +111,15 @@ public class ReplicationService
operation.Message.Status = StoreAndForwardMessageStatus.Parked;
await storage.UpdateMessageAsync(operation.Message);
break;
case ReplicationOperationType.Requeue when operation.Message != null:
// StoreAndForward-016: an operator retried a parked message on the
// active node; mirror that on the standby by moving its row back to
// Pending with retry_count = 0 so a failover preserves the retry.
operation.Message.Status = StoreAndForwardMessageStatus.Pending;
operation.Message.RetryCount = 0;
await storage.UpdateMessageAsync(operation.Message);
break;
}
}
@@ -132,5 +157,10 @@ public enum ReplicationOperationType
{
Add,
Remove,
Park
Park,
/// <summary>
/// StoreAndForward-016: an operator moved a parked message back to the pending
/// queue. The standby resets its matching row to Pending with retry_count = 0.
/// </summary>
Requeue
}
@@ -27,7 +27,12 @@ public class StoreAndForwardMessage
/// </summary>
public int RetryCount { get; set; }
/// <summary>Maximum retry-sweep attempts before parking (0 = no limit).</summary>
/// <summary>
/// Maximum retry-sweep attempts before the message is parked.
/// <c>0</c> = no limit — the message is retried on every sweep until delivered
/// and is never parked for exhausting retries. This is <b>not</b> a "never retry"
/// value; a positive value is required to bound delivery attempts.
/// </summary>
public int MaxRetries { get; set; }
/// <summary>Retry interval in milliseconds.</summary>
@@ -10,7 +10,8 @@ namespace ScadaLink.StoreAndForward;
/// 1. Caller attempts immediate delivery via IDeliveryHandler
/// 2. On transient failure → buffer in SQLite → retry loop
/// 3. On success → remove from buffer
/// 4. On max retries → park
/// 4. On reaching MaxRetries → park (a MaxRetries of 0 means "no limit" — the
/// message is retried until delivered and is never parked for retry exhaustion)
/// 5. Permanent failures are returned to caller immediately (never buffered)
///
/// WP-10: Fixed retry interval (not exponential). Per-source-entity retry settings.
@@ -116,10 +117,38 @@ public class StoreAndForwardService
/// Attempts immediate delivery first. On transient failure, buffers for retry.
/// On permanent failure (handler returns false), returns false immediately.
///
/// WP-10: Retry-count lifecycle — the immediate (or caller-made) delivery attempt
/// is attempt 0 and is not counted; the background retry sweep increments
/// <see cref="StoreAndForwardMessage.RetryCount"/> on each retry. A buffered
/// message is parked once <c>RetryCount</c> reaches <paramref name="maxRetries"/>
/// — <b>but only when <paramref name="maxRetries"/> is greater than 0</b>. A
/// <paramref name="maxRetries"/> of <c>0</c> means <b>no limit</b>: the message is
/// retried on every sweep until it is delivered and is <b>never parked</b> on a
/// retry-count basis. It is therefore <i>not</i> a "do not retry" value — callers
/// that want delivery abandoned after a bounded number of attempts must pass a
/// positive <paramref name="maxRetries"/>.
///
/// WP-15: CachedCall idempotency note — this method does not deduplicate.
/// The caller (e.g., ExternalSystem.CachedCall()) is responsible for ensuring
/// that the remote system can handle duplicate deliveries safely.
/// </summary>
/// <param name="category">Message category (selects the delivery handler).</param>
/// <param name="target">Target system name (external system / notification list / DB connection).</param>
/// <param name="payloadJson">JSON-serialized call payload, treated opaquely.</param>
/// <param name="originInstanceName">Instance that originated the message (WP-13: survives instance deletion).</param>
/// <param name="maxRetries">
/// Maximum background retry-sweep attempts before the message is parked.
/// <b><c>0</c> = no limit</b> — the message is retried on every sweep until
/// delivered and is never parked for exhausting retries; it is <b>not</b> a
/// "never retry" value. <c>null</c> uses <see cref="StoreAndForwardOptions.DefaultMaxRetries"/>.
/// Must be positive to bound delivery attempts. Mirrors the
/// <see cref="StoreAndForwardMessage.MaxRetries"/> contract.
/// </param>
/// <param name="retryInterval">Fixed interval between retry sweeps for this message; <c>null</c> uses the configured default.</param>
/// <param name="attemptImmediateDelivery">
/// When <c>false</c>, the caller has already made its own delivery attempt and the
/// message is buffered directly for the retry sweep (the handler is not invoked here).
/// </param>
public async Task<StoreAndForwardResult> EnqueueAsync(
StoreAndForwardCategory category,
string target,
@@ -335,13 +364,27 @@ public class StoreAndForwardService
/// <summary>
/// WP-12: Retries a parked message (moves back to pending queue).
///
/// StoreAndForward-016: an operator requeue is a buffer state change and is
/// replicated to the standby (as a <see cref="ReplicationOperationType.Requeue"/>)
/// so a failover preserves the operator's retry intent.
/// StoreAndForward-017: the activity-log entry carries the message's true
/// category rather than a hard-coded one.
/// </summary>
public async Task<bool> RetryParkedMessageAsync(string messageId)
{
var success = await _storage.RetryParkedMessageAsync(messageId);
if (success)
{
RaiseActivity("Retry", StoreAndForwardCategory.ExternalSystem,
// Re-load the requeued row so the activity log gets the real category
// and the standby gets the post-requeue state (Pending, retry_count = 0).
var message = await _storage.GetMessageByIdAsync(messageId);
var category = message?.Category ?? StoreAndForwardCategory.ExternalSystem;
if (message != null)
{
_replication?.ReplicateRequeue(message);
}
RaiseActivity("Retry", category,
$"Parked message {messageId} moved back to queue");
}
return success;
@@ -349,13 +392,23 @@ public class StoreAndForwardService
/// <summary>
/// WP-12: Permanently discards a parked message.
///
/// StoreAndForward-016: an operator discard is a buffer removal and is replicated
/// to the standby (as a <see cref="ReplicationOperationType.Remove"/>) so the
/// discarded message does not reappear after a failover.
/// StoreAndForward-017: the activity-log entry carries the message's true
/// category rather than a hard-coded one.
/// </summary>
public async Task<bool> DiscardParkedMessageAsync(string messageId)
{
// Capture the category before the row is deleted so the activity log is
// labelled correctly.
var message = await _storage.GetMessageByIdAsync(messageId);
var success = await _storage.DiscardParkedMessageAsync(messageId);
if (success)
{
RaiseActivity("Discard", StoreAndForwardCategory.ExternalSystem,
_replication?.ReplicateRemove(messageId);
RaiseActivity("Discard", message?.Category ?? StoreAndForwardCategory.ExternalSystem,
$"Parked message {messageId} discarded");
}
return success;
@@ -714,7 +714,7 @@ public class FlatteningService
foreach (var composition in compositions)
ResolveComposedScriptsRecursive(
composition, composition.InstanceName,
composition, composition.InstanceName, parentPath: "",
compositionMap, composedTemplateChains, scripts, scriptCanonicalById,
new HashSet<int>());
}
@@ -723,11 +723,17 @@ public class FlatteningService
/// <summary>
/// Recursively resolves the scripts of a composed module and every module
/// nested inside it, path-qualifying each canonical name with the
/// accumulated <paramref name="prefix"/>.
/// accumulated <paramref name="prefix"/>. <paramref name="parentPath"/> is
/// the path of the enclosing module — empty for a depth-1 composition
/// (parent is the root template) and the enclosing module's
/// <c>prefix</c> for deeper nesting — and is carried into each script's
/// <see cref="ScriptScope"/> so a nested script's <c>Parent.X</c>
/// resolves against its real parent module.
/// </summary>
private static void ResolveComposedScriptsRecursive(
TemplateComposition composition,
string prefix,
string parentPath,
IReadOnlyDictionary<int, IReadOnlyList<TemplateComposition>> compositionMap,
IReadOnlyDictionary<int, IReadOnlyList<Template>> composedTemplateChains,
Dictionary<string, ResolvedScript> scripts,
@@ -747,7 +753,7 @@ public class FlatteningService
{
CanonicalName = canonicalName,
Source = "Composed",
Scope = new Commons.Types.Scripts.ScriptScope(SelfPath: prefix, ParentPath: "")
Scope = new Commons.Types.Scripts.ScriptScope(SelfPath: prefix, ParentPath: parentPath)
};
}
}
@@ -762,7 +768,7 @@ public class FlatteningService
foreach (var nested in nestedCompositions)
ResolveComposedScriptsRecursive(
nested, $"{prefix}.{nested.InstanceName}",
nested, $"{prefix}.{nested.InstanceName}", parentPath: prefix,
compositionMap, composedTemplateChains, scripts, scriptCanonicalById, visited);
}
}
@@ -705,12 +705,28 @@ public class TemplateService
{
var newDerivedName = $"{owner.Name}.{newInstanceName}";
var allTemplates = await _repository.GetAllTemplatesAsync(cancellationToken);
if (allTemplates.Any(t => t.Id != derived.Id && t.Name == newDerivedName))
return Result<TemplateComposition>.Failure(
$"Cannot rename derived template to '{newDerivedName}': a template with that name already exists.");
derived.Name = newDerivedName;
await _repository.UpdateTemplateAsync(derived, cancellationToken);
// The cascade of derived templates created by AddComposition follows a
// dotted path (Pump.TempSensor and the nested Pump.TempSensor.Probe1).
// Renaming the slot must rename every derived template in that cascade
// so the dotted-path naming invariant holds — pre-check every new name
// the cascade will introduce before any row mutates.
var renames = new List<(Template Template, string NewName)>();
await CollectCascadeRenamesAsync(derived, newDerivedName, renames, cancellationToken);
var renamedIds = renames.Select(r => r.Template.Id).ToHashSet();
foreach (var (_, newName) in renames)
{
if (allTemplates.Any(t => !renamedIds.Contains(t.Id) && t.Name == newName))
return Result<TemplateComposition>.Failure(
$"Cannot rename derived template to '{newName}': a template with that name already exists.");
}
foreach (var (template, newName) in renames)
{
template.Name = newName;
await _repository.UpdateTemplateAsync(template, cancellationToken);
}
}
composition.InstanceName = newInstanceName;
@@ -747,6 +763,30 @@ public class TemplateService
return Result<bool>.Success(true);
}
/// <summary>
/// Recursively collects the (template, new name) pairs for a renamed derived
/// template and every cascaded inner derived template beneath it. Each inner
/// derived's new name is re-derived from its renamed parent and the slot's
/// instance name (mirroring the cascade <see cref="CreateCascadedCompositionAsync"/>
/// builds and the recursion in <see cref="CascadeDeleteDerivedAsync"/>).
/// </summary>
private async Task CollectCascadeRenamesAsync(
Template derived,
string newName,
List<(Template Template, string NewName)> renames,
CancellationToken cancellationToken)
{
renames.Add((derived, newName));
foreach (var child in derived.Compositions.ToList())
{
var childDerived = await _repository.GetTemplateByIdAsync(child.ComposedTemplateId, cancellationToken);
if (childDerived != null && childDerived.IsDerived && childDerived.OwnerCompositionId == child.Id)
await CollectCascadeRenamesAsync(
childDerived, $"{newName}.{child.InstanceName}", renames, cancellationToken);
}
}
/// <summary>
/// Recursively deletes a derived template along with the cascade of inner
/// derived templates the compose flow created. Each composition row on the
@@ -68,6 +68,22 @@ public class CommandTreeTests
Assert.True(leaf.Action != null, $"Leaf command '{leaf.Name}' has no action."));
}
[Fact]
public void TemplateCompositionDelete_IsKeyedByIdOnly()
{
// CLI-015: the in-repo README documented `template composition delete` with
// --template-id / --instance-name, but the implementation keys deletion by the
// composition's own integer ID via a single --id option. Pin the real surface.
var template = TemplateCommands.Build(Url, Format, Username, Password);
var composition = template.Subcommands.Single(c => c.Name == "composition");
var delete = composition.Subcommands.Single(c => c.Name == "delete");
var optionNames = delete.Options.Select(o => o.Name).ToList();
Assert.Contains("--id", optionNames);
Assert.DoesNotContain("--template-id", optionNames);
Assert.DoesNotContain("--instance-name", optionNames);
}
[Theory]
[InlineData(typeof(GetInstanceCommand))]
[InlineData(typeof(ListSitesCommand))]
@@ -0,0 +1,91 @@
using ScadaLink.CLI;
using ScadaLink.CLI.Commands;
namespace ScadaLink.CLI.Tests;
/// <summary>
/// Regression tests for CLI-016 — <c>WriteAsTable</c> previously derived the table
/// header set from the first array element only, so any property unique to a later
/// element was silently dropped from the rendered table.
/// </summary>
[Collection("Console")]
public class TableHeaderUnionTests
{
[Fact]
public void HandleResponse_TableFormat_HeterogeneousArray_IncludesAllColumns()
{
// The second element has a "Status" property the first lacks. The pre-fix code
// derived headers from items[0] only, so "Status" (and its value "Faulted")
// were dropped from the table entirely.
var writer = new StringWriter();
Console.SetOut(writer);
try
{
var json = "[{\"Id\":1,\"Name\":\"Alpha\"},{\"Id\":2,\"Name\":\"Beta\",\"Status\":\"Faulted\"}]";
var response = new ManagementResponse(200, json, null, null);
var exitCode = CommandHelpers.HandleResponse(response, "table");
Assert.Equal(0, exitCode);
var output = writer.ToString();
Assert.Contains("Status", output);
Assert.Contains("Faulted", output);
}
finally
{
Console.SetOut(new StreamWriter(Console.OpenStandardOutput()) { AutoFlush = true });
}
}
[Fact]
public void HandleResponse_TableFormat_HeterogeneousArray_PreservesFirstSeenColumnOrder()
{
// Column order must be the first-seen order across all elements: the first
// element contributes Id, Name; the second contributes Status after them.
var writer = new StringWriter();
Console.SetOut(writer);
try
{
var json = "[{\"Id\":1,\"Name\":\"Alpha\"},{\"Status\":\"Faulted\",\"Id\":2,\"Name\":\"Beta\"}]";
var response = new ManagementResponse(200, json, null, null);
var exitCode = CommandHelpers.HandleResponse(response, "table");
Assert.Equal(0, exitCode);
var output = writer.ToString();
var headerLine = output.Split('\n')[0];
Assert.True(headerLine.IndexOf("Id") < headerLine.IndexOf("Name"));
Assert.True(headerLine.IndexOf("Name") < headerLine.IndexOf("Status"));
}
finally
{
Console.SetOut(new StreamWriter(Console.OpenStandardOutput()) { AutoFlush = true });
}
}
[Fact]
public void HandleResponse_TableFormat_FirstElementHasExtraColumn_StillRendersAllRows()
{
// The reverse case: the first element has a property a later element lacks.
// The later row must still render (with an empty cell), and all columns kept.
var writer = new StringWriter();
Console.SetOut(writer);
try
{
var json = "[{\"Id\":1,\"Name\":\"Alpha\",\"Note\":\"first\"},{\"Id\":2,\"Name\":\"Beta\"}]";
var response = new ManagementResponse(200, json, null, null);
var exitCode = CommandHelpers.HandleResponse(response, "table");
Assert.Equal(0, exitCode);
var output = writer.ToString();
Assert.Contains("Note", output);
Assert.Contains("first", output);
Assert.Contains("Beta", output);
}
finally
{
Console.SetOut(new StreamWriter(Console.OpenStandardOutput()) { AutoFlush = true });
}
}
}
@@ -0,0 +1,90 @@
using System.CommandLine;
using ScadaLink.CLI.Commands;
namespace ScadaLink.CLI.Tests;
/// <summary>
/// Regression tests for CLI-014. The <c>Update*Command</c> records in Commons carry
/// non-nullable "core" fields (e.g. <c>string Name</c>, <c>string Protocol</c>,
/// <c>string Script</c>) — an update is a <em>whole-entity replace</em>, not a sparse
/// patch. The CLI must therefore mark those core flags as <c>Required</c>: making them
/// optional would let an omitted flag send <c>null</c>/empty and silently blank the
/// field server-side. These tests pin that contract so the documented surface and the
/// implemented surface stay aligned.
/// </summary>
public class UpdateCommandContractTests
{
private static readonly Option<string> Url = new("--url") { Recursive = true };
private static readonly Option<string> Username = new("--username") { Recursive = true };
private static readonly Option<string> Password = new("--password") { Recursive = true };
private static readonly Option<string> Format = CliOptions.CreateFormatOption();
private static Command UpdateCommand(Command group, params string[] path)
{
var current = group;
foreach (var segment in path)
current = current.Subcommands.Single(c => c.Name == segment);
return current;
}
private static void AssertRequired(Command command, params string[] requiredOptionNames)
{
foreach (var name in requiredOptionNames)
{
var option = command.Options.SingleOrDefault(o => o.Name == name);
Assert.True(option != null, $"'{command.Name}' is missing expected option '{name}'.");
Assert.True(option!.Required, $"'{command.Name}' option '{name}' must be Required (whole-replace contract).");
}
}
[Fact]
public void TemplateUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(TemplateCommands.Build(Url, Format, Username, Password), "update"), "--name");
[Fact]
public void TemplateAttributeUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(TemplateCommands.Build(Url, Format, Username, Password), "attribute", "update"), "--name", "--data-type");
[Fact]
public void TemplateAlarmUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(TemplateCommands.Build(Url, Format, Username, Password), "alarm", "update"), "--name", "--trigger-type", "--priority");
[Fact]
public void TemplateScriptUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(TemplateCommands.Build(Url, Format, Username, Password), "script", "update"), "--name", "--code");
[Fact]
public void SiteUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(SiteCommands.Build(Url, Format, Username, Password), "update"), "--name");
[Fact]
public void DataConnectionUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(DataConnectionCommands.Build(Url, Format, Username, Password), "update"), "--name", "--protocol");
[Fact]
public void ExternalSystemUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(ExternalSystemCommands.Build(Url, Format, Username, Password), "update"), "--name", "--endpoint-url", "--auth-type");
[Fact]
public void NotificationUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(NotificationCommands.Build(Url, Format, Username, Password), "update"), "--name", "--emails");
[Fact]
public void ApiMethodUpdate_CoreFieldsRequired()
=> AssertRequired(UpdateCommand(ApiMethodCommands.Build(Url, Format, Username, Password), "update"), "--script");
[Fact]
public void ExternalSystemMethodUpdate_IsGenuinelySparse_CoreFieldsOptional()
{
// UpdateExternalSystemMethodCommand is the one update record whose fields are
// genuinely all-nullable, so its flags are correctly optional. Pin that too so
// it is not mistakenly forced to Required.
var update = UpdateCommand(ExternalSystemCommands.Build(Url, Format, Username, Password), "method", "update");
foreach (var name in new[] { "--name", "--http-method", "--path" })
{
var option = update.Options.SingleOrDefault(o => o.Name == name);
Assert.True(option != null, $"method update is missing option '{name}'.");
Assert.False(option!.Required, $"method update option '{name}' should be optional (sparse-patch record).");
}
}
}
@@ -0,0 +1,90 @@
using System.Security.Claims;
using Microsoft.AspNetCore.Builder;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Routing;
using Microsoft.Extensions.DependencyInjection;
using ScadaLink.CentralUI.Auth;
namespace ScadaLink.CentralUI.Tests.Auth;
/// <summary>
/// Regression tests for CentralUI-020. The Blazor circuit's
/// <c>CookieAuthenticationStateProvider</c> serves a frozen constructor-time
/// principal, so <c>SessionExpiry</c> could never observe a server-side cookie
/// expiry by polling the auth state. The fix adds <c>GET /auth/ping</c>, an
/// endpoint evaluated per HTTP request (where the cookie middleware re-validates
/// the cookie): it returns 200 while the session is live and 401 once the
/// cookie has lapsed, giving <c>SessionExpiry</c> a real signal to redirect on.
/// </summary>
public class AuthPingEndpointTests
{
private static IReadOnlyList<RouteEndpoint> BuildEndpoints()
{
var builder = WebApplication.CreateBuilder();
builder.Services.AddRouting();
builder.Services.AddAntiforgery();
var app = builder.Build();
app.MapAuthEndpoints();
return ((IEndpointRouteBuilder)app).DataSources
.SelectMany(ds => ds.Endpoints)
.OfType<RouteEndpoint>()
.ToList();
}
private static RouteEndpoint? Find(IReadOnlyList<RouteEndpoint> endpoints, string pattern, string method)
=> endpoints.FirstOrDefault(e =>
e.RoutePattern.RawText == pattern &&
(e.Metadata.GetMetadata<HttpMethodMetadata>()?.HttpMethods.Contains(method) ?? false));
[Fact]
public void AuthPing_GetRoute_IsMapped()
{
var ping = Find(BuildEndpoints(), "/auth/ping", "GET");
Assert.NotNull(ping);
}
[Fact]
public async Task AuthPing_AnonymousUser_Returns401()
{
var context = new DefaultHttpContext
{
User = new ClaimsPrincipal(new ClaimsIdentity()) // not authenticated
};
await AuthEndpoints.HandlePing(context);
Assert.Equal(StatusCodes.Status401Unauthorized, context.Response.StatusCode);
}
[Fact]
public async Task AuthPing_AuthenticatedUser_Returns200()
{
var identity = new ClaimsIdentity(
new[] { new Claim(ClaimTypes.Name, "alice") }, authenticationType: "TestCookie");
var context = new DefaultHttpContext
{
User = new ClaimsPrincipal(identity)
};
await AuthEndpoints.HandlePing(context);
Assert.Equal(StatusCodes.Status200OK, context.Response.StatusCode);
}
[Fact]
public void AuthPing_DoesNotTriggerCookieRedirect()
{
// The endpoint must NOT use RequireAuthorization(): that would make the
// cookie middleware answer an expired request with a 302 to /login,
// which a fetch() follows transparently and reads as a 200 login page —
// SessionExpiry would never see the expiry. The endpoint allows
// anonymous access and decides 200/401 itself.
var ping = Find(BuildEndpoints(), "/auth/ping", "GET");
Assert.NotNull(ping);
var authorize = ping!.Metadata
.GetOrderedMetadata<Microsoft.AspNetCore.Authorization.IAuthorizeData>();
Assert.Empty(authorize);
}
}
@@ -0,0 +1,88 @@
using System.Security.Claims;
using Microsoft.AspNetCore.Components.Authorization;
using ScadaLink.CentralUI.Auth;
using ScadaLink.Security;
namespace ScadaLink.CentralUI.Tests.Auth;
/// <summary>
/// Regression tests for CentralUI-024. Ten components each copy-pasted a
/// <c>GetCurrentUserAsync</c> helper using the magic string
/// <c>FindFirst("Username")</c>, and <c>NavMenu</c>/<c>Dashboard</c> used
/// <c>FindFirst("DisplayName")</c>. A rename of the claim type in
/// <see cref="JwtTokenService"/> (the single source of truth) would have
/// silently broken every call site. The shared
/// <see cref="ClaimsPrincipalExtensions"/> helpers now resolve the claim type
/// through the <c>JwtTokenService</c> constants.
/// </summary>
public class ClaimsPrincipalExtensionsTests
{
private static ClaimsPrincipal Principal(params Claim[] claims)
=> new(new ClaimsIdentity(claims, authenticationType: "TestCookie"));
[Fact]
public void GetUsername_ResolvesTheJwtTokenServiceUsernameClaim()
{
var principal = Principal(
new Claim(JwtTokenService.UsernameClaimType, "alice"));
Assert.Equal("alice", principal.GetUsername());
}
[Fact]
public void GetUsername_FallsBackToUnknown_WhenClaimAbsent()
{
var principal = Principal();
Assert.Equal(ClaimsPrincipalExtensions.UnknownUser, principal.GetUsername());
}
[Fact]
public void GetDisplayName_ResolvesTheJwtTokenServiceDisplayNameClaim()
{
var principal = Principal(
new Claim(JwtTokenService.DisplayNameClaimType, "Alice Anderson"));
Assert.Equal("Alice Anderson", principal.GetDisplayName());
}
[Fact]
public void GetDisplayName_IsNull_WhenClaimAbsent()
{
Assert.Null(Principal().GetDisplayName());
}
[Fact]
public async Task GetCurrentUsernameAsync_ReadsUsernameFromAuthState()
{
var principal = Principal(
new Claim(JwtTokenService.UsernameClaimType, "bob"));
var provider = new StubAuthStateProvider(
new AuthenticationState(principal));
Assert.Equal("bob", await provider.GetCurrentUsernameAsync());
}
[Fact]
public void Username_LookupTracksAJwtTokenServiceRename()
{
// The lookup must NOT use a hard-coded "Username" literal: if the
// constant's *value* is ever changed, the helper must follow it. Build a
// principal whose claim carries the JwtTokenService constant's current
// value and confirm the helper finds it via that same constant.
var principal = Principal(
new Claim(JwtTokenService.UsernameClaimType, "carol"));
Assert.Equal("carol",
principal.FindFirst(JwtTokenService.UsernameClaimType)?.Value);
Assert.Equal("carol", principal.GetUsername());
}
private sealed class StubAuthStateProvider : AuthenticationStateProvider
{
private readonly AuthenticationState _state;
public StubAuthStateProvider(AuthenticationState state) => _state = state;
public override Task<AuthenticationState> GetAuthenticationStateAsync()
=> Task.FromResult(_state);
}
}
@@ -0,0 +1,88 @@
using Bunit;
using Bunit.TestDoubles;
using Microsoft.AspNetCore.Components;
using Microsoft.Extensions.DependencyInjection;
using ScadaLink.CentralUI.Components.Shared;
namespace ScadaLink.CentralUI.Tests.Auth;
/// <summary>
/// Regression tests for CentralUI-020 and CentralUI-025. <c>SessionExpiry</c>
/// used to poll the Blazor <c>AuthenticationStateProvider</c>, which (via
/// <c>CookieAuthenticationStateProvider</c>) serves a frozen constructor-time
/// principal — so the polled state could never become "expired" and the
/// idle-logout redirect never fired. The component now polls the server
/// <c>GET /auth/ping</c> endpoint, which reflects the live cookie session: a
/// 401 response triggers a redirect to <c>/login</c>. These tests exercise that
/// redirect path directly (CentralUI-025: the path was previously untested).
/// </summary>
public class SessionExpiryComponentTests : BunitContext
{
private const string ModulePath = "./_content/ScadaLink.CentralUI/js/session-expiry.js";
[Fact]
public async Task CheckSession_ExpiredSession_RedirectsToLogin()
{
// The server reports the cookie has lapsed: ping returns HTTP 401.
var module = JSInterop.SetupModule(ModulePath);
module.Setup<int>("ping", "/auth/ping").SetResult(401);
var nav = Services.GetRequiredService<NavigationManager>();
var cut = Render<SessionExpiry>();
await cut.InvokeAsync(() => cut.Instance.CheckSessionAsync());
Assert.EndsWith("/login", nav.Uri);
}
[Fact]
public async Task CheckSession_LiveSession_DoesNotRedirect()
{
// The server reports the session is still valid: ping returns HTTP 200.
var module = JSInterop.SetupModule(ModulePath);
module.Setup<int>("ping", "/auth/ping").SetResult(200);
var nav = Services.GetRequiredService<NavigationManager>();
var before = nav.Uri;
var cut = Render<SessionExpiry>();
await cut.InvokeAsync(() => cut.Instance.CheckSessionAsync());
Assert.Equal(before, nav.Uri);
Assert.DoesNotContain("/login", nav.Uri);
}
[Fact]
public async Task CheckSession_TransientNetworkFailure_DoesNotRedirect()
{
// A network blip surfaces as status 0 — inconclusive. The component must
// NOT log an authenticated user out on a transient failure.
var module = JSInterop.SetupModule(ModulePath);
module.Setup<int>("ping", "/auth/ping").SetResult(0);
var nav = Services.GetRequiredService<NavigationManager>();
var before = nav.Uri;
var cut = Render<SessionExpiry>();
await cut.InvokeAsync(() => cut.Instance.CheckSessionAsync());
Assert.Equal(before, nav.Uri);
}
[Fact]
public async Task CheckSession_OnLoginPage_DoesNotPingOrRedirect()
{
// On /login the component must neither poll nor redirect (a /login →
// /login redirect would loop). JSInterop is left in Strict mode with no
// module setup, so any ping call would throw and fail the test.
var nav = (BunitNavigationManager)Services
.GetRequiredService<NavigationManager>();
nav.NavigateTo("login");
var cut = Render<SessionExpiry>();
await cut.InvokeAsync(() => cut.Instance.CheckSessionAsync());
// No JS module import was attempted and the URL is unchanged.
Assert.EndsWith("/login", nav.Uri);
}
}
@@ -0,0 +1,153 @@
using System.Collections;
using System.Reflection;
using System.Security.Claims;
using Bunit;
using Microsoft.AspNetCore.Components.Authorization;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using NSubstitute;
using ScadaLink.CentralUI.Auth;
using ScadaLink.Commons.Entities.Sites;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Messages.Streaming;
using ScadaLink.Communication;
using ScadaLink.Communication.Grpc;
using DebugViewPage = ScadaLink.CentralUI.Components.Pages.Deployment.DebugView;
namespace ScadaLink.CentralUI.Tests.Deployment;
/// <summary>
/// Regression tests for CentralUI-021. The <c>DebugView</c> stream callback runs
/// on an Akka/gRPC thread; it used to call <c>UpsertWithCap</c> directly on that
/// thread, mutating the <c>_attributeValues</c>/<c>_alarmStates</c>
/// <see cref="Dictionary{TKey,TValue}"/> while the render thread enumerated the
/// same dictionaries via <c>FilteredAttributeValues</c>. <c>Dictionary</c> is
/// not thread-safe, so the write could throw "Collection was modified" or
/// corrupt the buckets. The fix routes the callback through
/// <c>HandleStreamEvent</c>, which marshals the mutation onto the renderer's
/// dispatcher so every dictionary access happens on one thread.
/// </summary>
public class DebugViewStreamRaceTests : BunitContext
{
private IRenderedComponent<DebugViewPage> RenderPage()
{
JSInterop.Mode = JSRuntimeMode.Loose;
var repo = Substitute.For<ITemplateEngineRepository>();
var siteRepo = Substitute.For<ISiteRepository>();
siteRepo.GetAllSitesAsync().Returns(new List<Site>());
Services.AddSingleton(repo);
Services.AddSingleton(siteRepo);
var comms = new CommunicationService(
Options.Create(new CommunicationOptions()),
NullLogger<CommunicationService>.Instance);
Services.AddSingleton(comms);
var grpcFactory = new SiteStreamGrpcClientFactory(NullLoggerFactory.Instance);
var debugStream = new DebugStreamService(
comms, Services.BuildServiceProvider(), grpcFactory,
NullLogger<DebugStreamService>.Instance);
Services.AddSingleton(debugStream);
var identity = new ClaimsIdentity(
new[] { new Claim(ClaimTypes.Name, "deployer") }, "TestCookie");
var stubAuth = new StubAuthStateProvider(
new AuthenticationState(new ClaimsPrincipal(identity)));
Services.AddSingleton<AuthenticationStateProvider>(stubAuth);
Services.AddScoped(_ => new SiteScopeService(stubAuth));
return Render<DebugViewPage>();
}
private sealed class StubAuthStateProvider : AuthenticationStateProvider
{
private readonly AuthenticationState _state;
public StubAuthStateProvider(AuthenticationState state) => _state = state;
public override Task<AuthenticationState> GetAuthenticationStateAsync()
=> Task.FromResult(_state);
}
private static MethodInfo HandleStreamEvent => typeof(DebugViewPage).GetMethod(
"HandleStreamEvent", BindingFlags.Instance | BindingFlags.NonPublic)!;
private static IDictionary AttributeValues(DebugViewPage c) => (IDictionary)
typeof(DebugViewPage).GetField("_attributeValues",
BindingFlags.Instance | BindingFlags.NonPublic)!.GetValue(c)!;
private static IEnumerable FilteredAttributeValues(DebugViewPage c) => (IEnumerable)
typeof(DebugViewPage).GetProperty("FilteredAttributeValues",
BindingFlags.Instance | BindingFlags.NonPublic)!.GetValue(c)!;
[Fact]
public void HandleStreamEvent_AppliesUpdate_OnceDispatcherRuns()
{
// The fix defers the mutation onto the dispatcher — it must not drop it.
var cut = RenderPage();
var dict = AttributeValues(cut.Instance);
var evt = new AttributeValueChanged(
"Inst-1", "Pump.Speed", "Speed", 42, "Good", DateTimeOffset.UtcNow);
HandleStreamEvent.Invoke(cut.Instance, new object[] { evt });
cut.WaitForState(() => dict.Count == 1, TimeSpan.FromSeconds(2));
Assert.True(dict.Contains("Speed"));
}
[Fact]
public async Task HandleStreamEvent_OffThreadEvents_DoNotFaultDispatcherReads()
{
// CentralUI-021 reproduction. Writers fire stream events from background
// threads (the Akka/gRPC callback threads). The reader enumerates
// FilteredAttributeValues *through the renderer's dispatcher* — exactly
// as the real render thread does. Pre-fix the writers mutated the
// Dictionary directly on their own threads, racing the dispatcher-side
// enumeration and intermittently throwing "Collection was modified".
// Post-fix every write is marshalled onto the dispatcher, so writes and
// reads are serialised on one thread and the enumeration never faults.
var cut = RenderPage();
var dict = AttributeValues(cut.Instance);
Exception? failure = null;
using var stop = new CancellationTokenSource();
var writers = Enumerable.Range(0, 4).Select(w => Task.Run(() =>
{
try
{
for (var i = 0; i < 600 && !stop.IsCancellationRequested; i++)
{
var evt = new AttributeValueChanged(
"Inst-1", $"Tag.{w}.{i}", $"Tag-{w}-{i}",
i, "Good", DateTimeOffset.UtcNow);
HandleStreamEvent.Invoke(cut.Instance, new object[] { evt });
}
}
catch (Exception ex) { failure ??= ex; stop.Cancel(); }
})).ToArray();
var reader = Task.Run(async () =>
{
try
{
while (!stop.IsCancellationRequested)
{
await cut.InvokeAsync(() =>
{
foreach (var _ in FilteredAttributeValues(cut.Instance)) { }
});
}
}
catch (Exception ex) { failure ??= ex; stop.Cancel(); }
});
await Task.WhenAll(writers);
stop.Cancel();
await reader.WaitAsync(TimeSpan.FromSeconds(5));
Assert.Null(failure);
// Sanity: events were actually delivered (cap is honoured separately).
cut.WaitForState(() => dict.Count > 0, TimeSpan.FromSeconds(2));
}
}
@@ -109,4 +109,48 @@ public class DeploymentsPushUpdateTests : BunitContext
_deployRepo.DidNotReceive()
.GetAllDeploymentRecordsAsync(Arg.Any<CancellationToken>());
}
/// <summary>
/// Regression test for CentralUI-022. The notifier is a process singleton:
/// it can read its subscriber list and begin invoking
/// <c>OnDeploymentStatusChanged</c> on the DeploymentManager thread an
/// instant before the component is disposed. The handler must no-op against
/// a disposed component rather than letting <c>InvokeAsync</c> throw an
/// unobserved <see cref="ObjectDisposedException"/>.
/// </summary>
[Fact]
public void Deployments_HasDisposalGuardField()
{
var field = typeof(DeploymentsPage).GetField(
"_disposed", BindingFlags.Instance | BindingFlags.NonPublic);
Assert.NotNull(field);
Assert.Equal(typeof(bool), field!.FieldType);
}
[Fact]
public void Deployments_StatusChangeAfterDispose_DoesNotThrowOrReload()
{
RegisterServices();
var cut = Render<DeploymentsPage>();
var component = cut.Instance;
component.Dispose();
_deployRepo.ClearReceivedCalls();
// Simulate the race: the notifier captured the handler before the
// Dispose() unsubscribe and invokes it directly against the now-disposed
// component. Pre-fix this dispatched InvokeAsync against a dead circuit
// and threw ObjectDisposedException on a fire-and-forget task.
var handler = typeof(DeploymentsPage).GetMethod(
"OnDeploymentStatusChanged", BindingFlags.Instance | BindingFlags.NonPublic)!;
var ex = Record.Exception(() => handler.Invoke(component,
new object[] { new DeploymentStatusChange("dep-9", 1, DeploymentStatus.Success) }));
Assert.Null(ex);
// The guard short-circuits before any reload is attempted.
_deployRepo.DidNotReceive()
.GetAllDeploymentRecordsAsync(Arg.Any<CancellationToken>());
}
}
@@ -13,9 +13,23 @@ namespace ScadaLink.CentralUI.Tests.Shared;
/// </summary>
public class DiffDialogTests : BunitContext
{
/// <summary>
/// DiffDialog applies/removes a body scroll-lock class via JS interop on
/// open/close. CentralUI-023 narrowed those catch blocks so they no longer
/// swallow every exception — including bUnit's strict-mode unplanned-call
/// exception. Tests that exercise open/close must therefore register the
/// body-class calls so they do not surface as harness exceptions.
/// </summary>
private void SetupBodyLockInterop()
{
JSInterop.SetupVoid("document.body.classList.add", "modal-open");
JSInterop.SetupVoid("document.body.classList.remove", "modal-open");
}
[Fact]
public async Task DisposeAsync_WhileOpen_CompletesPendingTask()
{
SetupBodyLockInterop();
var cut = Render<DiffDialog>();
// Open the dialog; the returned task represents the caller's await.
@@ -41,6 +55,7 @@ public class DiffDialogTests : BunitContext
[Fact]
public async Task Close_CompletesPendingTaskWithTrue()
{
SetupBodyLockInterop();
var cut = Render<DiffDialog>();
// Block-bodied lambda so InvokeAsync sees a void delegate — it must NOT

Some files were not shown because too many files have changed in this diff Show More