Compare commits
36 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d30ded7e72 | |||
| a0e6a36e79 | |||
| 7d7214a4ca | |||
| 340a70f0e6 | |||
| ab098bf6c8 | |||
| fccd3274d3 | |||
| d7630d80fe | |||
| db08c6eb38 | |||
| 9043f0089b | |||
| 301e7fb854 | |||
| 87f14c190a | |||
| 5a08b04535 | |||
| d8f99ba781 | |||
| 91438dcc1b | |||
| 61253e3269 | |||
| a9bd7ee37c | |||
| a9ceba00d0 | |||
| 239bee3bc4 | |||
| 977d7369a7 | |||
| 9c60592632 | |||
| fd1518f4f4 | |||
| b949dc4183 | |||
| 3cc174c3cd | |||
| d030153378 | |||
| d63d412461 | |||
| 0a535cd4a5 | |||
| 5065384305 | |||
| bf3f572ad9 | |||
| 3499d76f14 | |||
| 78b10d00d8 | |||
| 41c3fa3d84 | |||
| 9e21b47080 | |||
| f789ab4a91 | |||
| 199cdbe798 | |||
| 8050a1996f | |||
| c94d3b7570 |
@@ -32,3 +32,7 @@ TestResults/
|
|||||||
**/logs/
|
**/logs/
|
||||||
site_events.db
|
site_events.db
|
||||||
data/
|
data/
|
||||||
|
|
||||||
|
# Claude Code local files
|
||||||
|
.claude/settings.local.json
|
||||||
|
.claude/scheduled_tasks.lock
|
||||||
|
|||||||
@@ -0,0 +1,69 @@
|
|||||||
|
<Project>
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<PackageVersion Include="Akka" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Cluster" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Cluster.Hosting" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Cluster.Tools" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Hosting" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Remote" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Remote.Hosting" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Streams" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.Streams.TestKit" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="Akka.TestKit.Xunit2" Version="1.5.62" />
|
||||||
|
<PackageVersion Include="AspNetCore.HealthChecks.UI.Client" Version="9.0.0" />
|
||||||
|
<PackageVersion Include="bunit" Version="2.0.33-preview" />
|
||||||
|
<PackageVersion Include="coverlet.collector" Version="6.0.4" />
|
||||||
|
<PackageVersion Include="FluentAssertions" Version="8.3.0" />
|
||||||
|
<PackageVersion Include="Google.Protobuf" Version="3.29.3" />
|
||||||
|
<PackageVersion Include="Grpc.AspNetCore" Version="2.71.0" />
|
||||||
|
<PackageVersion Include="Grpc.Net.Client" Version="2.71.0" />
|
||||||
|
<PackageVersion Include="Grpc.Tools" Version="2.71.0" />
|
||||||
|
<PackageVersion Include="MailKit" Version="4.16.0" />
|
||||||
|
<PackageVersion Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.AspNetCore.Authorization" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.AspNetCore.DataProtection" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.AspNetCore.Mvc.Testing" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.AspNetCore.SignalR.Client" Version="9.0.3" />
|
||||||
|
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="5.0.0" />
|
||||||
|
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Workspaces" Version="5.0.0" />
|
||||||
|
<PackageVersion Include="Microsoft.Data.SqlClient" Version="6.0.2" />
|
||||||
|
<PackageVersion Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.EntityFrameworkCore" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.EntityFrameworkCore.InMemory" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.EntityFrameworkCore.Sqlite" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.EntityFrameworkCore.SqlServer" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Configuration.Json" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.DependencyInjection" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Hosting.Abstractions" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Hosting.WindowsServices" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Http" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Logging" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="10.0.7" />
|
||||||
|
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
|
||||||
|
<PackageVersion Include="Microsoft.Playwright" Version="1.58.0" />
|
||||||
|
<PackageVersion Include="Moq" Version="4.20.72" />
|
||||||
|
<PackageVersion Include="Novell.Directory.Ldap.NETStandard" Version="3.6.0" />
|
||||||
|
<PackageVersion Include="NSubstitute" Version="5.3.0" />
|
||||||
|
<PackageVersion Include="OPCFoundation.NetStandard.Opc.Ua.Client" Version="1.5.378.106" />
|
||||||
|
<PackageVersion Include="OpenTelemetry.Api" Version="1.15.3" />
|
||||||
|
<PackageVersion Include="Serilog" Version="4.3.1" />
|
||||||
|
<PackageVersion Include="Serilog.AspNetCore" Version="10.0.0" />
|
||||||
|
<PackageVersion Include="Serilog.Sinks.Console" Version="6.1.1" />
|
||||||
|
<PackageVersion Include="Serilog.Sinks.File" Version="7.0.0" />
|
||||||
|
<PackageVersion Include="System.CommandLine" Version="2.0.5" />
|
||||||
|
<PackageVersion Include="System.IdentityModel.Tokens.Jwt" Version="8.11.0" />
|
||||||
|
<PackageVersion Include="xunit" Version="2.9.3" />
|
||||||
|
<PackageVersion Include="xunit.runner.visualstudio" Version="3.1.4" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
</Project>
|
||||||
@@ -0,0 +1,447 @@
|
|||||||
|
# Code Review — CLI
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.CLI` |
|
||||||
|
| Design doc | `docs/requirements/Component-CLI.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 12 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The CLI is a small, well-structured HTTP client over the Management API. The command-tree
|
||||||
|
construction is consistent and repetitive in a good way: every subcommand funnels through
|
||||||
|
`CommandHelpers.ExecuteCommandAsync`, which centralizes URL/credential resolution, HTTP
|
||||||
|
dispatch, and response handling. There are no Akka.NET concerns (the CLI is a pure HTTP
|
||||||
|
client) and no concurrency-sensitive code apart from the `debug stream` SignalR handler.
|
||||||
|
|
||||||
|
The dominant theme is **graceful-degradation gaps**: several user-supplied inputs (malformed
|
||||||
|
URLs, malformed `--bindings`/`--overrides` JSON, non-JSON success bodies) are deserialized
|
||||||
|
or constructed without `try/catch`, so a normal user mistake surfaces as an unhandled
|
||||||
|
exception with a stack trace instead of a clean error message and exit code 1. A second
|
||||||
|
theme is **dead configuration**: the `SCADALINK_FORMAT` environment variable and the
|
||||||
|
`defaultFormat` config-file field are loaded by `CliConfig` but never consulted by any
|
||||||
|
command, so the documented format-precedence chain does not work. A third theme is
|
||||||
|
**substantial design-document drift**: `Component-CLI.md` describes a name-keyed,
|
||||||
|
`--file`-based command surface that bears little resemblance to the implemented
|
||||||
|
ID-keyed, flag-based surface. Test coverage exercises `OutputFormatter`, `CliConfig`, and
|
||||||
|
`CommandHelpers.HandleResponse`, but the HTTP client, the `debug stream` path, the JSON
|
||||||
|
argument parsing, and the command-tree wiring are untested.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | Format precedence is broken (CLI-001); empty/non-JSON success bodies crash table rendering (CLI-002, CLI-003). |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | Not applicable — CLI is a pure HTTP/SignalR client with no Akka.NET runtime (design doc confirms). No issues. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Only `debug stream` is concurrent; `CancellationTokenSource` is never disposed (CLI-011). Exit-code resolution after Ctrl+C is loose (CLI-012). |
|
||||||
|
| 4 | Error handling & resilience | ☑ | Unhandled exceptions on malformed URL (CLI-004) and malformed JSON arguments (CLI-005); `StartAsync` cancellation is misreported (CLI-010). |
|
||||||
|
| 5 | Security | ☑ | `--password` on the command line leaks into process listings / shell history with no env-var or prompt alternative (CLI-006). |
|
||||||
|
| 6 | Performance & resource management | ☑ | `HttpClient` per invocation is acceptable for a one-shot CLI. `CancellationTokenSource` leak noted in CLI-011. |
|
||||||
|
| 7 | Design-document adherence | ☑ | `Component-CLI.md` is heavily stale relative to the implemented command surface (CLI-007). |
|
||||||
|
| 8 | Code organization & conventions | ☑ | Consistent and clean; `CliConfig.DefaultFormat` is loaded but unused (covered by CLI-001). Minor: `--format` not validated (CLI-008). |
|
||||||
|
| 9 | Testing coverage | ☑ | No tests for `ManagementHttpClient`, `DebugCommands`, command-tree wiring, or JSON argument parsing (CLI-013). |
|
||||||
|
| 10 | Documentation & comments | ☑ | `Component-CLI.md` mismatch (CLI-007); the in-repo `README.md` is reasonably accurate. Minor exit-code doc mismatch (CLI-009). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### CLI-001 — `SCADALINK_FORMAT` env var and config-file format are dead; format precedence broken
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:18`, `src/ScadaLink.CLI/Commands/DebugCommands.cs:45`, `src/ScadaLink.CLI/CliConfig.cs:37-39` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CliConfig.Load()` reads `SCADALINK_FORMAT` and the `defaultFormat` config-file field into
|
||||||
|
`CliConfig.DefaultFormat`, and `Component-CLI.md` documents a format-precedence chain
|
||||||
|
(command-line option → env var → config file). However, every command resolves the format
|
||||||
|
with `var format = result.GetValue(formatOption) ?? "json";` and `formatOption` is created
|
||||||
|
in `Program.cs:11` with `DefaultValueFactory = _ => "json"`. `GetValue` therefore always
|
||||||
|
returns a non-null value ("json" when the flag is absent), so the `?? "json"` fallback never
|
||||||
|
fires and `config.DefaultFormat` is never consulted. The env var and config-file format
|
||||||
|
settings are dead code: `scadalink site list` always outputs JSON regardless of
|
||||||
|
`SCADALINK_FORMAT=table` or a `defaultFormat` entry in `~/.scadalink/config.json`. The
|
||||||
|
documented behaviour silently does not work.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either remove the `--format` option's `DefaultValueFactory` and have `CommandHelpers`
|
||||||
|
resolve precedence explicitly (`result.GetValue(formatOption)` → `config.DefaultFormat` → `"json"`),
|
||||||
|
or detect whether the option was explicitly supplied (`result.GetResult(formatOption)`) and
|
||||||
|
only then override the config value. Apply the same fix to `DebugCommands.BuildStream`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). Removed the `--format` option's
|
||||||
|
`DefaultValueFactory` in `Program.cs` and added `CommandHelpers.ResolveFormat`, which uses
|
||||||
|
`ParseResult.GetResult(formatOption)` to detect an explicitly supplied flag and resolves
|
||||||
|
precedence explicitly: explicit `--format` → `CliConfig.DefaultFormat` (env var / config
|
||||||
|
file) → `"json"`. Both `CommandHelpers.ExecuteCommandAsync` and `DebugCommands.BuildStream`
|
||||||
|
now call `ResolveFormat`. Regression tests added in `FormatResolutionTests`.
|
||||||
|
|
||||||
|
### CLI-002 — Empty success body crashes table rendering with an unhandled exception
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:59-68`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:78-80` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ManagementHttpClient.SendCommandAsync` returns `JsonData = responseBody` for any
|
||||||
|
success status code, including a 200/204 with an empty body. `HandleResponse` then tests
|
||||||
|
`response.JsonData != null` — an empty string is non-null — and for `--format table`
|
||||||
|
calls `WriteAsTable(response.JsonData)`, which immediately does `JsonDocument.Parse(json)`.
|
||||||
|
`JsonDocument.Parse("")` throws `JsonException`, which is not caught anywhere, so a
|
||||||
|
command that legitimately returns no body (e.g. a delete that returns 204) terminates with
|
||||||
|
a stack trace instead of a clean success message.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
In `HandleResponse`, treat a null-or-whitespace `JsonData` as a "command succeeded, no
|
||||||
|
output" case (print nothing or `(ok)`), and return 0 before attempting to parse.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-003 — Non-JSON success body crashes table rendering
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:80` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`WriteAsTable` calls `JsonDocument.Parse(json)` with no `try/catch`. If the server returns
|
||||||
|
a success status but a body that is not valid JSON (a proxy/HTML error page returned with
|
||||||
|
a 200, a plain-text message, etc.), the CLI throws an unhandled `JsonException`. The
|
||||||
|
error-path code in `ManagementHttpClient` (lines 52-61) already defensively wraps
|
||||||
|
`JsonDocument.Parse` in a `try/catch`; the success path and `WriteAsTable` do not get the
|
||||||
|
same treatment.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the `JsonDocument.Parse` in `WriteAsTable` in a `try/catch`; on failure, fall back to
|
||||||
|
printing the raw body verbatim (as the JSON path already does at line 66).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-004 — Malformed `--url` throws an unhandled `UriFormatException`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/ManagementHttpClient.cs:13` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `ManagementHttpClient` constructor does `new Uri(baseUrl.TrimEnd('/') + "/")` with no
|
||||||
|
validation. If the user passes a malformed URL (e.g. `--url localhost:9001` without a
|
||||||
|
scheme, or `--url ""`), `new Uri(...)` throws `UriFormatException`. This call is not
|
||||||
|
guarded by the `try/catch` in `SendCommandAsync` (the constructor is invoked at
|
||||||
|
`CommandHelpers.cs:50`), so a common typo terminates the CLI with a stack trace rather
|
||||||
|
than the documented "connection failure → exit 1 with a descriptive message".
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Validate the URL before constructing the client — e.g. `Uri.TryCreate(url, UriKind.Absolute, out _)` in `CommandHelpers.ExecuteCommandAsync` and `DebugCommands.BuildStream` — and emit a
|
||||||
|
clean `INVALID_URL` error with exit code 1 on failure.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-005 — Malformed `--bindings` / `--overrides` JSON throws unhandled exceptions
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/InstanceCommands.cs:55-58`, `src/ScadaLink.CLI/Commands/InstanceCommands.cs:181-182` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`set-bindings` deserializes the `--bindings` argument with
|
||||||
|
`JsonSerializer.Deserialize<List<List<JsonElement>>>(...)` and then indexes `p[0]`/`p[1]`
|
||||||
|
and calls `p[0].GetString()!` / `p[1].GetInt32()`. `set-overrides` deserializes `--overrides`
|
||||||
|
with `JsonSerializer.Deserialize<Dictionary<string, string?>>(...)`. None of this is wrapped
|
||||||
|
in a `try/catch`. Invalid JSON throws `JsonException`; a pair with fewer than two elements
|
||||||
|
throws `ArgumentOutOfRangeException`; a non-string/non-int element throws `InvalidOperationException`. All of these surface as raw stack traces, so a user typo in a JSON argument
|
||||||
|
crashes the CLI instead of producing a clean validation error and exit code 1.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the parsing in `try/catch (JsonException ...)` (and guard the pair length / element
|
||||||
|
kinds), and on failure call `OutputFormatter.WriteError(...)` with an `INVALID_ARGUMENT`
|
||||||
|
code and return 1.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-006 — Password is passed as a command-line argument with no safer alternative
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Program.cs:9`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:36-44` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Credentials are supplied only via `--username` / `--password`. A password on the command
|
||||||
|
line is visible to any local user via the process list (`ps`, `/proc/<pid>/cmdline`) and is
|
||||||
|
typically persisted into shell history. Unlike the management URL — which can also come
|
||||||
|
from `SCADALINK_MANAGEMENT_URL` or the config file — there is no environment-variable
|
||||||
|
fallback, no `--password-stdin`, and no interactive prompt for the password. For a tool
|
||||||
|
explicitly intended for CI/CD automation this materially increases the chance of credential
|
||||||
|
leakage.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `SCADALINK_PASSWORD` environment variable fallback and/or a `--password-stdin`
|
||||||
|
option (read the password from stdin), and document that `--password` on the command line
|
||||||
|
is discouraged. Optionally prompt interactively when stdin is a TTY and no password was
|
||||||
|
supplied.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-007 — `Component-CLI.md` command surface is substantially stale
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `docs/requirements/Component-CLI.md:51-211` (vs. all files under `src/ScadaLink.CLI/Commands/`) |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The "Command Structure" section of the design doc no longer matches the implemented CLI.
|
||||||
|
Examples of the drift:
|
||||||
|
|
||||||
|
- The doc keys most operations by **name** (`template get <name>`, `instance get <code>`,
|
||||||
|
`site get <site-id>`); the implementation keys everything by integer **ID** via `--id`
|
||||||
|
(`TemplateCommands.cs:40`, `InstanceCommands.cs:31`, `SiteCommands.cs:26`).
|
||||||
|
- The doc shows `template create ... --file <path>` and `site update <site-id> --file <path>`;
|
||||||
|
the implementation has no `--file` option anywhere and instead takes individual flags
|
||||||
|
(`TemplateCommands.cs:52-72`, `SiteCommands.cs:83-115`).
|
||||||
|
- The doc lists commands that do not exist (`template diff`, `instance bind-connections`,
|
||||||
|
`instance assign-area`, `template attribute add --tag-path`, `data-connection assign/unassign`,
|
||||||
|
`security api-key enable/disable` as separate commands) and omits commands that do exist
|
||||||
|
(`instance alarm-override set/delete/list`, `external-system method` subgroup).
|
||||||
|
- The doc's `notification smtp update --file` differs from the implemented
|
||||||
|
`--server/--port/--auth-mode/--from-address` flags (`NotificationCommands.cs:72-94`).
|
||||||
|
- The doc uses `--site` for site identification in several places where the implementation
|
||||||
|
uses `--site-id` or `--identifier`.
|
||||||
|
|
||||||
|
A reader following the design doc would be unable to drive the CLI.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Regenerate the "Command Structure" section of `Component-CLI.md` from the actual command
|
||||||
|
tree (the in-repo `src/ScadaLink.CLI/README.md` is much closer to reality and could be the
|
||||||
|
source), or mark the doc's command list as illustrative and point to the README as
|
||||||
|
authoritative.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-008 — `--format` value is not validated
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Program.cs:10-11`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:60` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `--format` option accepts any string. `HandleResponse` only checks
|
||||||
|
`string.Equals(format, "table", ...)`; any other value — including a typo like
|
||||||
|
`--format tabel` or `--format xml` — silently falls through to JSON output. The user gets
|
||||||
|
no feedback that their requested format was not honoured.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Restrict the option to the accepted values, e.g. `formatOption.AcceptOnlyFromAmong("json", "table")`, so `System.CommandLine` rejects invalid input with a clear parse error.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-009 — Exit-code documentation does not match `HandleResponse` behaviour
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `docs/requirements/Component-CLI.md:238-249`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:75` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc's Exit Codes table defines code 2 as "Authorization failure (insufficient
|
||||||
|
role)" and the Error Handling section says "If the server returns HTTP 403, the CLI exits
|
||||||
|
with code 2." `HandleResponse` implements `return response.StatusCode == 403 ? 2 : 1;`,
|
||||||
|
which is correct for the HTTP error path. However, the `NO_URL`, `NO_CREDENTIALS`,
|
||||||
|
`INVALID_OPERATION` (from `set-bindings`/`set-overrides`) and any other client-side failure
|
||||||
|
all return 1, and a connection failure carries `StatusCode == 0` — none of which the doc
|
||||||
|
enumerates. More importantly, an authorization failure that the server signals with a body
|
||||||
|
`code` of `UNAUTHORIZED` but an HTTP status other than 403 would be classified as a generic
|
||||||
|
error (exit 1). The mapping is purely status-driven and the doc does not state that.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either document precisely that exit code 2 is determined solely by HTTP 403, or key the
|
||||||
|
"authorization failure" exit code off the response `code` field as well. Align the doc
|
||||||
|
with whichever is chosen.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-010 — `debug stream` reports Ctrl+C during connect as a connection failure
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/DebugCommands.cs:181-189` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`StreamDebugAsync` calls `await connection.StartAsync(cts.Token)` inside a
|
||||||
|
`try { } catch (Exception ex)` that unconditionally reports
|
||||||
|
`"Connection failed: {ex.Message}"` with code `CONNECTION_FAILED` and returns 1. If the
|
||||||
|
user presses Ctrl+C while the connection is still being established, `cts` is cancelled and
|
||||||
|
`StartAsync` throws `OperationCanceledException`; this is caught by the generic handler and
|
||||||
|
misreported as a connection failure (with exit code 1) rather than a clean user-initiated
|
||||||
|
cancellation (exit code 0).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Catch `OperationCanceledException` separately (return 0 quietly) before the generic
|
||||||
|
`catch (Exception)` handler, mirroring how the `exitTcs.Task.WaitAsync(cts.Token)` path at
|
||||||
|
lines 209-215 already treats cancellation as graceful.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-011 — `CancellationTokenSource` in `debug stream` is never disposed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/DebugCommands.cs:89` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`var cts = new CancellationTokenSource();` is created in `StreamDebugAsync` but never
|
||||||
|
disposed; there is no `using` declaration and no explicit `Dispose()` call on any exit
|
||||||
|
path. `CancellationTokenSource` is `IDisposable` (it can hold a lazily created `WaitHandle` and a timer) and should be disposed. The impact is
|
||||||
|
small because the process exits shortly after, but it is an `IDisposable` left undisposed,
|
||||||
|
contrary to the review checklist's resource-management expectation.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Declare it as `using var cts = new CancellationTokenSource();` (or wrap the method body in
|
||||||
|
a `try/finally`).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-012 — `debug stream` exit code is unreliable after stream termination
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CLI/Commands/DebugCommands.cs:208-227` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
After `await exitTcs.Task.WaitAsync(cts.Token)`, the method returns
|
||||||
|
`exitTcs.Task.IsCompletedSuccessfully ? exitTcs.Task.Result : 0`. When the user cancels
|
||||||
|
with Ctrl+C, `WaitAsync` throws `OperationCanceledException` and `exitTcs` is typically
|
||||||
|
still incomplete, so the method returns 0 — correct. However, the `OnStreamTerminated`
|
||||||
|
handler and the `Closed` handler both call `exitTcs.TrySetResult`, and these run on
|
||||||
|
SignalR callback threads concurrently with the Ctrl+C path. If a stream termination and a
|
||||||
|
Ctrl+C race, the final exit code depends on which `TrySetResult` won and whether
|
||||||
|
`WaitAsync` observed completion before cancellation — the result is not deterministic. A
|
||||||
|
stream the server terminated abnormally can end up returning 0.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Resolve the exit code from a single authoritative source: after the `try/catch` around
|
||||||
|
`WaitAsync`, check `exitTcs.Task` completion explicitly and treat a Ctrl+C with no prior
|
||||||
|
result as 0, but always prefer a result that was set by `OnStreamTerminated`/`Closed`.
|
||||||
|
Consider awaiting `exitTcs.Task` without the cancellation token after a brief grace period.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CLI-013 — HTTP client, `debug stream`, and JSON-argument parsing are untested
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.CLI.Tests/` (vs. `src/ScadaLink.CLI/ManagementHttpClient.cs`, `src/ScadaLink.CLI/Commands/DebugCommands.cs`, `src/ScadaLink.CLI/Commands/InstanceCommands.cs:55-58`) |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test project covers `OutputFormatter`, `CliConfig.Load`, and
|
||||||
|
`CommandHelpers.HandleResponse`. It does not cover:
|
||||||
|
|
||||||
|
- `ManagementHttpClient.SendCommandAsync` — the timeout (504), connection-failure (code 0),
|
||||||
|
and error-body-parsing paths are untested.
|
||||||
|
- The `debug stream` SignalR command — no tests at all.
|
||||||
|
- The JSON-argument parsing in `InstanceCommands` (`set-bindings`, `set-overrides`) — the
|
||||||
|
paths most likely to crash on bad input (CLI-005) have no coverage.
|
||||||
|
- Command-tree wiring — there is no test asserting that each `Build` produces the expected
|
||||||
|
subcommands/options or that the command-name derivation
|
||||||
|
(`ManagementCommandRegistry.GetCommandName`) resolves for every command type the CLI
|
||||||
|
constructs.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for `ManagementHttpClient` (using a stub `HttpMessageHandler`), for the
|
||||||
|
JSON-argument parsing helpers (extracting the parsing into testable methods), and a
|
||||||
|
smoke test that walks the root command tree and asserts every leaf command's payload type
|
||||||
|
resolves via `ManagementCommandRegistry`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,686 @@
|
|||||||
|
# Code Review — CentralUI
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.CentralUI` |
|
||||||
|
| Design doc | `docs/requirements/Component-CentralUI.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 15 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The Central UI is a sizeable, generally well-structured Blazor Server module:
|
||||||
|
custom Bootstrap components only (no third-party UI frameworks, as required),
|
||||||
|
consistent list/form page patterns, careful disposal in most components, and a
|
||||||
|
thoughtful Roslyn-backed script editor. The most serious problem is the
|
||||||
|
**Test Run sandbox** (`ScriptAnalysisService.RunInSandboxAsync`): it compiles
|
||||||
|
and executes arbitrary user C# *in the central process* with no enforcement of
|
||||||
|
the documented script trust model — the forbidden-API list is only a Monaco
|
||||||
|
editor diagnostic, never applied before execution — so a Design user can run
|
||||||
|
`System.IO`/`Process`/`Reflection` code on the central node. Several other
|
||||||
|
themes recur: (1) per-circuit security drift — site-scoped Deployment claims
|
||||||
|
are written at login but never read, so site scoping is not enforced anywhere;
|
||||||
|
(2) Blazor render-thread and disposal hazards — background `Timer` / `Task.Delay`
|
||||||
|
callbacks and stream callbacks touch component state and `@ref` children that
|
||||||
|
may already be disposed; (3) process-global mutation (`Console.SetOut`) shared
|
||||||
|
across concurrent circuits; (4) drift from the design doc on session expiry and
|
||||||
|
on the "deployment status pushes via SignalR" claim (the page actually polls).
|
||||||
|
Testing coverage is thin for a module this large: only the script analyzer,
|
||||||
|
TreeView, schema model, and a few data-connection pages have unit tests; most
|
||||||
|
pages and the auth bridge are untested.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | DebugView cap logic, audit-log timezone, toast race — see findings. |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | Module is mostly UI; `DebugStreamService` actor usage reviewed (in Communication but driven from here). No actor-convention violations in CentralUI proper. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | `Console.SetOut` global mutation, stream/timer callbacks on non-render threads, toast `_ = Task.Delay`. |
|
||||||
|
| 4 | Error handling & resilience | ☑ | Broad `catch {}` swallowing, dangling `TaskCompletionSource` on dialog disposal. |
|
||||||
|
| 5 | Security | ☑ | Sandbox not enforcing trust model (Critical); site scoping never enforced; auth bridge reads stale HttpContext; logout CSRF. |
|
||||||
|
| 6 | Performance & resource management | ☑ | N+1 site-connection query, repeated `FilteredMessages` recomputation, full-page paginators rendering all page buttons. |
|
||||||
|
| 7 | Design-document adherence | ☑ | Session expiry diverges from "15-min sliding + 30-min idle"; Deployments polls despite "push via SignalR"; nav exposes Deployment-only pages to all roles. |
|
||||||
|
| 8 | Code organization & conventions | ☑ | Generally good; options classes absent (no appsettings binding here); no major violations. |
|
||||||
|
| 9 | Testing coverage | ☑ | Auth, sandbox-run, DebugView, Health, ParkedMessages, most pages untested. |
|
||||||
|
| 10 | Documentation & comments | ☑ | Comments are accurate and helpful; a few stale claims noted. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### CentralUI-001 — Test Run sandbox executes arbitrary C# with no trust-model enforcement
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Critical |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:171-424` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`RunInSandboxAsync` compiles user-supplied script code with `CSharpScript.Create`
|
||||||
|
and executes it (`script.RunAsync`) directly inside the central process. The
|
||||||
|
"sandbox" applies only a wall-clock timeout and an output-size cap. It does
|
||||||
|
**not** enforce the documented script trust model: the forbidden-API set
|
||||||
|
(`System.IO`, `System.Diagnostics`/`Process`, `System.Reflection`, `System.Net`,
|
||||||
|
threading) is checked only in `FindForbiddenApiUsages`, which feeds Monaco
|
||||||
|
editor diagnostics — it is never consulted before `RunInSandboxAsync` executes.
|
||||||
|
`DefaultOptions` references `typeof(object).Assembly` (the full BCL), so a
|
||||||
|
Design-role user can submit `System.IO.File.WriteAllText(...)`,
|
||||||
|
`System.Diagnostics.Process.Start(...)`, reflection, or raw socket code via
|
||||||
|
`POST /api/script-analysis/run` and it runs with the central host process's
|
||||||
|
full privileges. The endpoint is gated only by `RequireDesign`. This is a
|
||||||
|
remote code execution path on the central cluster node.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Before executing, run the same forbidden-API analysis used for diagnostics and
|
||||||
|
reject any script with a `SCADA001`/`SCADA002` marker (severity 8, i.e. Monaco `MarkerSeverity.Error`); additionally
|
||||||
|
restrict the compilation's metadata references to the curated script API
|
||||||
|
surface, and ideally execute in an isolated `AssemblyLoadContext`/process with
|
||||||
|
constrained permissions. Treat the trust model as an execution-time gate, not
|
||||||
|
an editor hint.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. A Roslyn semantic trust-model gate was added. `RunInSandboxAsync`
|
||||||
|
now calls `EnforceTrustModel` after compilation and before `script.RunAsync`; if the
|
||||||
|
script references any forbidden API the run is rejected (`SandboxErrorKind.CompileError`)
|
||||||
|
with the offending markers, and the same gate is applied to nested shared scripts in
|
||||||
|
`callSharedFunc`. `FindForbiddenApiUsages` was reworked so it resolves every identifier
|
||||||
|
(not just the leftmost) against the semantic model and checks types **and** members —
|
||||||
|
so a fully-qualified call such as `System.IO.File.WriteAllText(...)` is now caught, not
|
||||||
|
only `using`-directive or bare-type forms. This is a static semantic gate consistent
|
||||||
|
with the documented trust model; it is not a process sandbox — reflection-based
|
||||||
|
indirection remains out of its reach, and full isolation would require running scripts
|
||||||
|
in a separate constrained process (a larger change deliberately not taken here).
|
||||||
|
Regression tests `RunInSandbox_FullyQualifiedForbiddenApi_IsBlockedBeforeExecution`,
|
||||||
|
`RunInSandbox_ForbiddenUsingDirective_IsBlockedBeforeExecution` and
|
||||||
|
`Diagnose_FullyQualifiedForbiddenCall_RaisesSCADA002` fail against the pre-fix code and
|
||||||
|
pass after; `RunInSandbox_CleanScript_StillRuns` guards against over-blocking. Fixed by
|
||||||
|
the commit whose message references `CentralUI-001`.
|
||||||
|
|
||||||
|
### CentralUI-002 — Site-scoped Deployment permissions are issued but never enforced
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Auth/AuthEndpoints.cs:63-69`; `src/ScadaLink.CentralUI/Components/Pages/Deployment/*.razor` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Login adds `SiteId` claims (`JwtTokenService.SiteIdClaimType`) for non-system-wide
|
||||||
|
Deployment users, and the design doc (Component-CentralUI "Responsibilities" and
|
||||||
|
CLAUDE.md Security & Auth) requires the Deployment role to be site-scoped. A
|
||||||
|
repo-wide search shows the `SiteId` claim is written at login and **never read
|
||||||
|
anywhere in CentralUI**. Deployment pages — `DebugView.razor`, `Deployments.razor`,
|
||||||
|
`InstanceCreate.razor`, `InstanceConfigure.razor`, `Topology.razor`,
|
||||||
|
`ParkedMessages.razor`, `EventLogs.razor` — list and act on every site with no
|
||||||
|
filtering by the user's permitted sites. A Deployment user scoped to one site
|
||||||
|
can deploy to, debug, and manage instances at any site.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Enforce site scoping: filter site/instance lists by the user's `SiteId` claims
|
||||||
|
(treating the absence of `SiteId` claims as system-wide), and re-check the claim
|
||||||
|
server-side before any mutating cross-site command (deploy, enable/disable/delete,
|
||||||
|
debug stream, parked-message retry/discard). A shared helper that reads the
|
||||||
|
claims from `AuthenticationStateProvider` and exposes "permitted site ids" would
|
||||||
|
keep this consistent.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. Confirmed: the `SiteId` claim was written at login
|
||||||
|
(`AuthEndpoints`, `RoleMapper`) but never read by any CentralUI page — site
|
||||||
|
scoping was unenforced. Added a scoped `SiteScopeService` (`Auth/SiteScopeService.cs`)
|
||||||
|
that reads the current circuit's `SiteId` claims and exposes `IsSystemWideAsync`,
|
||||||
|
`PermittedSiteIdsAsync`, `FilterSitesAsync`, and `IsSiteAllowedAsync` (absence of
|
||||||
|
claims = system-wide, matching `SiteScopeAuthorizationHandler`). All seven
|
||||||
|
Deployment/Monitoring pages now consume it: `Topology`, `DebugView`,
|
||||||
|
`InstanceCreate`, `Deployments` filter their site/instance lists; `InstanceConfigure`
|
||||||
|
rejects direct navigation to an instance on a non-permitted site; `DebugView`,
|
||||||
|
`InstanceCreate`, and `ParkedMessages` re-check the claim server-side before any
|
||||||
|
mutating/streaming command. Regression tests: `SiteScopeServiceTests` (6 tests
|
||||||
|
pinning the helper logic) and `TopologyPageTests.SiteScoping_ScopedDeploymentUser_OnlySeesPermittedSites`
|
||||||
|
/ `SiteScoping_SystemWideDeploymentUser_SeesAllSites`. Fixed by the commit whose
|
||||||
|
message references `CentralUI-002`.
|
||||||
|
|
||||||
|
### CentralUI-003 — `Console.SetOut`/`SetError` mutates process-global state across concurrent circuits
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:359-423` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`RunInSandboxAsync` redirects `Console.Out`/`Console.Error` to a per-call
|
||||||
|
`StringWriter`, runs the script, then restores them in `finally`. `Console.Out`
|
||||||
|
is process-global. If two users (two Blazor circuits) run Test Run concurrently,
|
||||||
|
their captured outputs interleave or cross over, and the `finally` of whichever
|
||||||
|
finishes first restores `Console.Out` to the *original* writer while the other
|
||||||
|
run is still executing — so the second run's script output is lost or written
|
||||||
|
to the real console. `RunInSandboxAsync` is `async` and the script runs on a
|
||||||
|
thread-pool thread, so concurrent execution is fully expected.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Do not redirect process-global `Console`. Provide console capture through the
|
||||||
|
script globals surface (e.g. a `TextWriter` exposed on `SandboxScriptHost` that
|
||||||
|
the sandbox API writes to), or serialize Test Run executions with a semaphore if
|
||||||
|
global redirection must be kept. Capturing per-call without global mutation is
|
||||||
|
the correct fix.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. Confirmed: `RunInSandboxAsync` redirected the process-global
|
||||||
|
`Console.Out`/`Console.Error` per call and restored them in `finally`, so a
|
||||||
|
concurrent run's `finally` could restore the writer while another run was still
|
||||||
|
executing — the long run silently lost output (reproduced by the regression
|
||||||
|
test, 74 of 80 expected lines captured). Added `SandboxConsoleCapture`, a routing
|
||||||
|
`TextWriter` installed into `Console.Out`/`Console.Error` exactly once for the
|
||||||
|
process; each run pushes its own `StringWriter` onto an `AsyncLocal` capture
|
||||||
|
scope via `BeginCapture`, so writes are routed per logical call-tree with no
|
||||||
|
per-run mutation of global `Console` state. `RunInSandboxAsync` now opens the
|
||||||
|
scope with `using` declarations instead of calling `Console.SetOut`. Regression
|
||||||
|
tests `RunInSandbox_CapturesConsoleOutput` and
|
||||||
|
`RunInSandbox_ConcurrentRuns_DoNotCrossContaminateConsoleOutput` fail against the
|
||||||
|
pre-fix code and pass after. Fixed by the commit whose message references
|
||||||
|
`CentralUI-003`.
|
||||||
|
|
||||||
|
### CentralUI-004 — `CookieAuthenticationStateProvider` reads `HttpContext` for the life of the circuit
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Auth/CookieAuthenticationStateProvider.cs:22-28` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetAuthenticationStateAsync` returns `_httpContextAccessor.HttpContext?.User`.
|
||||||
|
In Blazor Server, `HttpContext` is only valid during the initial HTTP request
|
||||||
|
that establishes the circuit; for the lifetime of the long-lived SignalR circuit
|
||||||
|
`IHttpContextAccessor.HttpContext` is `null` (or, worse, a stale/foreign context
|
||||||
|
if the accessor's `AsyncLocal` leaks). Any later call to
|
||||||
|
`GetAuthenticationStateAsync` — e.g. an `<AuthorizeView>` re-evaluating, or pages
|
||||||
|
that call it directly (`Sites.razor`, `Templates.razor`) — then sees an
|
||||||
|
unauthenticated principal and may render the wrong UI, or returns a stale
|
||||||
|
identity that never reflects role changes. The class derives from
|
||||||
|
`ServerAuthenticationStateProvider`, which is designed to be seeded once via
|
||||||
|
`SetAuthenticationState`; overriding `GetAuthenticationStateAsync` to read
|
||||||
|
`HttpContext` defeats that design.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Capture the authenticated principal once when the circuit is created (e.g. via
|
||||||
|
the root component / `AuthenticationStateProvider` seeding pattern used by the
|
||||||
|
Blazor Web App template) and store it on the scoped provider, instead of reading
|
||||||
|
`IHttpContextAccessor` on every call. Do not depend on `HttpContext` after the
|
||||||
|
circuit is established.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. Confirmed: `GetAuthenticationStateAsync` read
|
||||||
|
`_httpContextAccessor.HttpContext?.User` on every call; the provider is
|
||||||
|
registered `Scoped`, so it is constructed within the initial HTTP request's DI
|
||||||
|
scope while `HttpContext` is still valid, but every later call (an
|
||||||
|
`<AuthorizeView>` re-evaluating, or a page calling it directly) over the
|
||||||
|
long-lived SignalR circuit saw `HttpContext == null` and returned an anonymous
|
||||||
|
principal. The provider now snapshots the principal once in the constructor into
|
||||||
|
a cached `Task<AuthenticationState>` and serves that for the life of the
|
||||||
|
circuit, never touching `IHttpContextAccessor` again. Regression tests
|
||||||
|
`CookieAuthenticationStateProviderTests.GetAuthenticationStateAsync_StillReturnsUser_AfterHttpContextIsGone`
|
||||||
|
and `..._IsStableAcrossCalls_IgnoringStaleForeignContext` fail against the
|
||||||
|
pre-fix code (they would see an anonymous / foreign principal) and pass after.
|
||||||
|
Fixed by the commit whose message references `CentralUI-004`.
|
||||||
|
|
||||||
|
### CentralUI-005 — Session expiry implementation diverges from the documented policy
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Auth/AuthEndpoints.cs:47-81`; `src/ScadaLink.CentralUI/Components/Shared/SessionExpiry.razor:18-30` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
CLAUDE.md (Security & Auth) specifies "15-minute expiry with sliding refresh,
|
||||||
|
30-minute idle timeout." `AuthEndpoints` instead sets a single fixed
|
||||||
|
`expires_at = UtcNow + 30 minutes` claim and a 30-minute cookie `ExpiresUtc`,
|
||||||
|
with no sliding refresh and no separate idle vs absolute timeout.
|
||||||
|
`SessionExpiry.razor` schedules a single hard redirect at that fixed time. The
|
||||||
|
result is a hard 30-minute cap with no sliding renewal — an active user is
|
||||||
|
logged out mid-session, and there is no 15-minute component at all.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either implement the documented policy (sliding 15-minute token with refresh on
|
||||||
|
activity, plus a 30-minute idle cutoff) or update the design docs to match the
|
||||||
|
fixed 30-minute model. The code and the documented decision must agree.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-006 — Deployment status page polls every 10s despite the documented SignalR-push design
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Pages/Deployment/Deployments.razor:196-216` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Component-CentralUI "Real-Time Updates" states: "Deployment status:
|
||||||
|
Pending/in-progress/success/failed transitions push to the UI immediately via
|
||||||
|
SignalR (built into Blazor Server). No polling required for deployment
|
||||||
|
tracking." `Deployments.razor` instead runs a `Timer` that reloads all
|
||||||
|
deployment records and instance names from the database every 10 seconds. This
|
||||||
|
is a full N-record + instance-map reload per tick for every open circuit, and
|
||||||
|
contradicts the design. It also re-issues two repository round-trips on each
|
||||||
|
tick regardless of whether anything changed.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Implement push-based updates (an injected event/observable raised by the
|
||||||
|
Deployment Manager that the page subscribes to and renders via
|
||||||
|
`InvokeAsync(StateHasChanged)`), or amend the design doc to acknowledge polling.
|
||||||
|
If polling is kept as a fallback, fetch only changed/in-progress records.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-007 — Monitoring nav links to Deployment-only pages are shown to all roles
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Layout/NavMenu.razor:69-78`; `src/ScadaLink.CentralUI/Components/Pages/Monitoring/EventLogs.razor:2`; `src/ScadaLink.CentralUI/Components/Pages/Monitoring/ParkedMessages.razor:2` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`NavMenu` renders the "Event Logs" and "Parked Messages" links inside the
|
||||||
|
all-authenticated-users Monitoring section. The design doc classifies both the
|
||||||
|
Site Event Log Viewer and Parked Message Management as **Deployment Role**.
|
||||||
|
Two inconsistencies result: (a) an Admin- or Design-only user sees nav links
|
||||||
|
they are not meant to use; (b) the pages themselves are annotated only `[Authorize]`
|
||||||
|
(any authenticated user), not `[Authorize(Policy = RequireDeployment)]`, so a
|
||||||
|
non-Deployment user who follows the link is *not* blocked — they can query site
|
||||||
|
event logs and retry/discard parked messages. The authorization attribute and
|
||||||
|
the nav visibility both contradict the design.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add `[Authorize(Policy = AuthorizationPolicies.RequireDeployment)]` to
|
||||||
|
`EventLogs.razor` and `ParkedMessages.razor`, and move their nav links into a
|
||||||
|
`<AuthorizeView Policy="RequireDeployment">` block (consistent with the Topology
|
||||||
|
/ Deployments / Debug View links). Confirm Health Dashboard is intentionally
|
||||||
|
all-roles (it is, per the design).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-008 — Audit-log date filters treat browser-local datetimes as UTC
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Pages/Monitoring/AuditLog.razor:242-243` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `From`/`To` filters bind `<input type="datetime-local">` to `DateTime?`
|
||||||
|
fields. A `datetime-local` input yields the value the user typed in their
|
||||||
|
*browser-local* time zone. `FetchPage` converts them with
|
||||||
|
`new DateTimeOffset(_filterFrom.Value, TimeSpan.Zero)` — i.e. it labels the
|
||||||
|
local wall-clock value as UTC. For any non-UTC user the audit query window is
|
||||||
|
shifted by their UTC offset, silently returning the wrong rows. CLAUDE.md
|
||||||
|
mandates UTC throughout, but that requires converting the local input *to* UTC,
|
||||||
|
not relabelling it.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Convert the picked local time to UTC before querying — capture the browser
|
||||||
|
offset (JS interop) and apply it, or document the inputs as UTC and label them
|
||||||
|
in the UI. The same issue should be checked in `EventLogs.razor` if it has
|
||||||
|
time-range filters.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-009 — `DebugView` stream callbacks touch a possibly-disposed `ToastNotification`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Pages/Deployment/DebugView.razor:400-409,538-544` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `onTerminated` callback passed to `DebugStreamService.StartStreamAsync`
|
||||||
|
captures `_toast` and `this` and runs on an Akka/gRPC thread. If the user
|
||||||
|
navigates away, `Dispose()` calls `StopStream`, but a stream-termination event
|
||||||
|
already in flight can still invoke `onTerminated`, which calls
|
||||||
|
`_toast.ShowError(...)` and `StateHasChanged()` on a disposed component. The
|
||||||
|
component does not guard callbacks with a disposed flag or a
|
||||||
|
`CancellationTokenSource`. The same applies to the `onEvent` callbacks at
|
||||||
|
lines 391-398 that call `InvokeAsync(StateHasChanged)`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Track a `_disposed` flag or a `CancellationTokenSource` on the component, check it at the
|
||||||
|
top of every stream callback, and stop the stream synchronously before marking
|
||||||
|
disposed. `InvokeAsync` after disposal throws `ObjectDisposedException`; the
|
||||||
|
callbacks should no-op once disposed.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-010 — `ToastNotification` auto-dismiss continuation runs after component disposal
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Shared/ToastNotification.razor:62-71,90` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AddToast` schedules `Task.Delay(dismissMs).ContinueWith(...)` with the result
|
||||||
|
discarded (`_ =`). The continuation calls `InvokeAsync(StateHasChanged)`. If the
|
||||||
|
host page is disposed before the 5-second delay elapses (common — navigate away
|
||||||
|
right after an action), the continuation runs against a disposed component and
|
||||||
|
`InvokeAsync` throws `ObjectDisposedException` on a thread-pool thread with no
|
||||||
|
catch, producing an unobserved task exception. `Dispose()` is an empty body and
|
||||||
|
cancels nothing.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Hold a `CancellationTokenSource`, pass its token to `Task.Delay`, cancel it in
|
||||||
|
`Dispose()`, and guard the continuation. Alternatively wrap the continuation
|
||||||
|
body in a try/catch for `ObjectDisposedException`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-011 — `DiffDialog` leaves a dangling `TaskCompletionSource` when disposed while open
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Shared/DiffDialog.razor:89-95,151-157` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`OpenAsync` creates `_tcs` and returns `_tcs.Task` to the caller, which
|
||||||
|
typically `await`s it. The task is completed only by `Close()`. If the user
|
||||||
|
navigates away while the dialog is open, `DisposeAsync` runs but never completes
|
||||||
|
`_tcs`, so the awaiting caller's continuation never resumes — a permanently
|
||||||
|
suspended `Task` (and any `using`/cleanup after the await is skipped). The
|
||||||
|
`IDialogService` `ConfirmAsync`/`PromptAsync` path has the same shape but at least its host
|
||||||
|
is a single long-lived `DialogHost`; `DiffDialog` is per-page.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
In `DisposeAsync`, call `_tcs?.TrySetResult(false)` (or `TrySetCanceled`) so any
|
||||||
|
awaiter completes deterministically.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-012 — N+1 query loading data connections for the Sites page
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Pages/Admin/Sites.razor:196-205` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LoadDataAsync` fetches all sites, then issues
|
||||||
|
`SiteRepository.GetDataConnectionsBySiteIdAsync(site.Id)` once per site in a
|
||||||
|
loop. With N sites this is N+1 database round-trips on every page load and every
|
||||||
|
post-delete refresh. The connection lists are only used for a small per-card
|
||||||
|
summary.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a repository method that returns all data connections (or connections for a
|
||||||
|
set of site ids) in one query and group them in memory on the server, or project the small
|
||||||
|
summary in a single query.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-013 — `ScriptAnalysisService` blocks on async shared-script lookups
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:951-952` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ResolveCalledShape` calls `_sharedScripts.GetShapesAsync().GetAwaiter().GetResult()`
|
||||||
|
to resolve a shared-script shape synchronously. `GetShapesAsync` ultimately hits
|
||||||
|
`SharedScriptService` and its EF Core repository. Sync-over-async on a request
|
||||||
|
thread risks thread-pool starvation under load and can deadlock if any awaited
|
||||||
|
continuation needs a captured context. `Hover` and `SignatureHelp` (which call
|
||||||
|
`ResolveCalledShape`) are themselves synchronous methods, so the blocking call
|
||||||
|
is structural.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Make `Hover` and `SignatureHelp` async and `await` `GetShapesAsync`, or have the
|
||||||
|
catalog expose a cached synchronous snapshot that is refreshed asynchronously.
|
||||||
|
The `IMemoryCache` is already present — caching the shapes there and reading
|
||||||
|
them synchronously would remove the blocking call.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-014 — Test Run side effects (HTTP/SQL/SMTP) fire against production services
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:254-259`; `src/ScadaLink.CentralUI/ScriptAnalysis/SandboxHostHelpers.cs:26-117` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
By design (documented in the XML comments) Test Run wires `ExternalSystem`,
|
||||||
|
`Database`, and `Notify` to central's *real* `IExternalSystemClient`,
|
||||||
|
`IDatabaseGateway`, and `INotificationDeliveryService`, so a Test Run that calls
|
||||||
|
`Notify.To(...).Send(...)` actually emails recipients, `Database.Connection(...)`
|
||||||
|
opens a real DB connection, and `ExternalSystem.Call(...)` makes real HTTP calls —
|
||||||
|
with production-equivalent side effects. There is no dry-run mode, no
|
||||||
|
confirmation, and (combined with CentralUI-001) no restriction on what a script
|
||||||
|
can do. A Design user testing a draft script can dispatch real notifications or
|
||||||
|
mutate external databases. The behaviour is intentional but the blast radius is
|
||||||
|
not surfaced to the user.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
At minimum, surface a clear warning in the Test Run UI that side effects are
|
||||||
|
real, and require explicit opt-in for side-effecting calls. Preferably offer a
|
||||||
|
dry-run mode that stubs the helpers, defaulting to dry-run.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-015 — `DialogService` continuations resolve off the render thread
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/ServiceCollectionExtensions.cs:24`; `src/ScadaLink.CentralUI/Components/Shared/DialogService.cs:18-69` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DialogService` is `AddScoped` (one per circuit, correct) but
|
||||||
|
`ConfirmAsync`/`PromptAsync` complete via `ContinueWith(..., TaskScheduler.Default)`,
|
||||||
|
so a caller awaiting them resumes on a thread-pool thread. Any subsequent
|
||||||
|
component state mutation by the caller is then off the render thread unless the
|
||||||
|
caller wraps it in `InvokeAsync`. Call sites are not consistently doing so,
|
||||||
|
which can produce non-deterministic render glitches.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either resolve continuations on the circuit's sync context or document that
|
||||||
|
callers must `InvokeAsync` after awaiting `ConfirmAsync`/`PromptAsync`. Audit
|
||||||
|
call sites for off-thread state mutation.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-016 — Pagers render one button per page with no windowing
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Shared/DataTable.razor:62-68`; `src/ScadaLink.CentralUI/Components/Pages/Deployment/Deployments.razor:167-173` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `DataTable` and `Deployments` paginators loop `for i = 1..totalPages` and
|
||||||
|
emit a `<li>` button for every page. With a few thousand records at page size 25
|
||||||
|
that is hundreds of buttons rendered into the diff on every state change. It is
|
||||||
|
not a correctness bug but degrades render performance and usability on large
|
||||||
|
datasets.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Window the pager (first / prev / a few around current / next / last) or switch
|
||||||
|
large lists to a "load more" / numeric jump input.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-017 — `/auth/logout` POST disables antiforgery, enabling logout CSRF
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Auth/AuthEndpoints.cs:127-138` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `POST /auth/logout` endpoint calls `.DisableAntiforgery()`, and a plain
|
||||||
|
`GET /logout` endpoint also signs the user out. Either can be triggered
|
||||||
|
cross-site (an `<img src="/logout">` or an auto-submitting form) to forcibly log
|
||||||
|
a user out. Login itself reasonably disables antiforgery (pre-auth), but logout
|
||||||
|
is a state-changing authenticated action and should be CSRF-protected.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Require an antiforgery token on `POST /auth/logout` (the `NavMenu` sign-out form
|
||||||
|
can include the antiforgery token), and remove or protect the state-changing
|
||||||
|
`GET /logout` route.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-018 — Broad `catch {}` blocks swallow JS interop and storage errors silently
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.CentralUI/Components/Shared/MonacoEditor.razor:116-118,123,142,164,170,176,182,189`; `src/ScadaLink.CentralUI/Components/Shared/TreeView.razor:129,139`; `src/ScadaLink.CentralUI/Components/Pages/Admin/Sites.razor:316-319` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Numerous `try { ... } catch { }` blocks swallow every exception with no logging.
|
||||||
|
The prerender-time JS-unavailable case is legitimate, but these catches also
|
||||||
|
hide real failures: a genuine Monaco init failure, or a clipboard permission
|
||||||
|
error becomes invisible. In `TreeView.razor` the storage-restore
|
||||||
|
`JsonSerializer.Deserialize` (line 139) is not inside a try at all and would
|
||||||
|
throw uncaught on a corrupt `treeviewStorage` payload. Debugging UI issues in
|
||||||
|
production is then guesswork.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Catch the specific expected exception type (e.g. `JSDisconnectedException`,
|
||||||
|
`InvalidOperationException` during prerender) and log anything else via
|
||||||
|
`ILogger`. Wrap the TreeView storage `Deserialize` in its own guarded block.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### CentralUI-019 — Sparse unit-test coverage for a large module; critical paths untested
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.CentralUI.Tests/` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The module has ~65 source files but unit tests cover only the script analyzer,
|
||||||
|
TreeView, schema model, and two data-connection pages. Untested critical paths
|
||||||
|
include: the auth bridge (`CookieAuthenticationStateProvider`,
|
||||||
|
`AuthEndpoints`), `RunInSandboxAsync` (timeout, recursion limit, error
|
||||||
|
classification, side-effect wiring), `DialogService` resolution semantics,
|
||||||
|
`DebugView` stream lifecycle and the `UpsertWithCap` cap logic, `Health` and
|
||||||
|
`Deployments` timer behaviour, and `SchemaBuilderModel` round-tripping of nested
|
||||||
|
schemas. Given findings CentralUI-001/003/009/010 sit on untested code, the gap
|
||||||
|
is material. The Playwright suite covers login and navigation only.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add bUnit/unit tests for the auth bridge, sandbox-run behaviour (including
|
||||||
|
forbidden-API rejection once CentralUI-001 is fixed), dialog resolution, and the
|
||||||
|
DebugView cap/lifecycle logic. Prioritise the paths named in the Critical/High
|
||||||
|
findings.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,355 @@
|
|||||||
|
# Code Review — ClusterInfrastructure
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.ClusterInfrastructure` |
|
||||||
|
| Design doc | `docs/requirements/Component-ClusterInfrastructure.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 8 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The ClusterInfrastructure module is currently a **Phase 0 skeleton**. It contains
|
||||||
|
only two source files: `ClusterOptions.cs`, a plain options POCO, and
|
||||||
|
`ServiceCollectionExtensions.cs`, whose two registration methods are explicit no-ops.
|
||||||
|
None of the responsibilities described in `Component-ClusterInfrastructure.md` —
|
||||||
|
Akka.NET cluster bootstrap, leader election, failover detection, split-brain
|
||||||
|
resolution, cluster singleton hosting, Windows service lifecycle — are implemented.
|
||||||
|
There are therefore no correctness, concurrency, or Akka-convention defects to find
|
||||||
|
in *behaviour*, because there is no behaviour. The findings below instead concern
|
||||||
|
(a) the large gap between the design doc and the code, (b) the options class missing
|
||||||
|
the validation, configuration-binding affordances, and coverage of documented
|
||||||
|
settings that peer modules provide, and (c) the no-op DI extensions silently
|
||||||
|
returning success, which is a latent reliability hazard once the Host wires this
|
||||||
|
module in. The dominant theme is **incompleteness**: this module is the foundation
|
||||||
|
every other component runs on, yet it presently delivers nothing the design requires.
|
||||||
|
The single options class is clean and its test covers defaults and setters
|
||||||
|
adequately for what exists.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ✓ | No executable logic exists beyond an options POCO; no logic bugs, but `ServiceCollectionExtensions` returns success while doing nothing (CI-002). |
|
||||||
|
| 2 | Akka.NET conventions | ✓ | No actors, no `ActorSystem` bootstrap, no supervision, no cluster/singleton wiring exist despite the design doc requiring all of them (CI-001). Nothing to assess against `Tell`/`Ask`, immutability, or `PipeTo`. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | No shared mutable state, no actors, no async code. No issues found in current code. |
|
||||||
|
| 4 | Error handling & resilience | ✓ | Failover, split-brain, dual-node recovery, and graceful-shutdown logic are entirely absent (CI-001). No exception paths to review in current code. |
|
||||||
|
| 5 | Security | ✓ | No authn/authz surface in this module. Akka remoting is unconfigured, so transport security cannot be assessed; flagged as part of the missing implementation (CI-001). No secret handling present. |
|
||||||
|
| 6 | Performance & resource management | ✓ | No streams, connections, timers, or `IDisposable` resources exist yet. No issues found in current code. |
|
||||||
|
| 7 | Design-document adherence | ✓ | Severe drift: the module implements none of its documented responsibilities (CI-001). `ClusterOptions` also omits remoting host/port, cluster role/site identifier, gRPC port, storage paths, and `down-if-alone` (CI-003). |
|
||||||
|
| 8 | Code organization & conventions | ✓ | Options class is correctly owned by the component project. Missing config-section-name constant (CI-005) and missing `IValidateOptions`/data-annotation validation (CI-004) versus the Options pattern intent. |
|
||||||
|
| 9 | Testing coverage | ✓ | `ClusterOptionsTests` covers defaults and setters. No tests for any cluster behaviour because none exists; the test project references nothing else (CI-006). |
|
||||||
|
| 10 | Documentation & comments | ✓ | `ClusterOptions` has no XML doc comments unlike peer options classes (CI-007). The "Phase 0 skeleton" placeholders are undocumented at the module level — no README or tracking note (CI-008). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### ClusterInfrastructure-001 — Module implements none of its documented responsibilities
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:9`, `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:16` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`Component-ClusterInfrastructure.md` assigns this module seven concrete
|
||||||
|
responsibilities: bootstrap the Akka.NET `ActorSystem`, form the two-node cluster,
|
||||||
|
manage leader election / active-standby role assignment, detect node failures and
|
||||||
|
trigger failover, provide remoting, host the cluster singleton, and manage the
|
||||||
|
Windows service lifecycle. The entire module is two files: a `ClusterOptions` POCO
|
||||||
|
and a `ServiceCollectionExtensions` whose methods are explicitly commented
|
||||||
|
`// Phase 0: skeleton only` and `// Phase 0: placeholder for Akka actor registration`
|
||||||
|
and simply return the unmodified `IServiceCollection`. There is no `Akka.Cluster`,
|
||||||
|
`Akka.Cluster.Tools`, `Akka.Remote`, or split-brain-resolver dependency in the
|
||||||
|
`.csproj` at all (it references only `Microsoft.Extensions.DependencyInjection.Abstractions`,
|
||||||
|
`Microsoft.Extensions.Options`, and `ScadaLink.Commons`). Because every other
|
||||||
|
ScadaLink component runs inside the actor system this module is responsible for
|
||||||
|
creating, the absence of any implementation blocks the foundational layer of the
|
||||||
|
system.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Track the gap explicitly (a milestone/issue) and implement the documented behaviour:
|
||||||
|
add the Akka cluster/remote/cluster-tools and split-brain-resolver package
|
||||||
|
references, build the cluster bootstrap (HOCON generation from `ClusterOptions`),
|
||||||
|
the split-brain resolver configuration, cluster-singleton hosting support, and
|
||||||
|
`CoordinatedShutdown` wiring. Until then, the module's `Status` and the design doc
|
||||||
|
should clearly state it is unimplemented so callers do not assume otherwise.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Re-triaged 2026-05-16 — remains Open, needs a design decision from the user._
|
||||||
|
|
||||||
|
Verified against the source at the reviewed commit: the finding's factual claims hold.
|
||||||
|
`src/ScadaLink.ClusterInfrastructure` still contains only `ClusterOptions.cs` and a
|
||||||
|
no-op `ServiceCollectionExtensions.cs`, and the `.csproj` references no Akka packages.
|
||||||
|
|
||||||
|
However, the documented cluster behaviour is **not actually absent from the system** —
|
||||||
|
it has been implemented in the **Host** project rather than in this module:
|
||||||
|
|
||||||
|
- `src/ScadaLink.Host/Actors/AkkaHostedService.cs` bootstraps the `ActorSystem`,
|
||||||
|
generates the HOCON from `ClusterOptions` (it imports `ScadaLink.ClusterInfrastructure`
|
||||||
|
and injects `IOptions<ClusterOptions>`), and configures the `keep-oldest` split-brain
|
||||||
|
resolver with `down-if-alone = on` (see `AkkaHostedService.cs:95-96`).
|
||||||
|
- `src/ScadaLink.Host/Health/AkkaClusterHealthCheck.cs`, `AkkaClusterNodeProvider.cs`,
|
||||||
|
and `Health/ActiveNodeHealthCheck.cs` cover cluster membership / active-node detection.
|
||||||
|
- Akka cluster/remote package references live in `ScadaLink.Host.csproj` and the
|
||||||
|
per-component projects (`SiteRuntime`, `Communication`, etc.).
|
||||||
|
|
||||||
|
So the real situation is an **ownership / design-doc drift**, not missing behaviour:
|
||||||
|
`Component-ClusterInfrastructure.md` assigns the Akka bootstrap, HOCON generation,
|
||||||
|
split-brain config and `CoordinatedShutdown` wiring to this module, but the
|
||||||
|
implementation deliberately lives in the Host. `ClusterOptions` is the one piece this
|
||||||
|
module legitimately owns and it is consumed correctly by the Host.
|
||||||
|
|
||||||
|
Resolving CI-001 as literally written is **not a small, well-scoped fix** — it is one
|
||||||
|
of two substantial decisions, both requiring the user's input:
|
||||||
|
|
||||||
|
1. **Move the bootstrap into this module** — relocate the HOCON generation, split-brain
|
||||||
|
config, cluster-singleton helpers and `CoordinatedShutdown` wiring out of
|
||||||
|
`ScadaLink.Host` into `ScadaLink.ClusterInfrastructure`, add the Akka package
|
||||||
|
references, and re-wire the Host to call into it. This is a cross-module refactor
|
||||||
|
touching `src/ScadaLink.Host/*` and several other projects — outside the edit scope
|
||||||
|
permitted for this finding (only `src/ScadaLink.ClusterInfrastructure/`,
|
||||||
|
`tests/ScadaLink.ClusterInfrastructure.Tests/`, and this file may be edited).
|
||||||
|
2. **Accept the current placement** — keep the bootstrap in the Host and update
|
||||||
|
`Component-ClusterInfrastructure.md` (and the README component table) to record that
|
||||||
|
the Host owns the actor-system/cluster bootstrap and that this module's role is the
|
||||||
|
shared `ClusterOptions` contract. That fix is a design-doc edit, also outside this
|
||||||
|
module's permitted edit scope.
|
||||||
|
|
||||||
|
Either path is a deliberate architecture decision, not a bug fix, so per
|
||||||
|
REVIEW-PROCESS.md §2 this finding is left **Open** and surfaced for the user to decide.
|
||||||
|
No code change was made. Module test suite verified green (3 passed) at re-triage time.
|
||||||
|
|
||||||
|
### ClusterInfrastructure-002 — No-op DI extension methods report success while doing nothing
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:7-17` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AddClusterInfrastructure` and `AddClusterInfrastructureActors` both accept an
|
||||||
|
`IServiceCollection` and return it unchanged. A caller (e.g. the Host) that invokes
|
||||||
|
`services.AddClusterInfrastructure()` receives a fluent, success-looking result but
|
||||||
|
no actor system, no cluster, and no singleton support is actually registered. This
|
||||||
|
is a silent failure: the system will appear to start, then fail later and far from
|
||||||
|
the cause (e.g. when a component resolves an `ActorSystem` that was never added, or
|
||||||
|
when the cluster singleton never forms). A no-op that masquerades as a completed
|
||||||
|
registration is worse than an unimplemented method that throws.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Until the real implementation exists, make the placeholder loud rather than silent —
|
||||||
|
either throw `NotImplementedException` from the methods, or have them log a
|
||||||
|
prominent warning, so an integrating caller fails fast with a clear cause. Replace
|
||||||
|
with the genuine registration when CI-001 is addressed.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ClusterInfrastructure-003 — ClusterOptions omits several documented node-configuration settings
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3-11` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The "Node Configuration", "Split-Brain Resolution", and "Failure Detection Timing"
|
||||||
|
sections of the design doc enumerate the settings each node needs. `ClusterOptions`
|
||||||
|
exposes `SeedNodes`, `SplitBrainResolverStrategy`, `StableAfter`,
|
||||||
|
`HeartbeatInterval`, `FailureDetectionThreshold`, and `MinNrOfMembers`, but is
|
||||||
|
missing: the Akka remoting hostname/port (default 8081 central, 8082 site), the
|
||||||
|
cluster role (Central vs. Site) and the site identifier, the `down-if-alone` flag
|
||||||
|
(the design explicitly requires `down-if-alone = on` for the keep-oldest resolver),
|
||||||
|
and — for site nodes — the gRPC port (default 8083) and local SQLite storage paths.
|
||||||
|
Without these, the options class cannot drive a correct HOCON configuration when
|
||||||
|
CI-001 is implemented. (Some settings such as remoting host/port may instead belong
|
||||||
|
in `Host/NodeOptions.cs`; the split of ownership should be decided deliberately, but
|
||||||
|
at minimum `down-if-alone` belongs with the split-brain settings here.)
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add the missing settings — at minimum a `DownIfAlone` boolean (default `true`) and
|
||||||
|
the cluster role / site identifier — or document explicitly which settings are
|
||||||
|
owned by `Host/NodeOptions.cs` instead, so the design doc and the options classes
|
||||||
|
agree on where each value lives.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ClusterInfrastructure-004 — ClusterOptions has no validation despite safety-critical values
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3-11` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ClusterOptions` carries values whose misconfiguration has cluster-wide
|
||||||
|
consequences. The design doc is emphatic that `min-nr-of-members` must be `1` (a
|
||||||
|
value of `2` blocks the singleton and therefore all data collection indefinitely
|
||||||
|
after failover), that `SplitBrainResolverStrategy` must be `keep-oldest` for a
|
||||||
|
two-node cluster (quorum strategies cause total shutdown), and that the timing
|
||||||
|
values are interdependent (`HeartbeatInterval` must be well below
|
||||||
|
`FailureDetectionThreshold`). The class has no data annotations, no
|
||||||
|
`IValidateOptions<ClusterOptions>`, and no guard logic, so an `appsettings.json`
|
||||||
|
setting `MinNrOfMembers: 2` or `SplitBrainResolverStrategy: "keep-majority"` (the
|
||||||
|
exact value the test at `ClusterOptionsTests.cs:35` shows is settable) would be
|
||||||
|
accepted silently and produce the catastrophic outcomes the design doc warns
|
||||||
|
against.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add validation — data annotations (`[Range]` for `MinNrOfMembers`, etc.) plus an
|
||||||
|
`IValidateOptions<ClusterOptions>` implementation that enforces
|
||||||
|
`MinNrOfMembers == 1`, restricts `SplitBrainResolverStrategy` to a known set,
|
||||||
|
requires `SeedNodes` non-empty, and asserts `HeartbeatInterval <
|
||||||
|
FailureDetectionThreshold` and positive `StableAfter`. Register it with
|
||||||
|
`ValidateOnStart()` so misconfiguration fails fast at boot.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ClusterInfrastructure-005 — No configuration section name constant for the Options pattern binding
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
CLAUDE.md specifies per-component configuration via `appsettings.json` sections
|
||||||
|
bound with the Options pattern. `ClusterOptions` provides no `public const string
|
||||||
|
SectionName` (or equivalent) for the binding site to reference, so whichever code
|
||||||
|
binds the section must hard-code the magic string, and there is no single source of
|
||||||
|
truth for the section name. Because `AddClusterInfrastructure` is itself a no-op
|
||||||
|
(CI-002), the options class is currently bound nowhere at all, making the missing
|
||||||
|
constant easy to overlook.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `public const string SectionName = "Cluster";` (or the agreed name) to
|
||||||
|
`ClusterOptions` and have the eventual `AddClusterInfrastructure` bind
|
||||||
|
`configuration.GetSection(ClusterOptions.SectionName)` against it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ClusterInfrastructure-006 — No tests for any cluster behaviour; only the options POCO is covered
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs:1-51` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test project contains only `ClusterOptionsTests`, exercising default values and
|
||||||
|
property setters of `ClusterOptions`. There are no tests for cluster formation,
|
||||||
|
leader election, failover detection, split-brain resolution, singleton handover, or
|
||||||
|
the `ServiceCollectionExtensions` registration methods — none can exist because the
|
||||||
|
behaviour itself is absent (CI-001). This is recorded so the testing gap is tracked
|
||||||
|
alongside the implementation gap: the most safety-critical paths of the entire
|
||||||
|
system (failover, split-brain, dual-node recovery) are completely untested. The
|
||||||
|
test at lines 30-50 also asserts that `SplitBrainResolverStrategy` can be set to
|
||||||
|
`"keep-majority"`, implicitly endorsing a value the design doc forbids for a
|
||||||
|
two-node cluster — see CI-004.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
When CI-001 is implemented, add multi-node `Akka.Cluster.TestKit` /
|
||||||
|
`MultiNodeTestKit` tests covering cluster formation, failover promotion,
|
||||||
|
split-brain downing, and singleton handover, plus unit tests for HOCON generation
|
||||||
|
from `ClusterOptions` and for the options validation from CI-004.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ClusterInfrastructure-007 — ClusterOptions lacks XML documentation comments
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3-11` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ClusterOptions` and each of its six properties have no XML doc comments. Peer
|
||||||
|
options classes such as `StoreAndForward/StoreAndForwardOptions.cs` document the
|
||||||
|
class and every property (including units and design-doc references). For a class
|
||||||
|
whose values carry the cluster-wide consequences described in the design doc
|
||||||
|
(notably `MinNrOfMembers` and `SplitBrainResolverStrategy`), the absence of inline
|
||||||
|
documentation is a maintainability and safety gap — a future editor has no in-code
|
||||||
|
warning that `MinNrOfMembers` must stay `1`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add `<summary>` comments to the class and each property, stating units and the
|
||||||
|
documented constraints (e.g. that `MinNrOfMembers` must be `1`, that
|
||||||
|
`HeartbeatInterval` must be well below `FailureDetectionThreshold`), referencing
|
||||||
|
the relevant design-doc sections as peer modules do.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ClusterInfrastructure-008 — "Phase 0 skeleton" status is undocumented at the module level
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:9`, `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:16` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The only indication that this foundational module is unimplemented is two inline
|
||||||
|
comments inside private method bodies (`// Phase 0: skeleton only` /
|
||||||
|
`// Phase 0: placeholder for Akka actor registration`). There is no module README,
|
||||||
|
no `<!-- TODO -->` in the design doc, and no tracking marker visible to anyone
|
||||||
|
reading the project structure or the component table. Given that the design doc
|
||||||
|
(`Component-ClusterInfrastructure.md`) describes a fully featured component with no
|
||||||
|
caveat, a reader will reasonably assume the module is built. The mismatch between a
|
||||||
|
complete-looking design doc and an empty implementation is itself a documentation
|
||||||
|
defect.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a short note to the design doc (or a module-level `README.md`) stating the
|
||||||
|
current implementation status and what "Phase 0" delivers, and reference a tracked
|
||||||
|
issue for the remaining work (CI-001). Keep the README component table accurate
|
||||||
|
about which components are skeletons versus implemented.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,448 @@
|
|||||||
|
# Code Review — Commons
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.Commons` |
|
||||||
|
| Design doc | `docs/requirements/Component-Commons.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 12 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Commons is in good overall health. It is a well-organized, dependency-light library:
|
||||||
|
the architectural-constraint tests enforce the no-Akka/no-EF/no-ASP.NET rule, the
|
||||||
|
POCO-entity and message-as-record conventions, and the UTC timestamp rule. The folder
|
||||||
|
and namespace hierarchy closely matches REQ-COM-5b. No Critical issues were found.
|
||||||
|
|
||||||
|
The findings cluster around three themes. First, a handful of files quietly stretch
|
||||||
|
the REQ-COM-6 "no business logic" boundary — `StaleTagMonitor`, `OpcUaEndpointConfigSerializer`,
|
||||||
|
`OpcUaEndpointConfigValidator`, `ScriptParameters`, `ValueFormatter`, `DynamicJsonElement`
|
||||||
|
and `ScriptArgs` all carry non-trivial behavior, and a couple have real correctness or
|
||||||
|
concurrency defects (the `StaleTagMonitor` stale-fire race, the `DynamicJsonElement`
|
||||||
|
`JsonDocument`-lifetime hazard, the silent conversion-failure swallowing in
|
||||||
|
`ScriptParameters.GetNullable`). Second, the `ManagementCommandRegistry` name mapping is
|
||||||
|
asymmetric and namespace-scoped in a way that does not match the broader set of
|
||||||
|
`*Command` records elsewhere in `Messages/`. Third, several behavior-bearing types
|
||||||
|
(`ValueFormatter`, `DynamicJsonElement`, `ScriptArgs`, `ManagementCommandRegistry`,
|
||||||
|
`Result<T>`, the OPC UA serializer round-trip) have no unit tests despite containing the
|
||||||
|
kind of edge-case logic that warrants them. Entity and message contracts otherwise look
|
||||||
|
clean and additive-evolution-friendly, with the exception of one `ValueTuple` use in a
|
||||||
|
wire command.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ✓ | `DynamicJsonElement.TryConvert` returns success for non-convertible types; `Result<T>` allows null error; legacy-config fallback loses data. |
|
||||||
|
| 2 | Akka.NET conventions | ✓ | Commons has no actors (correct). Message contracts are records and immutable. One wire message uses `ValueTuple` (Commons-008). Correlation IDs present on request/response messages. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | `StaleTagMonitor` has a check-then-act race between the timer callback and `OnValueReceived` (Commons-001). |
|
||||||
|
| 4 | Error handling & resilience | ✓ | `ScriptParameters.GetNullable` silently swallows conversion failures (Commons-003); OPC UA legacy deserialize discards malformed input (Commons-005). |
|
||||||
|
| 5 | Security | ✓ | No auth logic here. `SmtpConfiguration.Credentials` / OPC UA passwords are plain-string fields (storage/encryption is a consumer concern) — noted, not a finding. No script-trust violations: Commons defines no forbidden-API surface. |
|
||||||
|
| 6 | Performance & resource management | ✓ | `StaleTagMonitor` disposes its `Timer` correctly. `DynamicJsonElement` references a `JsonElement` whose backing document lifetime is not owned (Commons-002). |
|
||||||
|
| 7 | Design-document adherence | ✓ | Several behavior-bearing helper/validator/serializer classes push against REQ-COM-6 "no business logic" (Commons-007). Folder layout matches REQ-COM-5b. |
|
||||||
|
| 8 | Code organization & conventions | ✓ | `ManagementCommandRegistry` naming is asymmetric/namespace-scoped (Commons-004). `DeployedConfigSnapshot`, `InstanceAlarmOverride`, `TemplateFolder`, `ISiteRepository`, several service interfaces and `Messages/Management` exist but are not listed in Component-Commons.md (Commons-009). |
|
||||||
|
| 9 | Testing coverage | ✓ | `ValueFormatter`, `DynamicJsonElement`, `ScriptArgs`, `ManagementCommandRegistry`, `Result<T>`, `ConfigurationDiff`, `AlarmContext`, and the OPC UA serializer round-trip have no tests (Commons-010). |
|
||||||
|
| 10 | Documentation & comments | ✓ | `OpcUaEndpointConfigSerializer.Deserialize` XML doc does not mention the silent data-loss path (Commons-005). `Component-Commons.md` is stale relative to the actual file set (Commons-009). `ValueFormatter` uses current-culture formatting without documenting it (Commons-012). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### Commons-001 — `StaleTagMonitor` stale-fire race between timer and `OnValueReceived`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/StaleTagMonitor.cs:42-46`, `:62-67` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`OnValueReceived` sets `_staleFired = false` then calls `_timer.Change(...)`, while the
|
||||||
|
timer callback `OnTimerElapsed` reads `_staleFired`, sets it to `true`, and invokes the
|
||||||
|
`Stale` event. `_staleFired` is `volatile`, which guarantees visibility but not
|
||||||
|
atomicity of the check-then-set. The two methods run on different threads (a
|
||||||
|
value-arrival thread and a `ThreadPool` timer thread). If the timer callback has already
|
||||||
|
passed the `if (_staleFired) return;` check when `OnValueReceived` runs, `Stale` fires
|
||||||
|
even though a fresh value just arrived — a spurious staleness signal. There is also a
|
||||||
|
window where `OnValueReceived` resets `_staleFired` and reschedules the timer while a
|
||||||
|
callback for the previous period is mid-flight, so `Stale` can fire once per period as
|
||||||
|
documented but at the wrong moment. For a heartbeat monitor feeding connection-health
|
||||||
|
decisions, a false stale signal can trigger an unnecessary reconnect.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Guard the state transition with a lock, or replace the `_staleFired` bool with an
|
||||||
|
`Interlocked.CompareExchange` on an `int` so only one of "fire" / "reset" wins. The
|
||||||
|
callback should atomically test-and-set; `OnValueReceived` should atomically reset and
|
||||||
|
only then reschedule the timer.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-002 — `DynamicJsonElement` retains a `JsonElement` whose `JsonDocument` lifetime it does not own
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/DynamicJsonElement.cs:10-17` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DynamicJsonElement` stores a `JsonElement` and exposes it for deferred, dynamic access
|
||||||
|
from scripts. A `JsonElement` is only valid while the `JsonDocument` that produced it has
|
||||||
|
not been disposed; accessing a `JsonElement` after its document is disposed throws
|
||||||
|
`ObjectDisposedException`. Nothing in `DynamicJsonElement` keeps the document alive or
|
||||||
|
documents that the caller must. Because the wrapper is explicitly designed for
|
||||||
|
"convenient property access in scripts" — i.e. access at an arbitrary later time — a
|
||||||
|
caller that wraps an element from a `using var doc = JsonDocument.Parse(...)` block (the
|
||||||
|
exact pattern used in `OpcUaEndpointConfigSerializer`) will hand scripts a wrapper that
|
||||||
|
faults on first member access.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either clone the element on construction with `JsonElement.Clone()` (which detaches it
|
||||||
|
from the document and makes it safe to retain), or hold a reference to the owning
|
||||||
|
`JsonDocument` and implement `IDisposable`. Document the lifetime contract on the type
|
||||||
|
regardless.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-003 — `ScriptParameters.GetNullable` silently swallows conversion failures
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/ScriptParameters.cs:72-86` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetNullable<T>` catches `ScriptParameterException` from `ConvertScalar` and returns
|
||||||
|
`default!` (null) "on conversion failure for nullable". This conflates two distinct
|
||||||
|
cases: a parameter that is genuinely absent/null, and a parameter that is *present but
|
||||||
|
holds an unconvertible value* (e.g. `Get<int?>("count")` when `count` is the string
|
||||||
|
`"banana"`). The latter is almost always a script or caller bug, and silently mapping it
|
||||||
|
to `null` hides it — the script then proceeds with a null it interprets as "not
|
||||||
|
supplied". The non-nullable `Get<T>` and the array/list paths correctly throw with a
|
||||||
|
descriptive message for the same bad input, so the behavior is also inconsistent across
|
||||||
|
the API surface. The XML doc states "returns null if missing, null, or unconvertible",
|
||||||
|
so the behavior is intentional, but it remains a footgun.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Distinguish "absent/null" from "present but unconvertible": return null only for the
|
||||||
|
former and throw `ScriptParameterException` for the latter, mirroring the array/list
|
||||||
|
element handling. If the swallowing must stay for compatibility, at minimum surface it
|
||||||
|
(e.g. an out-of-band warning) rather than failing silently.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-004 — `ManagementCommandRegistry` name mapping is asymmetric and namespace-scoped
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Messages/Management/ManagementCommandRegistry.cs:14-35` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`BuildRegistry` registers only types in the exact `ScadaLink.Commons.Messages.Management`
|
||||||
|
namespace whose names end in `Command`. `GetCommandName(Type)`, however, strips a
|
||||||
|
`Command` suffix from *any* type passed to it. The two halves disagree:
|
||||||
|
|
||||||
|
- `GetCommandName` will happily compute a command name for `*Command` records that live
|
||||||
|
in other `Messages/` sub-namespaces (`DeployInstanceCommand` in `Messages.Deployment`,
|
||||||
|
`DisableInstanceCommand` in `Messages.Lifecycle`, `SetStaticAttributeCommand` in
|
||||||
|
`Messages.Instance`, `DeployArtifactsCommand` in `Messages.Artifacts`, etc.), yet
|
||||||
|
`Resolve` will return `null` for every one of those names because they were never
|
||||||
|
registered.
|
||||||
|
- Because of this gap the Management namespace carries deliberately renamed duplicates
|
||||||
|
(`MgmtDeployInstanceCommand`, `MgmtEnableInstanceCommand`, `MgmtDisableInstanceCommand`,
|
||||||
|
`MgmtDeleteInstanceCommand` in `InstanceCommands.cs`) whose `Mgmt` prefix exists only
|
||||||
|
to dodge a collision the registry's namespace filter already prevents — a confusing,
|
||||||
|
undocumented coupling.
|
||||||
|
|
||||||
|
A round-trip `Resolve(GetCommandName(t))` is therefore not guaranteed to return `t`,
|
||||||
|
which is the implicit contract of a name registry.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Make the two methods symmetric: either scan all of `Messages/` (and detect/throw on
|
||||||
|
duplicate stripped names, since `ToFrozenDictionary` will throw on a collision) or
|
||||||
|
restrict `GetCommandName` to types the registry actually contains. Document the chosen
|
||||||
|
scope, and reconsider whether the `Mgmt*` prefixed duplicates are still needed.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-005 — `OpcUaEndpointConfigSerializer.Deserialize` discards malformed legacy input and over-reports `IsLegacy`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Serialization/OpcUaEndpointConfigSerializer.cs:25-51` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
When the typed-deserialize path fails or the JSON lacks `endpointUrl`, `Deserialize`
|
||||||
|
falls through to `LoadLegacy`. If `LoadLegacy` itself throws `JsonException` (genuinely
|
||||||
|
malformed JSON), the method returns `(new OpcUaEndpointConfig(), IsLegacy: true)` — a
|
||||||
|
default, empty config with the legacy flag set. The original stored string is silently
|
||||||
|
discarded, and the caller is told it is a recoverable "legacy" row when in fact the data
|
||||||
|
was unparseable. A form built on the documented `IsLegacy` contract ("prompt the user to
|
||||||
|
re-save") will present an empty config as if it were the user's saved configuration,
|
||||||
|
inviting them to overwrite real (if malformed) data with blanks. The XML doc only
|
||||||
|
describes the happy legacy path and does not mention this data-loss branch.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Distinguish "parsed as legacy" from "could not parse at all" — e.g. return a third state
|
||||||
|
or throw for genuinely malformed input so the caller can surface an error instead of an
|
||||||
|
empty form. Update the XML doc to describe the failure branch.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-006 — `DynamicJsonElement.TryConvert` reports success for unconvertible target types
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/DynamicJsonElement.cs:47-51`, `:66-76` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`TryConvert` does `result = ConvertTo(binder.Type); return result != null || binder.Type == typeof(object);`.
|
||||||
|
`ConvertTo` returns `null` for any type/kind pair it does not handle (e.g. requesting
|
||||||
|
`int` from a JSON string, or `DateTime` from anything). For a non-`object` target this
|
||||||
|
yields `result == null` and `return false`, which is correct. But the `|| binder.Type == typeof(object)`
|
||||||
|
clause makes `(object)dynamicElement` succeed with a `null` result even when the wrapped
|
||||||
|
element is, say, a JSON object or a non-null string — the cast silently produces `null`
|
||||||
|
instead of the element or its value. Any script doing `object o = jsonThing;` gets `null`
|
||||||
|
for a present value. The conversion of a present, non-null JSON value should never yield
|
||||||
|
`null`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
For the `object` target, return the element itself (or `Wrap(_element)`) rather than
|
||||||
|
`null`. Only return `null` when the wrapped element is genuinely `JsonValueKind.Null`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-007 — Several Commons types carry non-trivial logic, stretching REQ-COM-6
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/ScriptParameters.cs`, `src/ScadaLink.Commons/Serialization/OpcUaEndpointConfigSerializer.cs`, `src/ScadaLink.Commons/Validators/OpcUaEndpointConfigValidator.cs`, `src/ScadaLink.Commons/Types/StaleTagMonitor.cs`, `src/ScadaLink.Commons/Types/ScriptArgs.cs` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
REQ-COM-6 states Commons "must contain only data structures, interfaces, enums, and
|
||||||
|
constants" and "must not contain any business logic", with method bodies "limited to
|
||||||
|
trivial data-access logic". Several files exceed that: `ScriptParameters` performs typed
|
||||||
|
conversion with reflection and JSON-element unwrapping; `OpcUaEndpointConfigSerializer`
|
||||||
|
implements a multi-shape (typed + legacy flat-dict) serialization strategy;
|
||||||
|
`OpcUaEndpointConfigValidator` encodes OPC UA domain rules (e.g. `LifetimeCount` ≥ 3×
|
||||||
|
`KeepAliveCount`); `StaleTagMonitor` runs a `Timer` and raises events; `ScriptArgs`
|
||||||
|
reflects over arbitrary objects. The `ArchitecturalConstraintTests` "no service/actor"
|
||||||
|
heuristic only counts public methods (> 3) and so does not catch these. This is design
|
||||||
|
drift, not a defect — but it should be a deliberate decision: either move these helpers
|
||||||
|
into the components that own the behavior (Data Connection Layer, Site Runtime,
|
||||||
|
Template Engine) or amend Component-Commons.md to explicitly permit "pure stateless
|
||||||
|
helpers/validators".
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide and document the policy. If these are intentionally allowed in Commons, add a
|
||||||
|
sentence to REQ-COM-6 carving out pure validators/serializers/parsers; otherwise relocate
|
||||||
|
them. Tighten the architectural test if the rule is meant to be enforced.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-008 — `SetConnectionBindingsCommand` uses `ValueTuple` in a wire message contract
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Messages/Management/InstanceCommands.cs:10` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SetConnectionBindingsCommand` declares
|
||||||
|
`IReadOnlyList<(string AttributeName, int DataConnectionId)> Bindings`. The tuple element
|
||||||
|
names are compile-time-only; `System.Text.Json` ignores fields by default, so a `ValueTuple`
|
||||||
|
serializes as `{}` (or as `Item1` / `Item2` when `IncludeFields` is enabled), and the message is positional with no room for additive evolution (you cannot
|
||||||
|
add a third field without changing the tuple type, which REQ-COM-5a forbids). Every other
|
||||||
|
message in `Messages/` uses named records. A management command travels over the
|
||||||
|
ClusterClient boundary and is exactly the kind of contract REQ-COM-5a's additive-only
|
||||||
|
rule targets.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Replace the tuple with a small named record, e.g.
|
||||||
|
`record ConnectionBinding(string AttributeName, int DataConnectionId)`, and use
|
||||||
|
`IReadOnlyList<ConnectionBinding>`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-009 — `Component-Commons.md` is stale relative to the actual file set
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `docs/requirements/Component-Commons.md:61-198` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc's entity list, repository list, and folder tree no longer match the code:
|
||||||
|
|
||||||
|
- Entities present but undocumented: `DeployedConfigSnapshot`, `InstanceAlarmOverride`,
|
||||||
|
`TemplateFolder`.
|
||||||
|
- Repository interface present but undocumented: `ISiteRepository` (the doc lists seven
|
||||||
|
repositories under REQ-COM-4; the code has eight).
|
||||||
|
- Service interfaces present but undocumented: `IDatabaseGateway`,
|
||||||
|
`IExternalSystemClient`, `IInstanceLocator`, `INotificationDeliveryService` — REQ-COM-4a
|
||||||
|
documents only `IAuditService`.
|
||||||
|
- Whole namespaces absent from the REQ-COM-5b folder tree: `Messages/Management`,
|
||||||
|
`Messages/DataConnection`, `Messages/Integration`, `Messages/Instance`,
|
||||||
|
`Messages/RemoteQuery`, plus `Types/DataConnections`, `Types/Scripts`, `Serialization/`,
|
||||||
|
and `Validators/`.
|
||||||
|
|
||||||
|
CLAUDE.md's editing rules require the design docs to stay in sync with the code; the doc
|
||||||
|
is now a partial map.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Refresh Component-Commons.md to enumerate the current entities, repository and service
|
||||||
|
interfaces, and the actual `Types/`, `Messages/`, `Serialization/`, and `Validators/`
|
||||||
|
folders.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-010 — Behavior-bearing Commons types have no unit tests
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.Commons.Tests/` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ScadaLink.Commons.Tests` covers `Result`, `RetryPolicy`, `ScriptParameters`,
|
||||||
|
`StaleTagMonitor`, the OPC UA validator, enums, message conventions, compatibility, and
|
||||||
|
entity conventions. It does not cover several types that contain exactly the kind of
|
||||||
|
edge-case logic that warrants tests:
|
||||||
|
|
||||||
|
- `ValueFormatter` — scalar vs collection vs null formatting.
|
||||||
|
- `DynamicJsonElement` — member/index access, conversions, the issues in Commons-002 and
|
||||||
|
Commons-006 would have been caught by tests.
|
||||||
|
- `ScriptArgs.Normalize` — dictionary/anonymous-object/primitive-rejection paths.
|
||||||
|
- `ManagementCommandRegistry` — `Resolve` / `GetCommandName` round-trip (would have
|
||||||
|
surfaced Commons-004).
|
||||||
|
- `Result<T>` — `Match`, failure/success accessors, error-on-misuse (not exercised by the existing `Result` tests).
|
||||||
|
- `OpcUaEndpointConfigSerializer` typed↔flat round-trip and legacy fallback.
|
||||||
|
- `ConfigurationDiff` / `AlarmContext` / `ScriptScope` — minor, but `HasChanges` /
|
||||||
|
`HasParent` logic is untested.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add focused unit tests for the helper/utility types above, prioritizing
|
||||||
|
`DynamicJsonElement`, `ScriptArgs`, `ManagementCommandRegistry`, and the OPC UA serializer
|
||||||
|
round-trip.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-011 — `Result<T>.Failure` accepts a null error string
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/Result.cs:15-20`, `:30-32`, `:36` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`Result<T>.Failure(string error)` and the private failure constructor do not validate
|
||||||
|
`error`. A caller passing `null` produces a failed `Result` whose `Error` getter returns
|
||||||
|
`null` via `_error!`, and whose `Match` calls `onFailure(_error!)` with `null`. `Result`
|
||||||
|
is the system-wide error-handling type ("consistent error handling across component
|
||||||
|
boundaries"); a failed result with no error message defeats its purpose and pushes a
|
||||||
|
`NullReferenceException` risk onto every consumer that logs or displays `Error`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Throw `ArgumentNullException` (or `ArgumentException` for empty/whitespace) in
|
||||||
|
`Failure`/the failure constructor so a failed `Result` always carries a message.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Commons-012 — `ValueFormatter` uses current-culture formatting without documenting it
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/ValueFormatter.cs:20-27` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`FormatDisplayValue` formats `IFormattable` values (and collection elements) with the
|
||||||
|
parameterless `ToString()`, which uses the current thread culture. The XML doc calls this
|
||||||
|
"the value's natural string representation" without noting the culture dependency. The
|
||||||
|
same numeric or `DateTime` attribute value will render differently depending on the
|
||||||
|
server/UI locale — e.g. decimal separators, date order. CLAUDE.md mandates UTC for
|
||||||
|
timestamps and notes local-time conversion is "a UI display concern only"; if
|
||||||
|
`ValueFormatter` is used outside a UI rendering context (e.g. logging, event-log entries,
|
||||||
|
diff display) the culture-dependent output is inconsistent and a latent bug.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide whether `ValueFormatter` is a UI-only helper. If it can be used outside the UI,
|
||||||
|
format with `CultureInfo.InvariantCulture` (using the `IFormattable.ToString(null, IFormatProvider)`
|
||||||
|
overload). Either way, document the culture behavior on the method.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,432 @@
|
|||||||
|
# Code Review — Communication
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.Communication` |
|
||||||
|
| Design doc | `docs/requirements/Component-Communication.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 8 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The Communication module is generally well-structured and matches the design doc's
|
||||||
|
two-transport model (ClusterClient for command/control, gRPC server-streaming for
|
||||||
|
real-time data). The actors keep mutable state on the actor thread, use `PipeTo` for
|
||||||
|
async work, and the gRPC server/client lifecycle is mostly disciplined. However the
|
||||||
|
review found several High and Medium issues clustered around two themes:
|
||||||
|
**(a) gRPC subscription bookkeeping races** — `SiteStreamGrpcClient` overwrites and
|
||||||
|
removes subscription entries by correlation ID without disposal or ownership checks,
|
||||||
|
so reconnect cycles leak `CancellationTokenSource`es and can cancel the wrong stream;
|
||||||
|
and **(b) missing supervision strategy** on the coordinator actors, contrary to the
|
||||||
|
CLAUDE.md "Resume for coordinator actors" decision. Design-doc adherence is otherwise
|
||||||
|
good. Test coverage is broad for happy paths but has gaps around failover, cache
|
||||||
|
mutation races, and the snapshot-timeout cleanup path.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ✓ | Snapshot-timeout orphan, reconnect not calling `CleanupGrpc`, subscription-map races. |
|
||||||
|
| 2 | Akka.NET conventions | ✓ | No supervision strategy on coordinators; `Sender` captured in async-launched closure path. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | `SiteStreamGrpcClient._subscriptions` overwrite/remove race; `_siteClients` field reassignment unused but non-readonly. |
|
||||||
|
| 4 | Error handling & resilience | ✓ | gRPC reconnect leaks server-side relay; `LoadSiteAddressesFromDb` swallows DB failures silently. |
|
||||||
|
| 5 | Security | ✓ | No findings in module code. DebugStreamHub auth lives outside this module (Central UI). |
|
||||||
|
| 6 | Performance & resource management | ✓ | Orphaned subscriptions/CTS leaks; `SiteStreamGrpcClientFactory.Dispose` blocks on async. |
|
||||||
|
| 7 | Design-document adherence | ✓ | `GrpcMaxStreamLifetime` / keepalive options defined but never applied; hard-coded values used instead. |
|
||||||
|
| 8 | Code organization & conventions | ✓ | Options pattern correct; minor: public records declared in actor files. No structural issues. |
|
||||||
|
| 9 | Testing coverage | ✓ | No tests for snapshot-timeout cleanup, address-cache refresh races, or gRPC server reconnect-leak. |
|
||||||
|
| 10 | Documentation & comments | ✓ | XML comment on `DebugStreamBridgeActor` says "Persistent actor" — it is not an Akka.Persistence actor. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### Communication-001 — Early stream termination escapes StartStreamAsync's narrow exception handling
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.Communication/DebugStreamService.cs:130-143` |
|
||||||
|
|
||||||
|
**Re-triaged 2026-05-16:** originally filed Critical, claiming an orphaned bridge actor
|
||||||
|
and a multi-minute site-side resource leak on every snapshot timeout. On verification
|
||||||
|
that impact does **not** occur: `DebugStreamBridgeActor` calls `CleanupGrpc()` and
|
||||||
|
`Context.Stop(Self)` on every path that invokes `onTerminated` (site disconnect, gRPC
|
||||||
|
max-retries, `ReceiveTimeout`), so it always self-terminates and releases its gRPC
|
||||||
|
subscription; and the pure-timeout path does reach `StopStream`, which also stops it.
|
||||||
|
The genuine defect described below is an error-handling gap, not a leak — severity
|
||||||
|
corrected to Medium.
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`StartStreamAsync` awaits the initial snapshot inside a `try` whose only handler is
|
||||||
|
`catch (OperationCanceledException)`. When the stream terminates before the snapshot
|
||||||
|
arrives, `onTerminatedWrapper` completes the await via
|
||||||
|
`snapshotTcs.TrySetException(new InvalidOperationException(...))`. That
|
||||||
|
`InvalidOperationException` is not an `OperationCanceledException`, so it escapes the
|
||||||
|
catch entirely: the caller (Blazor debug view / SignalR hub) receives a raw,
|
||||||
|
untranslated exception, and `StartStreamAsync` performs no teardown of its own on that
|
||||||
|
path — it relies implicitly on the bridge actor self-terminating. Cleanup from the
|
||||||
|
service side is therefore not deterministic, and the failure surfaced to the caller is
|
||||||
|
not a meaningful, documented result.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
In `StartStreamAsync`, catch any exception from the snapshot await, deterministically
|
||||||
|
tear down the bridge actor (`Tell(StopDebugStream)` via the local actor reference, since
|
||||||
|
a racing `onTerminatedWrapper` may already have removed the session entry), and translate
|
||||||
|
the failure into a meaningful exception for the caller.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. The `catch (OperationCanceledException)`-only block in
|
||||||
|
`StartStreamAsync` was replaced with `catch (Exception)`: it removes the session entry,
|
||||||
|
sends `StopDebugStream` to the bridge actor via the local reference (idempotent — the
|
||||||
|
actor may already be stopping itself), and throws a descriptive exception —
|
||||||
|
`TimeoutException` for the 30s timeout, otherwise an `InvalidOperationException` that
|
||||||
|
names the instance/site and wraps the underlying cause. Regression test
|
||||||
|
`DebugStreamServiceTests.StartStreamAsync_StreamTerminatesBeforeSnapshot_ThrowsMeaningfulException`
|
||||||
|
fails against the pre-fix code and passes after. Fixed by the commit whose message
|
||||||
|
references `Communication-001`.
|
||||||
|
|
||||||
|
### Communication-002 — gRPC reconnect does not unsubscribe the previous stream, leaking site-side relay actors
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:170`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:143` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
On a gRPC stream error, `HandleGrpcError` increments the retry count, flips
|
||||||
|
`_useNodeA`, and schedules `OpenGrpcStream`. `OpenGrpcStream` cancels and disposes
|
||||||
|
`_grpcCts` and starts a fresh `SubscribeInstance` call — but it never calls
|
||||||
|
`client.Unsubscribe(_correlationId)` on the *old* node's client, and the site-side
|
||||||
|
`SiteStreamGrpcServer` keys active streams by `correlation_id` only. Because the new
|
||||||
|
subscription goes to the *other* node (`_useNodeA` flipped), the old node's
|
||||||
|
`SiteStreamGrpcServer` still has an active stream + `StreamRelayActor` +
|
||||||
|
`SiteStreamManager` subscription for that correlation ID. The old node only learns the
|
||||||
|
client is gone via TCP RST or keepalive — exactly the failure mode that triggered the
|
||||||
|
reconnect (network partition / silent node), so detection may take ~25s or never happen. Each
|
||||||
|
reconnect can therefore leave a zombie relay actor on the failed node. `CleanupGrpc`
|
||||||
|
(which *does* call `Unsubscribe`) is only invoked on terminal paths, not between
|
||||||
|
reconnect attempts.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Before reconnecting in `HandleGrpcError` / at the top of `OpenGrpcStream`, call
|
||||||
|
`Unsubscribe(_correlationId)` on the client for the *previous* endpoint (the one that
|
||||||
|
just failed) so the local CTS is cancelled and — where the channel is still alive —
|
||||||
|
the gRPC cancellation reaches the site and stops the relay actor.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source:
|
||||||
|
`HandleGrpcError` flipped `_useNodeA` and scheduled `OpenGrpcStream` without ever
|
||||||
|
unsubscribing the failed stream, leaving the old node's `StreamRelayActor` zombie until
|
||||||
|
TCP/keepalive timeout. Fix: `HandleGrpcError` now resolves the client for the
|
||||||
|
*previous* endpoint (before flipping `_useNodeA`) and calls `Unsubscribe(_correlationId)`
|
||||||
|
on it, so the local CTS is cancelled and gRPC cancellation reaches the still-alive site.
|
||||||
|
Regression test `DebugStreamBridgeActorTests.On_GrpcError_Unsubscribes_Old_Stream_Before_Reconnect`
|
||||||
|
fails against the pre-fix code and passes after.
|
||||||
|
|
||||||
|
### Communication-003 — SiteStreamGrpcClient subscription map overwritten without disposal; reconnect can cancel the wrong stream
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:77`, `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:106` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SubscribeAsync` does `_subscriptions[correlationId] = cts;` (line 77),
|
||||||
|
unconditionally overwriting any existing entry for that correlation ID without
|
||||||
|
cancelling or disposing the previous `CancellationTokenSource`. The `finally` block
|
||||||
|
then does `_subscriptions.TryRemove(correlationId, out _)` (line 106) which removes
|
||||||
|
the entry **by key only, regardless of which CTS is stored**. Because
|
||||||
|
`DebugStreamBridgeActor` reuses the same `_correlationId` across reconnect attempts
|
||||||
|
(and `SiteStreamGrpcClientFactory` returns the same `SiteStreamGrpcClient` for a site
|
||||||
|
even after a node flip), two `SubscribeAsync` calls can briefly share a correlation
|
||||||
|
ID. The first call's `finally` then removes the *second* call's CTS entry, so a later
|
||||||
|
`Unsubscribe(correlationId)` finds nothing and the live stream is never cancelled — an
|
||||||
|
orphan. Conversely the overwritten CTS is leaked (never disposed).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
When inserting, cancel+dispose any prior CTS for that correlation ID. In the `finally`,
|
||||||
|
remove only if the stored CTS is the one this call created (use the
|
||||||
|
`TryRemove(KeyValuePair)` overload, mirroring what `SiteStreamGrpcServer` already does
|
||||||
|
with `StreamEntry`). Consider keying subscriptions by a per-call GUID rather than the
|
||||||
|
caller-supplied correlation ID.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source: the
|
||||||
|
inline `_subscriptions[correlationId] = cts` overwrote a prior CTS without
|
||||||
|
cancel/dispose (leak), and the `finally`'s `TryRemove(correlationId, out _)` removed by
|
||||||
|
key only — a racing reconnect's live CTS could be removed by the prior call's `finally`,
|
||||||
|
orphaning the live stream. Fix: extracted two internal helpers used by `SubscribeAsync`
|
||||||
|
— `RegisterSubscription` cancels+disposes any existing CTS for the correlation ID before
|
||||||
|
inserting, and `RemoveSubscription` uses the `ConcurrentDictionary.TryRemove(KeyValuePair)`
|
||||||
|
overload so it removes only the CTS that call created (mirroring `SiteStreamGrpcServer`'s
|
||||||
|
`StreamEntry` pattern). Regression tests
|
||||||
|
`SiteStreamGrpcClientTests.RegisterSubscription_ReusedCorrelationId_CancelsAndDisposesPriorCts`
|
||||||
|
and `SiteStreamGrpcClientTests.RemoveSubscription_OnlyRemovesOwnCts_NotAReplacement`
|
||||||
|
fail against the pre-fix logic and pass after.
|
||||||
|
|
||||||
|
### Communication-004 — Coordinator actors declare no SupervisorStrategy (design requires Resume)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:42`, `src/ScadaLink.Communication/Actors/SiteCommunicationActor.cs:22` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
CLAUDE.md ("Explicit supervision strategies: Resume for coordinator actors, Stop for
|
||||||
|
short-lived execution actors") requires coordinator actors to use an explicit `Resume`
|
||||||
|
supervision strategy. `CentralCommunicationActor` and `SiteCommunicationActor` are
|
||||||
|
long-lived coordinators (they own the per-site ClusterClient map, debug
|
||||||
|
subscriptions, in-progress deployments) but neither overrides `SupervisorStrategy`.
|
||||||
|
They fall back to the Akka default (`OneForOneStrategy` with `Restart`). A child fault
|
||||||
|
— e.g. a `ClusterClient` child of `CentralCommunicationActor` created by
|
||||||
|
`DefaultSiteClientFactory` — would trigger a `Restart` under the default strategy, and any
|
||||||
|
exception in the coordinator itself would restart it, wiping `_siteClients`,
|
||||||
|
`_debugSubscriptions`, and `_inProgressDeployments` silently. The design intent is
|
||||||
|
`Resume` so transient child faults do not discard coordinator state.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Override `SupervisorStrategy` on both actors to return an explicit
|
||||||
|
`OneForOneStrategy` with `Directive.Resume` (or the project's standard coordinator
|
||||||
|
strategy), matching the documented decision and other coordinator actors.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-005 — gRPC keepalive and max-stream-lifetime options are defined but never applied
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:25`, `src/ScadaLink.Communication/CommunicationOptions.cs:36` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CommunicationOptions` exposes `GrpcKeepAlivePingDelay`, `GrpcKeepAlivePingTimeout`,
|
||||||
|
`GrpcMaxStreamLifetime`, and `GrpcMaxConcurrentStreams`, and the design doc's
|
||||||
|
"gRPC Connection Keepalive" section explicitly states these are configurable. However
|
||||||
|
`SiteStreamGrpcClient`'s constructor hard-codes `KeepAlivePingDelay =
|
||||||
|
TimeSpan.FromSeconds(15)` and `KeepAlivePingTimeout = TimeSpan.FromSeconds(10)`
|
||||||
|
instead of reading the options. `GrpcMaxStreamLifetime` (the documented "Session
|
||||||
|
timeout — 4 hours" third layer of dead-client detection) is not referenced anywhere
|
||||||
|
— `SiteStreamGrpcServer.SubscribeInstance` creates a linked CTS from the call
|
||||||
|
cancellation token only, with no `CancelAfter`. The 4-hour zombie-stream safety net
|
||||||
|
described in the design doc does not exist in code. `GrpcMaxConcurrentStreams` is also
|
||||||
|
not wired to the server (`SiteStreamGrpcServer` takes a `maxConcurrentStreams`
|
||||||
|
constructor parameter defaulting to 100, but nothing binds the option to it).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Flow `CommunicationOptions` into `SiteStreamGrpcClient` and `SiteStreamGrpcServer`
|
||||||
|
(via the factory / DI). Apply `GrpcKeepAlivePingDelay` / `GrpcKeepAlivePingTimeout` to
|
||||||
|
the `SocketsHttpHandler`, bind `GrpcMaxConcurrentStreams` to the server's limit, and
|
||||||
|
implement the `GrpcMaxStreamLifetime` session timeout with `CancelAfter` on the
|
||||||
|
server-side stream CTS — or, if the 4-hour cap is intentionally dropped, remove the
|
||||||
|
option and update the design doc.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-006 — Site address load failures are silently swallowed, leaving a stale cache
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:204` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LoadSiteAddressesFromDb` runs the repository query inside `Task.Run(...).PipeTo(self)`.
|
||||||
|
If `GetAllSitesAsync` throws (database unavailable, transient connection error), the
|
||||||
|
faulted task is piped to `Self` as a `Status.Failure`. `CentralCommunicationActor` has
|
||||||
|
no `Receive<Status.Failure>` handler, so the failure becomes an unhandled message
|
||||||
|
(logged at debug, not surfaced) and the periodic refresh silently fails. If the
|
||||||
|
*first* startup load fails the actor runs with an empty `_siteClients` map — every
|
||||||
|
`SiteEnvelope` is dropped (line 187) and every Ask times out with no indication of the
|
||||||
|
root cause.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `Receive<Status.Failure>` handler that logs the load failure at Warning/Error
|
||||||
|
level so operators can distinguish "site has no addresses configured" from "database
|
||||||
|
is down". Optionally surface a health metric for repeated load failures.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-007 — `SiteStreamGrpcClientFactory.Dispose` blocks on async work (sync-over-async)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClientFactory.cs:53` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`Dispose()` calls `DisposeAsync().AsTask().GetAwaiter().GetResult()`. This is the
|
||||||
|
classic sync-over-async pattern: it blocks the calling thread until all per-site
|
||||||
|
`SiteStreamGrpcClient.DisposeAsync` calls complete. If `Dispose` is invoked from a
|
||||||
|
context with a single-threaded synchronization context or from DI container shutdown
|
||||||
|
on a constrained thread pool, this can deadlock or stall host shutdown. The class
|
||||||
|
already implements `IAsyncDisposable`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Prefer registering and disposing the factory through `IAsyncDisposable` only (modern
|
||||||
|
.NET DI honours it for singletons). If a synchronous `Dispose` must remain, dispose
|
||||||
|
the underlying `GrpcChannel`s directly (synchronous) rather than blocking on the async
|
||||||
|
path, or document why blocking is safe here.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-008 — Reconnect retry-count reset can mask a flapping stream indefinitely
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:71`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:174` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`_retryCount` is reset to 0 every time a single `AttributeValueChanged` or
|
||||||
|
`AlarmStateChanged` event is received (lines 72, 77). Combined with `MaxRetries = 3`,
|
||||||
|
a stream that connects, delivers exactly one event, then fails — repeatedly — will
|
||||||
|
reconnect forever. The design doc states "max 3 retries, terminate the session if all
|
||||||
|
retries fail"; the current logic only terminates after 3 *consecutive* failures with
|
||||||
|
zero intervening events, so a flapping site never trips the limit and the debug
|
||||||
|
session (and its site-side relay) lives on indefinitely. The `ReceiveTimeout` orphan
|
||||||
|
net is also reset by every received message, so it does not bound this case either.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either reset `_retryCount` only after the stream has been stably connected for some
|
||||||
|
minimum duration (e.g. a timer armed on stream open, cancelled on the next error), or
|
||||||
|
keep a separate cumulative reconnect counter / time window that bounds total
|
||||||
|
reconnects regardless of intervening events.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-009 — `_siteClients` field is mutable and reassignable; cache update is not atomic on failure
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:53`, `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:240` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`_siteClients` is a non-`readonly` `Dictionary` field. It is only mutated on the actor
|
||||||
|
thread (correct), but the field is needlessly reassignable, and
|
||||||
|
`HandleSiteAddressCacheLoaded` mutates it in place across several loops. If
|
||||||
|
`ActorPath.Parse` throws on a malformed address mid-loop (e.g. a site row with a
|
||||||
|
garbage `NodeAAddress`), the method aborts partway through, having already stopped
|
||||||
|
some ClusterClients and added others — leaving the cache partially updated with no
|
||||||
|
recovery until the next 60s refresh. The actor's other mutable collections
|
||||||
|
(`_debugSubscriptions`, `_inProgressDeployments`) are correctly `readonly`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Mark `_siteClients` `readonly`. Validate/parse all addresses up front (or wrap
|
||||||
|
`ActorPath.Parse` in a try/catch that logs and skips the bad site) so a single
|
||||||
|
malformed site record cannot abort the whole refresh and leave a half-updated cache.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-010 — `DebugStreamBridgeActor` XML doc incorrectly describes it as a "Persistent actor"
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:10` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The class summary opens with "Persistent actor (one per active debug session)...".
|
||||||
|
The actor derives from `ReceiveActor`, not a persistent actor base class, holds no
|
||||||
|
`PersistenceId`, and writes no journal/snapshot. "Persistent" is misleading — debug
|
||||||
|
sessions are explicitly "session-based and temporary" per the design doc. A reader
|
||||||
|
could assume state survives restart, which it does not.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Reword the summary to "Long-lived (per active debug session) actor on the central
|
||||||
|
side..." or similar, removing the word "Persistent".
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Communication-011 — No test coverage for snapshot-timeout cleanup, address-cache failure, or gRPC reconnect leak
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.Communication.Tests/` (module-wide) |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test suite covers happy-path routing, handler-not-registered failures, heartbeat
|
||||||
|
bumping, cache refresh, and gRPC bridge reconnect/retry. However several critical
|
||||||
|
paths identified in this review have no coverage:
|
||||||
|
|
||||||
|
- The `DebugStreamService.StartStreamAsync` snapshot-timeout path (Communication-001)
|
||||||
|
— no test verifies bridge actor / site subscription teardown on timeout, nor the
|
||||||
|
`onTerminated`-before-snapshot race that throws a non-`OperationCanceledException`.
|
||||||
|
- `CentralCommunicationActor` behaviour when `LoadSiteAddressesFromDb` faults
|
||||||
|
(Communication-006) — `RefreshSiteAddresses_UpdatesCache` only exercises success.
|
||||||
|
- `SiteStreamGrpcClient` subscription-map overwrite/removal race (Communication-003)
|
||||||
|
and gRPC reconnect not unsubscribing the old node (Communication-002).
|
||||||
|
- A malformed `NodeAAddress` aborting `HandleSiteAddressCacheLoaded` (Communication-009).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for: snapshot timeout / pre-snapshot termination cleanup; address-load
|
||||||
|
failure logging and empty-cache behaviour; reusing a correlation ID across
|
||||||
|
`SubscribeAsync` calls; and a malformed site address during cache refresh.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,415 @@
|
|||||||
|
# Code Review — ConfigurationDatabase
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.ConfigurationDatabase` |
|
||||||
|
| Design doc | `docs/requirements/Component-ConfigurationDatabase.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 10 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The ConfigurationDatabase module is a focused, conventional EF Core data-access layer:
|
||||||
|
a single `ScadaLinkDbContext`, Fluent API entity configurations, eight repository
|
||||||
|
implementations of Commons-defined interfaces, an `IAuditService` implementation, an
|
||||||
|
`IInstanceLocator`, environment-aware migration handling, and design-time tooling
|
||||||
|
support. Overall structure adheres well to the design doc and the CLAUDE.md "Code
|
||||||
|
Organization" decisions — POCO entities and interfaces live in Commons, EF mappings and
|
||||||
|
implementations live here, Fluent API only, and optimistic concurrency is correctly
|
||||||
|
applied to `DeploymentRecord` via `rowversion`. The module is generally healthy.
|
||||||
|
|
||||||
|
The main themes across findings are: (1) a genuine logic bug in
|
||||||
|
`GetTemplateWithChildrenAsync`, which loads child templates and then discards them, so
|
||||||
|
the method does not deliver what its name implies; (2) secret-bearing columns (SMTP
|
||||||
|
credentials, external-system auth config, database connection strings) persisted in
|
||||||
|
plaintext with no encryption-at-rest; (3) a hardcoded SQL `sa` connection string with a
|
||||||
|
password literal embedded in `DesignTimeDbContextFactory`; (4) the no-arg
|
||||||
|
`AddConfigurationDatabase()` overload, which silently registers nothing, making a
|
||||||
|
misconfigured central node fail late and opaquely; and (5) audit-trail robustness gaps —
|
||||||
|
`AuditService` can throw on serializing entities with navigation cycles, rolling back
|
||||||
|
the whole business operation, and the design doc's claim that audit `Id` is `Long/GUID`
|
||||||
|
disagrees with the `int` entity. Test coverage is good for the repositories that have
|
||||||
|
tests (Security, CentralUI, audit, concurrency, seed data, data protection) but several
|
||||||
|
repositories (`TemplateEngineRepository`, `DeploymentManagerRepository`,
|
||||||
|
`ExternalSystemRepository`, `InboundApiRepository`, `NotificationRepository`,
|
||||||
|
`SiteRepository`, `InstanceLocator`) have little or no direct coverage.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ✓ | `GetTemplateWithChildrenAsync` discards loaded children (CD-001); `GetApprovedKeysForMethodAsync` CSV parsing is brittle (CD-008). |
|
||||||
|
| 2 | Akka.NET conventions | ✓ | No actors in this module; data-access layer only. No issues found. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | DbContext correctly scoped; optimistic concurrency on `DeploymentRecord` correct. Repositories hold no shared mutable state. No issues found. |
|
||||||
|
| 4 | Error handling & resilience | ✓ | `WaitForDatabaseReadyAsync` is sound. No-arg DI overload fails late and silently (CD-003); audit JSON serialization failure handling (CD-007). |
|
||||||
|
| 5 | Security | ✓ | Hardcoded `sa` credential literal (CD-002); SMTP/DB-connection/auth secrets stored unencrypted (CD-004). |
|
||||||
|
| 6 | Performance & resource management | ✓ | `GetAllTemplatesAsync` / `GetTemplateTreeAsync` eager-load multiple collections without `AsSplitQuery` (CD-009). No N+1 in audited paths. |
|
||||||
|
| 7 | Design-document adherence | ✓ | Audit `Id` type mismatch vs design doc (CD-005); seed data uses `HasData` consistent with design. |
|
||||||
|
| 8 | Code organization & conventions | ✓ | Mostly clean. `Grpc*` address columns unbounded (CD-006); inconsistent null-guard on injected context (CD-011). |
|
||||||
|
| 9 | Testing coverage | ✓ | Several repositories and `InstanceLocator` lack direct tests (CD-010). |
|
||||||
|
| 10 | Documentation & comments | ✓ | `DeploymentManagerRepository` "WP-24 stub" XML comment is stale; noted in module context but not raised as a standalone finding. No issues found beyond items above. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### ConfigurationDatabase-001 — `GetTemplateWithChildrenAsync` loads child templates then discards them
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/TemplateEngineRepository.cs:30-41` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetTemplateWithChildrenAsync` queries for all templates whose `ParentTemplateId`
|
||||||
|
equals the requested id, assigns the result to the local variable `children`, and
|
||||||
|
then returns `template` — the `children` list is never used, attached to the returned
|
||||||
|
object, or otherwise exposed. The method is therefore behaviourally identical to
|
||||||
|
`GetTemplateByIdAsync` but issues an extra database round-trip. Any caller relying on
|
||||||
|
the method name to obtain a template with its derived/child templates populated will
|
||||||
|
silently receive a template with no children, leading to incorrect template-resolution
|
||||||
|
or UI behaviour with no error.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either populate the children onto the returned aggregate (e.g. project into a result
|
||||||
|
type that carries the children, or load them into a navigation collection that is
|
||||||
|
actually returned), or remove the dead query and the misleading method if children are
|
||||||
|
not in fact needed. If the navigation does not exist on the `Template` entity, add an
|
||||||
|
explicit result tuple/DTO so the loaded data reaches the caller.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source: the
|
||||||
|
method ran a `Where(t => t.ParentTemplateId == id)` query, assigned the result to a
|
||||||
|
local `children` variable, and never used it — a misleading no-op that also issued an
|
||||||
|
extra database round-trip per call.
|
||||||
|
|
||||||
|
Triage of the three callers (`FlatteningPipeline.BuildTemplateChainAsync`,
|
||||||
|
`ManagementActor.HandleGetTemplate`, `ManagementActor.HandleValidateTemplate`) showed
|
||||||
|
none consume derived/sub-templates; they all need the template's *member* collections
|
||||||
|
(Attributes/Alarms/Scripts/Compositions), which `GetTemplateByIdAsync` already
|
||||||
|
eager-loads. The `Template` entity has no child-templates navigation collection, and
|
||||||
|
adding one (plus changing the interface signature) would require editing
|
||||||
|
`ScadaLink.Commons`, which is outside this module's scope.
|
||||||
|
|
||||||
|
The fix applied the recommendation's secondary option: it removed the dead query so the
|
||||||
|
method no longer misleads or wastes a round-trip, and added an XML doc comment
|
||||||
|
clarifying that "children" means the template's member collections. The method now
|
||||||
|
honestly delegates to `GetTemplateByIdAsync`. Regression tests added in
|
||||||
|
`TemplateEngineRepositoryTests.cs`:
|
||||||
|
`GetTemplateWithChildrenAsync_ReturnsTemplateWithAllMemberCollectionsPopulated`,
|
||||||
|
`GetTemplateWithChildrenAsync_PreservesParentTemplateId_ForInheritanceChainWalk`, and
|
||||||
|
`GetTemplateWithChildrenAsync_ReturnsNull_WhenTemplateDoesNotExist` — pinning the
|
||||||
|
template-aggregate contract the callers depend on.
|
||||||
|
|
||||||
|
### ConfigurationDatabase-002 — Hardcoded `sa` connection string with embedded password literal
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/DesignTimeDbContextFactory.cs:21-22` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DesignTimeDbContextFactory` falls back to a literal connection string
|
||||||
|
`"Server=localhost,1433;Database=ScadaLink_Config;User Id=sa;Password=YourPassword;TrustServerCertificate=True"`
|
||||||
|
when no configured connection string is found. Embedding a credential literal (even a
|
||||||
|
placeholder) in source code is a poor pattern: it is committed to version control,
|
||||||
|
encourages copy-paste of `sa`/`TrustServerCertificate=True` into real environments, and
|
||||||
|
the fallback can mask a genuine misconfiguration during `dotnet ef` operations by
|
||||||
|
silently pointing tooling at an unintended database.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Remove the hardcoded fallback. If no connection string is resolved from configuration
|
||||||
|
or environment, throw a clear `InvalidOperationException` instructing the developer to
|
||||||
|
set `ScadaLink:Database:ConfigurationDb` (or an environment variable). At minimum, read
|
||||||
|
the design-time connection string from an environment variable rather than a literal,
|
||||||
|
and never use `sa`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-003 — No-arg `AddConfigurationDatabase()` silently registers nothing
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs:44-49` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The parameterless `AddConfigurationDatabase()` overload is a deliberate no-op "retained
|
||||||
|
for backward compatibility during migration." If a central node is wired up with this
|
||||||
|
overload by mistake, no `ScadaLinkDbContext`, repositories, `IAuditService`, or
|
||||||
|
`IInstanceLocator` are registered. The failure does not surface at startup; it surfaces
|
||||||
|
much later as opaque DI resolution exceptions the first time any consumer requests a
|
||||||
|
repository — far from the actual misconfiguration. The XML comment also refers to
|
||||||
|
"Phase 0 stubs," which is stale relative to the current state of the module.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either delete the no-op overload now that the connection-string overload exists, or
|
||||||
|
mark it `[Obsolete]` with an error-level message so misuse is a compile-time failure.
|
||||||
|
If a true "site node" no-op is genuinely required, give it an explicit, self-documenting
|
||||||
|
name (e.g. `AddConfigurationDatabaseNoOp()`), and remove the stale "Phase 0" wording.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-004 — Secret-bearing columns stored in plaintext with no protection
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/NotificationConfiguration.cs:56-57`, `src/ScadaLink.ConfigurationDatabase/Configurations/ExternalSystemConfiguration.cs:25-26,75-77` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SmtpConfiguration.Credentials`, `ExternalSystemDefinition.AuthConfiguration`, and
|
||||||
|
`DatabaseConnectionDefinition.ConnectionString` all hold authentication secrets (SMTP
|
||||||
|
OAuth2 client secrets / passwords, external-system API keys or Basic Auth credentials,
|
||||||
|
and database passwords respectively). They are mapped as ordinary string columns and
|
||||||
|
persisted verbatim. Anyone with read access to the configuration database — or to
|
||||||
|
audit-log JSON, if these entities are serialized into `AfterStateJson` — obtains the
|
||||||
|
plaintext secrets. The design doc does not call out encryption-at-rest for these
|
||||||
|
fields, so the design is also silent on a real risk.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Apply encryption to these fields, e.g. an EF Core value converter backed by ASP.NET Core
|
||||||
|
Data Protection (the module already configures `IDataProtectionKeyContext`), or rely on
|
||||||
|
SQL Server Always Encrypted / column encryption. Separately, ensure `IAuditService`
|
||||||
|
callers never pass these secret-bearing entities (or that the serializer redacts the
|
||||||
|
fields) so secrets do not leak into `AuditLogEntry.AfterStateJson`. Update the design
|
||||||
|
doc to state the chosen at-rest protection.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-005 — Audit `Id` type disagrees with the design doc
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/AuditConfiguration.cs:11` (entity `src/ScadaLink.Commons/Entities/Audit/AuditLogEntry.cs`) |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc's Audit Entry Schema table specifies `Id` as `Long / GUID`, and notes
|
||||||
|
the audit table is append-only and retained indefinitely. The actual `AuditLogEntry`
|
||||||
|
entity uses an `int` identity key. For a never-purged, append-only table that
|
||||||
|
accumulates one row per save operation across the system lifetime, a 32-bit identity
|
||||||
|
risks overflow over a long deployment horizon, and the code drifts from the documented
|
||||||
|
schema.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Change `AuditLogEntry.Id` to `long` (and the corresponding migration column to
|
||||||
|
`bigint`) to match the design doc and remove the overflow risk, or — if `int` is
|
||||||
|
intentional — update the design doc's schema table to say `int` and justify it.
|
||||||
|
Resolve the discrepancy in one direction.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-006 — `Site.GrpcNodeAAddress` / `GrpcNodeBAddress` columns are unbounded
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/SiteConfiguration.cs:24-25` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SiteConfiguration` explicitly sets `HasMaxLength(500)` for `NodeAAddress` and
|
||||||
|
`NodeBAddress`, but the entity also has `GrpcNodeAAddress` and `GrpcNodeBAddress`
|
||||||
|
(added per the gRPC streaming design decision) which are not configured at all. With no
|
||||||
|
length set, EF Core maps them to `nvarchar(max)`. This is inconsistent with the sibling
|
||||||
|
address columns, wastes the opportunity to constrain input, and `nvarchar(max)` columns
|
||||||
|
cannot be used as index key columns and have different storage/performance characteristics.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add `builder.Property(s => s.GrpcNodeAAddress).HasMaxLength(500);` and the same for
|
||||||
|
`GrpcNodeBAddress`, matching the existing `NodeAAddress`/`NodeBAddress` mapping, and
|
||||||
|
generate a migration to alter the column types.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-007 — `AuditService` does not handle JSON-serialization failure of arbitrary `afterState`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Services/AuditService.cs:28-30` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LogAsync` serializes the caller-supplied `afterState` object with
|
||||||
|
`JsonSerializer.Serialize(afterState)` using default options. EF entity POCOs commonly
|
||||||
|
have navigation properties; serializing an entity that has loaded navigations (e.g. a
|
||||||
|
`Template` with `Attributes`/`Scripts`, or any entity with a cycle) will throw
|
||||||
|
`JsonException` for a reference cycle or produce a very large payload. Because audit
|
||||||
|
writes are designed to commit in the same transaction as the change, a serialization
|
||||||
|
exception thrown here will roll back the *entire* business operation — a template
|
||||||
|
update fails because its audit entry could not be serialized. This couples audit
|
||||||
|
robustness to the shape of every entity passed in.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Configure `JsonSerializerOptions` with `ReferenceHandler.IgnoreCycles` (or
|
||||||
|
`Preserve`) and a sensible `MaxDepth`, and consider serializing a projected
|
||||||
|
DTO/snapshot rather than the live tracked entity. Decide explicitly whether an audit
|
||||||
|
serialization failure should fail the operation or be logged and degraded gracefully,
|
||||||
|
and document that decision against the design doc's transactional-guarantee section.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-008 — `GetApprovedKeysForMethodAsync` CSV parsing silently drops malformed ids
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/InboundApiRepository.cs:46-58` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ApiMethod.ApprovedApiKeyIds` is stored as a comma-separated string of integer ids.
|
||||||
|
`GetApprovedKeysForMethodAsync` splits it, maps each token with
|
||||||
|
`int.TryParse(...) ? id : -1`, then filters with `id > 0`. Any token that fails to
|
||||||
|
parse, or a legitimately negative/zero id, is silently discarded. If `ApprovedApiKeyIds`
|
||||||
|
becomes corrupt (e.g. a stray name instead of an id), the method quietly returns fewer
|
||||||
|
approved keys than expected, which for an API-key authorization path means a method may
|
||||||
|
unexpectedly reject a key that should be approved. Storing a relational many-to-many as
|
||||||
|
a CSV string in a column is itself fragile (no FK integrity, no cascade on key delete).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Short term: log a warning when a token fails to parse instead of silently dropping it,
|
||||||
|
so corruption is observable. Longer term: replace the CSV column with a proper join
|
||||||
|
table (`ApiMethodApprovedKey`) with foreign keys to `ApiMethod` and `ApiKey`, which
|
||||||
|
gives referential integrity and correct cascade behaviour when an API key is deleted.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-009 — Multi-collection eager loads issue cartesian-product queries
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/TemplateEngineRepository.cs:43-51,53-61`, `src/ScadaLink.ConfigurationDatabase/Repositories/CentralUiRepository.cs:45-55` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetAllTemplatesAsync`, `GetTemplatesComposingAsync`, and `GetTemplateTreeAsync` each
|
||||||
|
`Include` three-to-four sibling collections (`Attributes`, `Alarms`, `Scripts`,
|
||||||
|
`Compositions`) in a single query. EF Core's default single-query strategy produces a
|
||||||
|
cartesian-product join across those collections, so a template with N attributes, M
|
||||||
|
alarms, and K scripts yields N×M×K rows that EF must then de-duplicate. For templates
|
||||||
|
with many members this materially inflates the result set and query time.
|
||||||
|
`GetInstanceByIdAsync`/`GetAllInstancesAsync` have the same shape with three
|
||||||
|
collections.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add `.AsSplitQuery()` to these multi-collection-include queries (or set
|
||||||
|
`UseQuerySplittingBehavior(QuerySplittingBehavior.SplitQuery)` globally in
|
||||||
|
`AddConfigurationDatabase`) so each collection is loaded with a separate query and the
|
||||||
|
cartesian explosion is avoided.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-010 — Several repositories and `InstanceLocator` lack direct test coverage
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/TemplateEngineRepository.cs`, `Repositories/DeploymentManagerRepository.cs`, `Repositories/ExternalSystemRepository.cs`, `Repositories/InboundApiRepository.cs`, `Repositories/NotificationRepository.cs`, `Repositories/SiteRepository.cs`, `Services/InstanceLocator.cs` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test project covers `SecurityRepository`, `CentralUiRepository`, `AuditService`,
|
||||||
|
optimistic concurrency, seed data, and Data Protection persistence. There are no direct
|
||||||
|
tests for `TemplateEngineRepository` (the largest repository, and the one with the
|
||||||
|
ConfigurationDatabase-001 bug, which a test would have caught), `DeploymentManagerRepository` (including
|
||||||
|
its `Local`-then-stub delete fallback and the `DeleteInstanceAsync`
|
||||||
|
restrict-FK-cleanup logic), `ExternalSystemRepository`, `InboundApiRepository` (notably
|
||||||
|
`GetApprovedKeysForMethodAsync` CSV parsing — ConfigurationDatabase-008), `NotificationRepository`,
|
||||||
|
`SiteRepository` (including its stub-attach delete path), or `InstanceLocator`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add repository-level tests using the existing `SqliteTestHelper` pattern, covering at
|
||||||
|
minimum: CRUD round-trips, the stub-attach delete fallbacks in
|
||||||
|
`DeploymentManagerRepository`/`SiteRepository`, `DeleteInstanceAsync`'s explicit
|
||||||
|
deployment-record cleanup, `GetApprovedKeysForMethodAsync` with valid/malformed CSV,
|
||||||
|
and `InstanceLocator.GetSiteIdForInstanceAsync` for found/not-found cases.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ConfigurationDatabase-011 — Inconsistent constructor null-guarding across repositories/services
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/ExternalSystemRepository.cs:11-14`, `Repositories/InboundApiRepository.cs:11-14`, `Repositories/NotificationRepository.cs:11-14`, `Services/InstanceLocator.cs:13-16` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SecurityRepository`, `CentralUiRepository`, `TemplateEngineRepository`,
|
||||||
|
`DeploymentManagerRepository`, `SiteRepository`, and `AuditService` all guard their
|
||||||
|
injected `ScadaLinkDbContext` with `?? throw new ArgumentNullException(...)`.
|
||||||
|
`ExternalSystemRepository`, `InboundApiRepository`, `NotificationRepository`, and
|
||||||
|
`InstanceLocator` assign the constructor argument directly with no guard. This is a
|
||||||
|
minor consistency/maintainability issue: although the DI container will not normally
|
||||||
|
supply null, the divergence makes the codebase look unfinished and means a future
|
||||||
|
hand-constructed instance fails with a less informative `NullReferenceException` later.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Apply the same `?? throw new ArgumentNullException(nameof(context))` guard in the four
|
||||||
|
inconsistent constructors so all data-access types behave uniformly.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,532 @@
|
|||||||
|
# Code Review — DataConnectionLayer
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.DataConnectionLayer` |
|
||||||
|
| Design doc | `docs/requirements/Component-DataConnectionLayer.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 8 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The DataConnectionLayer is a reasonably well-structured module: the Become/Stash
|
||||||
|
lifecycle state machine, the captured-`Self` marshalling of background-thread
|
||||||
|
disconnect events, and the protocol-factory abstraction all follow the design doc
|
||||||
|
and Akka.NET conventions. However, the review found one **critical** actor-model
|
||||||
|
violation — `HandleSubscribe` spawns a `Task.Run` that mutates the actor's private
|
||||||
|
dictionaries and counters from a thread-pool thread, racing with the actor's own
|
||||||
|
message loop. Several **high**-severity issues cluster around concurrency and error
|
||||||
|
handling: the subscription-failure path leaves the connection with degraded subtrees
|
||||||
|
but no real recovery, the `DataConnectionManagerActor`'s `Restart` supervision drops
|
||||||
|
all subscription state on a connection-actor crash, and `RealOpcUaClient`'s monitored-
|
||||||
|
item callback dictionary is mutated without synchronization while OPC UA notification
|
||||||
|
threads read it. The remaining findings concern stale health counters after failover,
|
||||||
|
an unused `WriteTimeout` option (writes are unbounded despite the design promising a
|
||||||
|
30 s timeout), `ReadBatchAsync` aborting mid-batch, and documentation drift between
|
||||||
|
the design doc's failover state machine and the implemented unstable-disconnect
|
||||||
|
heuristic. Test coverage is adequate for the happy paths and failover but absent for
|
||||||
|
tag-resolution retry, disconnect/re-subscribe, and concurrency around `HandleSubscribe`.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | x | `_resolvedTags` double-counting and stale counters after failover; `ReadBatchAsync` aborts mid-batch. |
|
||||||
|
| 2 | Akka.NET conventions | x | `Task.Run` mutating actor state (critical); `Restart` supervision loses state; closures capturing `_subscriptionsByInstance`. |
|
||||||
|
| 3 | Concurrency & thread safety | x | Actor state mutated off the actor thread; `RealOpcUaClient` callback dictionary unsynchronized. |
|
||||||
|
| 4 | Error handling & resilience | x | Subscription failures not surfaced; unbounded write with no timeout; reconnect after subscribe-time failure not handled. |
|
||||||
|
| 5 | Security | x | `AutoAcceptUntrustedCerts` defaults to `true`; OPC UA password handling acceptable. See finding 012. |
|
||||||
|
| 6 | Performance & resource management | x | `HandleUnsubscribe` O(n^2) over instances; initial-read loop serial per tag. |
|
||||||
|
| 7 | Design-document adherence | x | Failover heuristic (unstable-disconnect count) differs from documented state machine; `WriteTimeout` documented but unused. |
|
||||||
|
| 8 | Code organization & conventions | x | No issues found — POCOs in Commons, options class owned by component, factory pattern consistent. |
|
||||||
|
| 9 | Testing coverage | x | No tests for tag-resolution retry, disconnect/re-subscribe, bad-quality push, or `HandleSubscribe` concurrency. |
|
||||||
|
| 10 | Documentation & comments | x | XML comment on `RaiseDisconnected` claims thread safety it does not have; design doc round-robin description stale. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### DataConnectionLayer-001 — `Task.Run` in `HandleSubscribe` mutates actor state off the actor thread
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Critical |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:473-538` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleSubscribe` launches a `Task.Run(async () => ...)` that runs on a thread-pool
|
||||||
|
thread and directly mutates the actor's private mutable state: `instanceTags` (a
|
||||||
|
reference into `_subscriptionsByInstance`), `_subscriptionIds`, `_totalSubscribed`,
|
||||||
|
`_resolvedTags`, and `_unresolvedTags`. All of these are simultaneously read and
|
||||||
|
written by the actor's own message loop (`HandleTagValueReceived`, `HandleUnsubscribe`,
|
||||||
|
`ReSubscribeAll`, `HandleRetryTagResolution`, `ReplyWithHealthReport`). This is a
|
||||||
|
direct violation of the Akka.NET actor model, whose single-threaded-access guarantee
|
||||||
|
holds only while state is touched from the actor's own message handlers. Two concurrent
|
||||||
|
subscribe requests, or a subscribe overlapping a `TagValueReceived` / `GetHealthReport`,
|
||||||
|
produce data races on `Dictionary`/`HashSet`/`int` — `Dictionary` is not thread-safe
|
||||||
|
and concurrent mutation can corrupt internal buckets, throw, or lose entries. It can
|
||||||
|
also produce torn reads of the health counters.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Do not mutate actor state from the background task. Perform only the `await
|
||||||
|
_adapter.SubscribeAsync(...)` / `ReadAsync(...)` I/O in the task, collect the results
|
||||||
|
into a local immutable result object, and `PipeTo(Self)` an internal message (e.g.
|
||||||
|
`SubscribeCompleted`) whose handler — running on the actor thread — applies all state
|
||||||
|
mutations and counter updates. The response to `Sender` should be sent from that
|
||||||
|
handler too.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. `HandleSubscribe` was restructured to follow the actor's own
|
||||||
|
`PipeTo(Self)` pattern (the one already used by `HandleRetryTagResolution`): the
|
||||||
|
background `Task.Run` now performs only adapter I/O (`SubscribeAsync`/`ReadAsync`),
|
||||||
|
collects per-tag outcomes into an immutable `SubscribeCompleted` message, and pipes
|
||||||
|
that to `Self`. All mutation of `_subscriptionIds`, `_subscriptionsByInstance`,
|
||||||
|
`_totalSubscribed`, `_resolvedTags` and `_unresolvedTags` now happens in the new
|
||||||
|
`HandleSubscribeCompleted` handler on the actor thread; it is wired into the
|
||||||
|
Connected, Connecting and Reconnecting states so an in-flight subscribe is applied
|
||||||
|
regardless of state transitions. Regression test
|
||||||
|
`DCL001_ConcurrentSubscribes_DoNotCorruptSubscriptionCounters` (30×30 concurrent
|
||||||
|
subscribes) fails against the pre-fix code and passes after. Fixed by the commit
|
||||||
|
whose message references `DataConnectionLayer-001`.
|
||||||
|
|
||||||
|
### DataConnectionLayer-002 — `Restart` supervision discards all subscription state on connection-actor crash
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionManagerActor.cs:131-141` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DataConnectionManagerActor.SupervisorStrategy` returns a `OneForOneStrategy` with
|
||||||
|
`Directive.Restart` for `DataConnectionActor` failures. On restart, Akka.NET creates a
|
||||||
|
fresh actor instance, so all in-memory fields — `_subscriptionsByInstance`,
|
||||||
|
`_subscriptionIds`, `_subscribers`, `_unresolvedTags`, the quality counters — are
|
||||||
|
silently discarded. The actor re-enters `Connecting` with zero subscriptions, and the
|
||||||
|
design doc's "transparent re-subscribe" guarantee (WP-10) is broken: Instance Actors
|
||||||
|
that had subscribed before the crash never get their tags re-subscribed and will sit
|
||||||
|
at uncertain/stale quality indefinitely with no error returned. There is no durable
|
||||||
|
subscription store from which a restarted actor could rebuild state.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either (a) make the subscription registry durable/recoverable so a restarted actor
|
||||||
|
can rebuild it (persist to local SQLite as the design doc says connection definitions
|
||||||
|
are, and have `PreStart` reload subscriptions), or (b) treat a connection-actor crash
|
||||||
|
as a lifecycle event the `DataConnectionManagerActor` notices, so it can re-issue the
|
||||||
|
subscription registrations. At minimum document that subscribers must re-register
|
||||||
|
after a crash and surface the lost-state condition rather than failing silently.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. The `DataConnectionManagerActor.SupervisorStrategy` was changed
|
||||||
|
from `Directive.Restart` to `Directive.Resume` for `DataConnectionActor` failures.
|
||||||
|
`Resume` keeps the existing actor instance and all its in-memory subscription state
|
||||||
|
(`_subscriptionsByInstance`, `_subscriptionIds`, `_subscribers`, quality counters)
|
||||||
|
intact across a transient handler exception, so the design doc's "transparent
|
||||||
|
re-subscribe" guarantee (WP-10) is preserved. The actor is a long-lived stateful
|
||||||
|
coordinator and its own Become/Stash reconnect state machine already recovers
|
||||||
|
connection-level faults — it does not need a restart. This also aligns with the
|
||||||
|
ScadaLink convention of `Resume` for coordinator actors. Regression test
|
||||||
|
`DCL002_ConnectionActorCrash_PreservesSubscriptionState` crashes the connection actor
|
||||||
|
via a synchronously-throwing write and asserts the subscription survives (health
|
||||||
|
report still shows 1 subscribed/resolved tag); it fails against the pre-fix `Restart`
|
||||||
|
code and passes after. Fixed by the commit whose message references
|
||||||
|
`DataConnectionLayer-002` (commit `<pending>`).
|
||||||
|
|
||||||
|
### DataConnectionLayer-003 — `RealOpcUaClient` callback/monitored-item dictionaries mutated without synchronization
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:16-17,130-131,153,163,173,183-184` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`_monitoredItems` and `_callbacks` are plain `Dictionary<,>` instances. They are
|
||||||
|
written from `CreateSubscriptionAsync` / `RemoveSubscriptionAsync` (invoked from the
|
||||||
|
`DataConnectionActor`'s `Task.Run` / `ContinueWith` continuations, i.e. thread-pool
|
||||||
|
threads) and from `DisconnectAsync` (`.Clear()`), while being read concurrently from
|
||||||
|
the OPC Foundation SDK's `MonitoredItem.Notification` event handler, which fires on
|
||||||
|
the SDK's internal publish threads (`_callbacks.TryGetValue(handle, ...)` at line
|
||||||
|
163). Concurrent reads during a `Dictionary` resize or `Clear()` are undefined
|
||||||
|
behaviour — they can throw `InvalidOperationException`, return wrong entries, or
|
||||||
|
corrupt the dictionary. The `DataConnectionActor`'s subscribe path already runs off
|
||||||
|
the actor thread (finding 001), so multiple subscribe calls can also race each other
|
||||||
|
here.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Use `ConcurrentDictionary<,>` for `_monitoredItems` and `_callbacks`, or guard all
|
||||||
|
access with a lock. Note that fixing finding 001 (serialising subscribe through the
|
||||||
|
actor thread) reduces but does not eliminate the race, because the SDK notification
|
||||||
|
threads still read `_callbacks` concurrently with `RemoveSubscriptionAsync` /
|
||||||
|
`DisconnectAsync`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. `_monitoredItems` and `_callbacks` in `RealOpcUaClient` were
|
||||||
|
changed from plain `Dictionary<,>` to `ConcurrentDictionary<,>`, and the two
|
||||||
|
`Remove(key)` call sites switched to `TryRemove`. This makes the maps safe to read
|
||||||
|
from the OPC Foundation SDK's publish threads (`MonitoredItem.Notification` reading
|
||||||
|
`_callbacks`) concurrently with subscribe/disconnect mutations on other threads.
|
||||||
|
`RealOpcUaClient` wraps concrete OPC Foundation SDK types (`ISession`,
|
||||||
|
`Subscription`, `MonitoredItem`) and cannot be exercised without a live OPC UA
|
||||||
|
server, so the regression is guarded structurally by
|
||||||
|
`DCL003_SharedDictionaryFields_AreConcurrentCollections` (a reflection test asserting
|
||||||
|
both fields are `ConcurrentDictionary<,>`); it fails against the pre-fix `Dictionary`
|
||||||
|
code and passes after. Fixed by the commit whose message references
|
||||||
|
`DataConnectionLayer-003` (commit `<pending>`).
|
||||||
|
|
||||||
|
### DataConnectionLayer-004 — Subscribe-time tag-resolution failure leaves the connection healthy but never recovers correctly
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:495-503,529-537` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
When `_adapter.SubscribeAsync` throws inside the `HandleSubscribe` background task,
|
||||||
|
the catch block adds the tag to `_unresolvedTags` and increments `_totalSubscribed`,
|
||||||
|
treating every subscribe exception as a tag-resolution failure. But `SubscribeAsync`
|
||||||
|
also throws `InvalidOperationException` from `EnsureConnected()` when the OPC UA
|
||||||
|
client is not connected, and throws on transport faults — these are connection
|
||||||
|
problems, not bad tag paths. They get misclassified as unresolved tags and retried on
|
||||||
|
the 10 s tag-resolution timer instead of triggering the reconnection state machine.
|
||||||
|
Worse, the design doc (Tag Path Resolution, step 2) says the failed tag's attribute
|
||||||
|
must be marked quality `bad`; the code never pushes a bad-quality update to the
|
||||||
|
subscriber for a tag that fails to resolve at subscribe time, so the Instance Actor
|
||||||
|
stays at uncertain quality with no signal. The `TagResolutionFailed` message it sends
|
||||||
|
to `Self` only logs and re-arms the timer (`HandleTagResolutionFailed`).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Distinguish connection-level exceptions (raise `AdapterDisconnected` / let the
|
||||||
|
reconnect machine handle them) from genuine node-not-found errors. For genuine
|
||||||
|
resolution failures, push a `TagValueUpdate` with `QualityCode.Bad` to the subscribing
|
||||||
|
Instance Actor so it reflects the documented behaviour.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. The subscribe background task now classifies each subscribe
|
||||||
|
exception via the new `IsConnectionLevelFailure` helper (`InvalidOperationException`
|
||||||
|
— thrown by `EnsureConnected()` — plus `SocketException`/`TimeoutException`/
|
||||||
|
`IOException` count as connection-level; anything else is a genuine resolution
|
||||||
|
failure). The classification is carried on `SubscribeTagResult.ConnectionLevelFailure`
|
||||||
|
and applied on the actor thread in `HandleSubscribeCompleted`: connection-level
|
||||||
|
failures no longer become unresolved tags and instead drive the reconnection state
|
||||||
|
machine (`HandleSubscribeCompleted` returns a flag and the Connected-state handler
|
||||||
|
calls `BecomeReconnecting`); genuine resolution failures still go to `_unresolvedTags`
|
||||||
|
and the retry timer, and now also push a `TagValueUpdate` with `QualityCode.Bad` to
|
||||||
|
the subscribing Instance Actor, matching the design doc's Tag Path Resolution step 2.
|
||||||
|
Regression tests `DCL004_GenuineTagResolutionFailure_PushesBadQualityToSubscriber`
|
||||||
|
and `DCL004_ConnectionLevelSubscribeFailure_TriggersReconnect_NotTagRetry` both fail
|
||||||
|
against the pre-fix code and pass after. Fixed by the commit whose message references
|
||||||
|
`DataConnectionLayer-004` (commit `<pending>`).
|
||||||
|
|
||||||
|
### DataConnectionLayer-005 — `WriteTimeout` option is documented and configured but never applied
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/DataConnectionOptions.cs:15`, `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:573-590` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DataConnectionOptions.WriteTimeout` (default 30 s) and the design doc's "Shared
|
||||||
|
Settings" table both promise a bounded timeout for synchronous device writes. The
|
||||||
|
value is never read anywhere in the module (`grep` confirms only the declaration).
|
||||||
|
`HandleWrite` calls `_adapter.WriteAsync(request.TagPath, request.Value)` with no
|
||||||
|
`CancellationToken` and no timeout. If the OPC UA server hangs (TCP black-hole, no
|
||||||
|
RST), the write `Task` never completes, `PipeTo(sender)` never fires, and the calling
|
||||||
|
script's Ask blocks until its own ask-timeout — and the script gets no DCL-level
|
||||||
|
error. The design states write failures (including timeout) must be returned
|
||||||
|
synchronously to the script; an unbounded write violates that.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Create a `CancellationTokenSource(_options.WriteTimeout)`, pass its token to
|
||||||
|
`WriteAsync`, and in the continuation translate cancellation into a failed
|
||||||
|
`WriteTagResponse` with a timeout error message. Apply the same to the read used by
|
||||||
|
the initial-value seed and to `WriteBatchAndWaitAsync` paths if they are reachable.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. `HandleWrite` now creates a `CancellationTokenSource(_options.WriteTimeout)`,
|
||||||
|
passes its token to `_adapter.WriteAsync(...)`, and disposes the source in the
|
||||||
|
continuation. A cancelled/timed-out write (`Task.IsCanceled` or a base
|
||||||
|
`OperationCanceledException`) is translated into a failed `WriteTagResponse` with a
|
||||||
|
`"Write timeout after Ns"` message, so a hung device write is bounded and the failure
|
||||||
|
is returned synchronously to the calling script (WP-11) instead of blocking until the
|
||||||
|
script's own Ask-timeout. (The `WriteBatchAndWaitAsync` adapter path already accepts
|
||||||
|
an explicit `timeout`/`CancellationToken` and is not invoked by `HandleWrite`, so no
|
||||||
|
change was needed there.) Regression test
|
||||||
|
`DCL005_Write_ThatHangs_TimesOutAndReturnsFailureSynchronously` uses an adapter whose
|
||||||
|
`WriteAsync` only completes when its token fires; it fails against the pre-fix
|
||||||
|
unbounded code and passes after. Fixed by the commit whose message references
|
||||||
|
`DataConnectionLayer-005` (commit `<pending>`).
|
||||||
|
|
||||||
|
### DataConnectionLayer-006 — Health quality counters not reset/recomputed after failover or re-subscribe
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:645-673,721-756` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ReSubscribeAll` resets `_subscriptionIds`, `_unresolvedTags` and `_resolvedTags` to a
|
||||||
|
clean slate, but leaves `_lastTagQuality`, `_tagsGoodQuality`, `_tagsBadQuality` and
|
||||||
|
`_tagsUncertainQuality` untouched. `PushBadQualityForAllTags` (called on disconnect)
|
||||||
|
sets `_tagsBadQuality = _lastTagQuality.Count` and zeroes the others. After a
|
||||||
|
reconnect, `HandleTagValueReceived` decrements the *old* bucket using
|
||||||
|
`_lastTagQuality`'s value and increments the new one — but tags resolved for the first
|
||||||
|
time after reconnect were never in `_lastTagQuality`, so they only increment, never
|
||||||
|
decrement, and the totals can drift above `_totalSubscribed`. Over repeated
|
||||||
|
disconnect/reconnect cycles the health report's good/bad/uncertain counts become
|
||||||
|
unreliable.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
On `BecomeConnected` after a re-subscribe (or in `ReSubscribeAll`), clear
|
||||||
|
`_lastTagQuality` and the three quality counters and let them be repopulated from
|
||||||
|
fresh `TagValueReceived` messages. Alternatively recompute the buckets from
|
||||||
|
`_lastTagQuality` whenever it changes rather than maintaining incremental counters.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-007 — `ReadBatchAsync` aborts the whole batch on the first failing tag
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:187-195` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ReadBatchAsync` loops calling `ReadAsync` per tag. `ReadAsync` re-throws any
|
||||||
|
non-cancellation exception (line 184). So if any single tag in the batch throws (bad
|
||||||
|
node, transient fault), the entire `ReadBatchAsync` throws and the caller gets no
|
||||||
|
results for the tags that *did* read successfully — even though `ReadResult` already
|
||||||
|
has a `Success`/`ErrorMessage` shape designed to carry per-tag failures. The batch is
|
||||||
|
also fully serial (one round-trip per tag), defeating the point of a batch API; the
|
||||||
|
design doc lists `ReadBatch`/`WriteBatch` as first-class operations.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Catch per-tag exceptions inside the loop and store a failed `ReadResult` for that tag
|
||||||
|
so the batch returns a complete map. Ideally issue a single OPC UA `Read` service call
|
||||||
|
for all node IDs (`RealOpcUaClient.ReadValueAsync` already builds a
|
||||||
|
`ReadValueIdCollection` — extend it to accept multiple nodes).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-008 — `HandleUnsubscribe` is O(n^2) over instances and rechecks `_unresolvedTags` redundantly
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:540-569` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
For each tag of the instance being removed, `HandleUnsubscribe` scans every other
|
||||||
|
instance's tag set (`_subscriptionsByInstance.Where(...).Any()`), making the operation
|
||||||
|
O(tags × instances). On a site with many instances sharing a connection this is
|
||||||
|
needlessly expensive on every instance stop/redeploy. Separately, line 562
|
||||||
|
re-evaluates `!_unresolvedTags.Contains(tagPath)` immediately after line 561 already
|
||||||
|
removed `tagPath` from `_unresolvedTags`, so the condition is always true — dead
|
||||||
|
logic that obscures intent (the decrement of `_resolvedTags` is unconditional in
|
||||||
|
practice).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Maintain a reference count per tag path (or a `tagPath -> set<instance>` reverse index)
|
||||||
|
so the "any other subscriber" check is O(1). Remove the redundant `_unresolvedTags`
|
||||||
|
re-check or restructure so the resolved/unresolved decrement reflects the tag's actual
|
||||||
|
prior state captured before removal.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-009 — Implemented failover heuristic diverges from the documented state machine
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:189,242-297,379-449`, `docs/requirements/Component-DataConnectionLayer.md:73-85` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc's failover state machine reads "retry active endpoint (5s) -> N failures
|
||||||
|
(>= FailoverRetryCount) -> switch to other endpoint". The code implements two *separate*
|
||||||
|
failover triggers: (a) `HandleReconnectResult` counts `_consecutiveFailures` on
|
||||||
|
connect-attempt failures (matches the doc), and (b) `BecomeReconnecting` additionally
|
||||||
|
counts `_consecutiveUnstableDisconnects` — connections that succeeded but dropped
|
||||||
|
within a hard-coded 60 s `StableConnectionThreshold` — and fails over on that count
|
||||||
|
too. The unstable-disconnect path, the 60 s threshold, and the fact that failover can
|
||||||
|
happen on *successful-but-flaky* connections are not described in the component doc at
|
||||||
|
all. A reviewer or operator reading `Component-DataConnectionLayer.md` would not
|
||||||
|
predict this behaviour, and the 60 s threshold is a magic constant not exposed via
|
||||||
|
`DataConnectionOptions`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Update `Component-DataConnectionLayer.md` to document the unstable-disconnect failover
|
||||||
|
path and the stability threshold, and move the 60 s threshold into
|
||||||
|
`DataConnectionOptions` so it is configurable and consistent with the other tunables.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-010 — Tag-resolution retry can issue duplicate concurrent subscribe attempts
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:594-619,689-703` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleRetryTagResolution` fires `SubscribeAsync` for every tag in `_unresolvedTags`
|
||||||
|
via `ContinueWith(...).PipeTo(self)`, but does **not** remove the tags from
|
||||||
|
`_unresolvedTags` while the attempts are in flight. Because tags are not removed
|
||||||
|
before the retry, a slow `SubscribeAsync` overlapping the next 10 s tick issues
|
||||||
|
duplicate concurrent subscribe attempts for the same tag, which can create duplicate
|
||||||
|
monitored items / leaked subscription IDs (the second success overwrites
|
||||||
|
`_subscriptionIds[tag]` in `HandleTagResolutionSucceeded`, orphaning the first handle
|
||||||
|
with no `UnsubscribeAsync` call). The timer-cancel condition in
|
||||||
|
`HandleTagResolutionSucceeded` is also non-deterministic for the same reason.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Remove tags from `_unresolvedTags` (into an "in-flight" set) when a retry is
|
||||||
|
dispatched, and only put them back on failure. This prevents overlapping duplicate
|
||||||
|
subscribe attempts and makes the timer-cancel condition deterministic.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-011 — Stale subscription callbacks from disposed adapters can still reach the actor
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:486-489,278-285,416-425`, `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:252-262` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
On failover the actor disposes the old adapter (`_adapter.DisposeAsync()`,
|
||||||
|
fire-and-forget) and creates a fresh one. The old adapter's subscription callbacks
|
||||||
|
captured `self` and `tagPath`, and they `Tell` `TagValueReceived` to the actor. While the
|
||||||
|
`Reconnecting` handler ignores `TagValueReceived` (line 334), once the actor reaches
|
||||||
|
`Connected` again it processes them — and a disposed adapter whose OPC UA SDK threads
|
||||||
|
have not yet fully torn down could still deliver a value, mixing pre-failover device
|
||||||
|
data with the new endpoint's data and briefly reporting a value the active endpoint
|
||||||
|
never produced. There is no per-adapter generation/epoch tag on `TagValueReceived` to
|
||||||
|
distinguish current from stale callbacks.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add an adapter-generation counter incremented on every adapter swap; stamp it onto
|
||||||
|
`TagValueReceived` (captured in the callback closure) and drop messages whose
|
||||||
|
generation does not match the current adapter in `HandleTagValueReceived`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-012 — `AutoAcceptUntrustedCerts` defaults to `true`, accepting any server certificate
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/IOpcUaClient.cs:17`, `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:49,60-61`, `docs/requirements/Component-DataConnectionLayer.md:116` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`OpcUaConnectionOptions.AutoAcceptUntrustedCerts` defaults to `true`, and
|
||||||
|
`RealOpcUaClient.ConnectAsync` wires `CertificateValidator.CertificateValidation += (_, e) => e.Accept = true`
|
||||||
|
when it is set. With the default, every server certificate is accepted unconditionally
|
||||||
|
— there is no certificate-pinning or trust-store enforcement — which defeats the
|
||||||
|
`Sign`/`SignAndEncrypt` security modes against an active man-in-the-middle on the OPC
|
||||||
|
UA link. The design doc explicitly lists `true` as the default. For an industrial
|
||||||
|
control link this is a meaningful exposure; a secure-by-default posture would reject
|
||||||
|
untrusted certs unless an operator opts in per connection.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Default `AutoAcceptUntrustedCerts` to `false` and require explicit per-connection
|
||||||
|
opt-in, or at minimum log a prominent warning whenever the auto-accept validator is
|
||||||
|
installed. Update the design doc to reflect the secure default.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DataConnectionLayer-013 — Misleading XML comment: `RaiseDisconnected` claims thread safety it does not provide
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:270-281` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The XML doc on `RaiseDisconnected` states "Thread-safe: only the first caller triggers
|
||||||
|
the event." The implementation is a non-atomic check-then-set on a `volatile bool`
|
||||||
|
(`if (_disconnectFired) return; _disconnectFired = true;`). `volatile` guarantees
|
||||||
|
visibility, not atomicity — two threads (e.g. the OPC UA keep-alive thread via
|
||||||
|
`OnClientConnectionLost` and a `ReadAsync` failure path) can both observe
|
||||||
|
`_disconnectFired == false` and both invoke `Disconnected`. In practice the
|
||||||
|
`DataConnectionActor` tolerates a duplicate `AdapterDisconnected` message, so impact
|
||||||
|
is low, but the comment overstates the guarantee. The same pattern exists in
|
||||||
|
`RealOpcUaClient.OnSessionKeepAlive` (`_connectionLostFired`).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either make the guard atomic (`Interlocked.Exchange` with an `int` flag, or a lock),
|
||||||
|
or correct the comment to say "best-effort once-only; a duplicate event is possible
|
||||||
|
under a race and is tolerated downstream."
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,518 @@
|
|||||||
|
# Code Review — DeploymentManager
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.DeploymentManager` |
|
||||||
|
| Design doc | `docs/requirements/Component-DeploymentManager.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 12 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The DeploymentManager module is small, well-structured, and clearly maps work
|
||||||
|
packages (WP-N) onto code. The happy paths for instance deployment, lifecycle
|
||||||
|
commands, artifact broadcast, and staleness comparison are implemented
|
||||||
|
sensibly, and the operation lock correctly serializes mutating operations per
|
||||||
|
instance while allowing cross-instance parallelism. However, the review found a
|
||||||
|
significant cluster of error-handling and resilience gaps: the deployment
|
||||||
|
record can be left permanently stuck in `InProgress` when an exception other
|
||||||
|
than timeout/cancellation is thrown, the catch block writes its failure status
|
||||||
|
using a cancellation token that may already be cancelled, and the
|
||||||
|
`OperationLockManager` leaks one `SemaphoreSlim` per instance name forever.
|
||||||
|
There are also two notable design-document adherence gaps: the
|
||||||
|
"query-the-site-before-redeploy" idempotency requirement is not implemented
|
||||||
|
(`GetDeploymentStatusAsync` only reads the local DB), and the "Diff View"
|
||||||
|
feature is reduced to a bare hash comparison with no added/removed/changed
|
||||||
|
detail. Configuration is not bound to `appsettings.json`, and one option is
|
||||||
|
entirely dead. Test coverage stops at the communication boundary and never
|
||||||
|
exercises a successful deployment or the lifecycle success paths.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
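| 1 | Correctness & logic bugs | ✓ | Stuck `InProgress` record on unexpected exception and cancelled-token failure write (filed under Error handling — DeploymentManager-001/002); artifact deployment ID not persisted — DeploymentManager-010. |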
|
||||||
|
| 2 | Akka.NET conventions | ✓ | Module is a plain service layer; it calls `CommunicationService` which wraps Ask. No actors here. No issues. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | `OperationLockManager` is sound but leaks semaphores; `DeployToAllSitesAsync` correctly builds commands sequentially before parallel send. |
|
||||||
|
| 4 | Error handling & resilience | ✓ | Several gaps — see DeploymentManager-001/002/003/004. |
|
||||||
|
| 5 | Security | ✓ | SMTP credentials are serialized and broadcast to sites — see DeploymentManager-013. No injection vectors; no authz here (enforced upstream). |
|
||||||
|
| 6 | Performance & resource management | ✓ | Semaphore leak (DeploymentManager-005); artifact rebuild does N+1 method queries per external system. |
|
||||||
|
| 7 | Design-document adherence | ✓ | Missing query-before-redeploy (DeploymentManager-006); Diff View not implemented (DeploymentManager-007). |
|
||||||
|
| 8 | Code organization & conventions | ✓ | Options class not bound to configuration — DeploymentManager-008. POCO/repo placement correct. |
|
||||||
|
| 9 | Testing coverage | ✓ | No successful-deploy test, no lifecycle success test — DeploymentManager-011; dead `CreateCommand` helper — DeploymentManager-014. |
|
||||||
|
| 10 | Documentation & comments | ✓ | Misleading timeout comment — DeploymentManager-009; stale option XML doc — DeploymentManager-012. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### DeploymentManager-001 — Unexpected exceptions leave the deployment record stuck in `InProgress`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:141-199` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DeployInstanceAsync` sets the record to `InProgress` (lines 137-139), then the
|
||||||
|
`try` block calls into `CommunicationService` and the repository. The only
|
||||||
|
`catch` filter is `when (ex is TimeoutException or OperationCanceledException)`.
|
||||||
|
Any other exception — `InvalidOperationException` (thrown by
|
||||||
|
`CommunicationService.GetCommunicationActor()` when the actor is not set), a
|
||||||
|
JSON serialization error, a deserialization failure of the response, a DB
|
||||||
|
exception on `UpdateDeploymentRecordAsync`, or any transport error — escapes the
|
||||||
|
method. The deployment record remains in `DeploymentStatus.InProgress`
|
||||||
|
permanently. Because staleness and the UI both read current status, the
|
||||||
|
instance is then misreported as "deploying" forever and a re-deploy may be
|
||||||
|
blocked or misinterpreted. The design explicitly states an interrupted
|
||||||
|
deployment must be "treated as failed".
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Broaden the catch to a general `catch (Exception ex)` that records
|
||||||
|
`DeploymentStatus.Failed` with the error message, audit-logs the failure, and
|
||||||
|
re-throws or returns a failed `Result`. Keep the timeout-specific branch only
|
||||||
|
if a distinct message is desired. Ensure the failure-status write happens for
|
||||||
|
every exit path out of the `try`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`): broadened the `catch` in
|
||||||
|
`DeployInstanceAsync` to `catch (Exception ex)` so any exception (transport,
|
||||||
|
serialization, DB, `InvalidOperationException` from an uninitialized
|
||||||
|
`CommunicationService`) marks the deployment record `Failed` with the error
|
||||||
|
message and audit-logs the failure, instead of escaping and leaving the record
|
||||||
|
stuck in `InProgress`. Regression test:
|
||||||
|
`DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed`.
|
||||||
|
|
||||||
|
### DeploymentManager-002 — Failure-status write uses a possibly-cancelled cancellation token
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:186-196` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `catch (Exception ex) when (ex is TimeoutException or
|
||||||
|
OperationCanceledException)` block updates the record to `Failed` and calls
|
||||||
|
`UpdateDeploymentRecordAsync`/`SaveChangesAsync`/`LogAsync` passing the same
|
||||||
|
`cancellationToken` that was just cancelled (an `OperationCanceledException`
|
||||||
|
caught here means the token is already in the cancelled state). Those
|
||||||
|
repository and audit calls will themselves throw `OperationCanceledException`
|
||||||
|
before the failure status is persisted, so the record stays `InProgress` — the
|
||||||
|
exact bug DeploymentManager-001 describes, reached via the supposedly-handled
|
||||||
|
path.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Perform the cleanup writes with a fresh, non-cancellable token (e.g.
|
||||||
|
`CancellationToken.None`, optionally with an independent short timeout) so the
|
||||||
|
failure status is durably recorded even when the original operation was
|
||||||
|
cancelled or timed out.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`): the broadened `catch` block now
|
||||||
|
performs the failure-status write (`UpdateDeploymentRecordAsync`,
|
||||||
|
`SaveChangesAsync`) and the audit `LogAsync` with `CancellationToken.None`
|
||||||
|
instead of the operation's (possibly-cancelled) token, so the `Failed` status
|
||||||
|
is durably recorded even after a timeout/cancellation. The cleanup writes are
|
||||||
|
themselves wrapped in a `try`/`catch` that logs (without masking the original
|
||||||
|
error) if persistence still fails. Regression test:
|
||||||
|
`DeployInstanceAsync_FailureWrite_UsesNonCancellableToken`.
|
||||||
|
|
||||||
|
### DeploymentManager-003 — Successful-deployment cleanup is not atomic with the status write
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:155-170` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
After a successful site response the code calls `UpdateDeploymentRecordAsync`
|
||||||
|
(no `SaveChanges` yet), then `UpdateInstanceAsync`, then
|
||||||
|
`StoreDeployedSnapshotAsync` (which itself issues `Add`/`Update` calls), then a
|
||||||
|
single `SaveChangesAsync` at line 170. If `StoreDeployedSnapshotAsync` throws,
|
||||||
|
the exception is not caught (see DeploymentManager-001) and the
|
||||||
|
`SaveChangesAsync` never runs — the instance state, deployment status, and
|
||||||
|
snapshot are all left unpersisted even though the site has actually applied the
|
||||||
|
deployment. Central and site are now divergent: the site is running the new
|
||||||
|
config but central still shows the old state and a non-`Success` deployment
|
||||||
|
record.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the post-success persistence so that, at minimum, the deployment record's
|
||||||
|
`Success` status is committed. Consider committing the status first, then the
|
||||||
|
instance state and snapshot, so a later failure does not lose the fact that the
|
||||||
|
site succeeded. Log loudly if the snapshot write fails after a confirmed site
|
||||||
|
apply.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-004 — Site-success but central-delete-failure leaves orphaned site config
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:312-319` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
In `DeleteInstanceAsync`, when the site responds `Success` the code calls
|
||||||
|
`_repository.DeleteInstanceAsync` then `SaveChangesAsync`. If `SaveChangesAsync`
|
||||||
|
throws (DB error, concurrency), the exception propagates uncaught: the site has
|
||||||
|
already destroyed the Instance Actor and removed its config, but the central
|
||||||
|
instance record still exists. The instance is now un-deletable through the
|
||||||
|
normal path (the site no longer has it, so a re-issued delete may fail) and is
|
||||||
|
permanently orphaned. The design states central must not mark the instance
|
||||||
|
deleted until the site confirms — but it does not address the inverse failure.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Catch persistence failures in the post-success block and surface a distinct
|
||||||
|
error indicating the site succeeded but the central record could not be
|
||||||
|
removed, so an operator/retry can reconcile. Consider making the central delete
|
||||||
|
idempotent and retryable independently of the site command.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-005 — `OperationLockManager` leaks a `SemaphoreSlim` per instance name
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/OperationLockManager.cs:15-33` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AcquireAsync` does `_locks.GetOrAdd(instanceUniqueName, _ => new
|
||||||
|
SemaphoreSlim(1, 1))` and entries are never removed. Every distinct instance
|
||||||
|
unique name that is ever deployed/disabled/enabled/deleted permanently adds a
|
||||||
|
`SemaphoreSlim` (an `IDisposable` whose kernel wait handle is allocated lazily, only if `AvailableWaitHandle` is accessed) to the
|
||||||
|
dictionary. Over the lifetime of a long-running central process — especially
|
||||||
|
with the bulk "deploy all out-of-date instances" workflow and instances that
|
||||||
|
are created and deleted over time — this is an unbounded leak of both managed
|
||||||
|
memory and OS handles. Deleted instances' semaphores are never reclaimed.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either accept the leak explicitly and document the expected bounded cardinality
|
||||||
|
of instance names, or implement reclamation: e.g. ref-count handles and remove
|
||||||
|
and `Dispose()` the semaphore when the count reaches zero and the lock is free.
|
||||||
|
At minimum, remove the semaphore entry when an instance is deleted
|
||||||
|
(`DeleteInstanceAsync`).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-006 — Query-the-site-before-redeploy idempotency requirement not implemented
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:84-200,363-368` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design ("Deployment Identity & Idempotency") requires: "After a central
|
||||||
|
failover or timeout, the Deployment Manager queries the site for current
|
||||||
|
deployment state before allowing a re-deploy. This prevents duplicate
|
||||||
|
application and out-of-order config changes." The code never does this.
|
||||||
|
`GetDeploymentStatusAsync` only reads the local `DeploymentRecord` from the DB
|
||||||
|
(`GetDeploymentByDeploymentIdAsync`) — it does not contact the site.
|
||||||
|
`DeployInstanceAsync` unconditionally generates a new deployment ID and sends a
|
||||||
|
new `DeployInstanceCommand` regardless of any prior in-flight or timed-out
|
||||||
|
deployment. After a timeout where the site actually applied the config, a
|
||||||
|
re-deploy produces a second deployment with no reconciliation against the
|
||||||
|
site's current revision hash. Site-side stale-rejection is the only safety
|
||||||
|
net, and that is not verified here.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a site query (a new `CommunicationService` pattern returning the site's
|
||||||
|
currently-applied deployment ID / revision hash) and call it before re-deploy
|
||||||
|
when a prior record for the instance is in `InProgress`/`Failed` due to
|
||||||
|
timeout. Reconcile: if the site already has the target revision, mark the prior
|
||||||
|
record `Success` instead of re-sending. Either implement this or update the
|
||||||
|
design doc to reflect that reconciliation is delegated entirely to site-side
|
||||||
|
stale-rejection.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._ Finding confirmed valid against the source — `GetDeploymentStatusAsync`
|
||||||
|
only reads the local `DeploymentRecord` via `GetDeploymentByDeploymentIdAsync`,
|
||||||
|
and `DeployInstanceAsync` unconditionally generates a new deployment ID with no
|
||||||
|
site reconciliation. Left Open: a proper fix is a cross-module new feature, not
|
||||||
|
a bug fix scoped to `ScadaLink.DeploymentManager`. It requires (1) a new
|
||||||
|
request/response message contract in `ScadaLink.Commons`, (2) a new
|
||||||
|
`CommunicationService` query method in `ScadaLink.Communication`, and (3)
|
||||||
|
site-side handling of the query — all outside the DeploymentManager module — plus
|
||||||
|
a design decision on the query protocol. The reconciliation logic in
|
||||||
|
`DeploymentService` cannot be implemented without those. Recommend tracking as a
|
||||||
|
dedicated cross-module feature work item (or, alternatively, amending the design
|
||||||
|
doc to delegate reconciliation entirely to site-side stale-rejection — also
|
||||||
|
outside this module's editable scope).
|
||||||
|
|
||||||
|
### DeploymentManager-007 — "Diff View" reduced to a hash comparison with no diff detail
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:334-358,401-406` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design ("Diff View" and "Dependencies" sections) states the Deployment
|
||||||
|
Manager can request a diff from the Template Engine showing added/removed
|
||||||
|
members, changed values, and connection-binding changes.
|
||||||
|
`GetDeploymentComparisonAsync` and `DeploymentComparisonResult` only compare two
|
||||||
|
revision hashes and return a boolean `IsStale` plus the two hashes. No
|
||||||
|
added/removed/changed detail is produced, and the Template Engine's diff
|
||||||
|
capability is not invoked. The UI cannot render a meaningful diff from this
|
||||||
|
result.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either implement a real diff (deserialize the stored
|
||||||
|
`DeployedConfigSnapshot.ConfigurationJson` and the freshly flattened config and
|
||||||
|
invoke the Template Engine's diff service, surfacing structured
|
||||||
|
added/removed/changed entries), or revise the design doc to scope the feature
|
||||||
|
down to staleness detection only.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-008 — `DeploymentManagerOptions` is never bound to configuration
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs:7-14` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AddDeploymentManager` registers the services but never calls
|
||||||
|
`services.Configure<DeploymentManagerOptions>(configuration.GetSection(...))`.
|
||||||
|
`IOptions<DeploymentManagerOptions>` therefore always resolves to a
|
||||||
|
default-constructed instance — the operation-lock and artifact-deployment
|
||||||
|
timeouts cannot be tuned via `appsettings.json`, contrary to the CLAUDE.md
|
||||||
|
convention "Per-component configuration via `appsettings.json` sections bound
|
||||||
|
to options classes (Options pattern)." `Host/Program.cs` binds
|
||||||
|
`SecurityOptions` and `InboundApiOptions` from configuration sections but has
|
||||||
|
no equivalent for `DeploymentManagerOptions`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add an `IConfiguration` parameter (or a configure callback) to
|
||||||
|
`AddDeploymentManager` and bind `DeploymentManagerOptions` to a section such as
|
||||||
|
`ScadaLink:DeploymentManager`, consistent with the other components.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-009 — Misleading timeout comment on `DeleteInstanceAsync`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:288` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The XML doc says "Delete fails if site unreachable (30s timeout via
|
||||||
|
CommunicationOptions)." The actual delete timeout is whatever
|
||||||
|
`CommunicationOptions.LifecycleTimeout` is configured to (passed inside
|
||||||
|
`CommunicationService.DeleteInstanceAsync`); the "30s" figure is hard-coded
|
||||||
|
into the comment and not derived from any constant in this module. If
|
||||||
|
`LifecycleTimeout` is reconfigured, the comment becomes wrong. It also wrongly
|
||||||
|
implies the value lives in this module.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Reword to "Delete fails if the site is unreachable within
|
||||||
|
`CommunicationOptions.LifecycleTimeout`" without quoting a specific number.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-010 — `SystemArtifactDeploymentRecord` does not persist the deployment ID
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs:136,194-211` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DeployToAllSitesAsync` generates a `deploymentId` (line 136) and returns it in
|
||||||
|
the `ArtifactDeploymentSummary` and audit log, but the persisted
|
||||||
|
`SystemArtifactDeploymentRecord` has no field for it (the entity only has `Id`,
|
||||||
|
`ArtifactType`, `DeployedBy`, `DeployedAt`, `PerSiteStatus`). The deployment ID
|
||||||
|
that appears in the UI summary and audit log cannot be correlated back to the
|
||||||
|
stored record. Additionally each per-site `DeployArtifactsCommand` carries its
|
||||||
|
own separate GUID (`BuildDeployArtifactsCommandAsync` line 114), so there are in
|
||||||
|
fact N+1 unrelated IDs for one logical artifact deployment.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `DeploymentId` column to `SystemArtifactDeploymentRecord` and store the
|
||||||
|
single logical `deploymentId`; reuse that ID (or a derived per-site ID) for the
|
||||||
|
per-site commands so the audit log, UI summary, and persisted record agree.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-011 — Tests never exercise a successful deployment or lifecycle success path
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs:100-151,155-199` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DeploymentServiceTests` never sets the `CommunicationService` actor, so every
|
||||||
|
deploy/lifecycle test deliberately stops at the `InvalidOperationException`
|
||||||
|
thrown by `GetCommunicationActor()` (see lines 118-125, 147). As a result there
|
||||||
|
is no test covering: a successful deployment (`DeploymentStatus.Success`
|
||||||
|
response → instance state set to `Enabled`, snapshot stored, audit logged); a
|
||||||
|
failed-but-handled site response; the `InProgress`-stuck bug
|
||||||
|
(DeploymentManager-001); successful Disable/Enable/Delete; or the operation
|
||||||
|
lock actually serializing two concurrent deploys of the same instance. The
|
||||||
|
critical post-response branch (`DeploymentService.cs:154-184`) and the entire
|
||||||
|
delete/disable/enable success path are untested. The `AuditLogs` test
|
||||||
|
(lines 277-289) asserts nothing.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Introduce a seam to inject a fake/substitute communication path (e.g. an
|
||||||
|
interface over `CommunicationService`, or wire a TestKit actor) so success and
|
||||||
|
handled-failure paths can be unit tested. Add tests for the stuck-`InProgress`
|
||||||
|
scenario and for per-instance lock contention during deploy. Make the audit
|
||||||
|
test assert on `IAuditService.LogAsync`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-012 — `LifecycleCommandTimeout` option is dead code
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs:8-9` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DeploymentManagerOptions.LifecycleCommandTimeout` is declared with a 30s
|
||||||
|
default and an XML doc, but it is never read anywhere in the codebase
|
||||||
|
(lifecycle commands rely on `CommunicationOptions.LifecycleTimeout` inside
|
||||||
|
`CommunicationService`). The option misleads readers into thinking it controls
|
||||||
|
disable/enable/delete timeouts, when setting it has no effect.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Remove `LifecycleCommandTimeout`, or actually thread it through to the
|
||||||
|
lifecycle command calls (e.g. by creating a linked CTS with this timeout in
|
||||||
|
`DisableInstanceAsync`/`EnableInstanceAsync`/`DeleteInstanceAsync`, the way
|
||||||
|
`ArtifactDeploymentTimeoutPerSite` is used).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-013 — SMTP credentials serialized and broadcast to all sites
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs:108-111` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`BuildDeployArtifactsCommandAsync` maps `smtp.Credentials` directly into
|
||||||
|
`SmtpConfigurationArtifact` and that command is sent to every site. Distributing
|
||||||
|
SMTP credentials to sites is consistent with the design (SMTP configuration is
|
||||||
|
a deployable artifact), but the credentials travel inside a serialized command
|
||||||
|
across the inter-cluster transport and are stored on each site's SQLite. There
|
||||||
|
is no indication the value is encrypted at rest on the site or scrubbed from
|
||||||
|
logs. Worth confirming the transport is TLS-protected and the site stores the
|
||||||
|
credential securely; at minimum this should be a conscious, documented decision.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Confirm inter-cluster transport encryption covers artifact commands, ensure
|
||||||
|
`Credentials` is never written to logs, and document the at-rest protection of
|
||||||
|
SMTP credentials on site SQLite. Consider encrypting the credential field
|
||||||
|
within the artifact payload.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### DeploymentManager-014 — Dead `CreateCommand` helper in artifact tests
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs:86-90` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The private static `CreateCommand()` helper is never referenced by any test in
|
||||||
|
the file. It is dead code that suggests an intended test (e.g. a successful
|
||||||
|
multi-site artifact deployment) was never written — coverage of
|
||||||
|
`DeployToAllSitesAsync` is limited to the no-sites failure case, and
|
||||||
|
`RetryForSiteAsync` and `BuildDeployArtifactsCommandAsync` have no tests at all.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either remove the unused helper or, preferably, write the missing tests for
|
||||||
|
`DeployToAllSitesAsync` (per-site success/failure matrix, partial failure) and
|
||||||
|
`RetryForSiteAsync` using it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,552 @@
|
|||||||
|
# Code Review — ExternalSystemGateway
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.ExternalSystemGateway` |
|
||||||
|
| Design doc | `docs/requirements/Component-ExternalSystemGateway.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 11 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The External System Gateway is a small module (five source files plus options) that
|
||||||
|
implements the HTTP/REST client (`ExternalSystemClient`), the database access surface
|
||||||
|
(`DatabaseGateway`), and error classification (`ErrorClassifier`). The structure is
|
||||||
|
clean and the dual call-mode semantics broadly match the design doc. However, the
|
||||||
|
review surfaced several substantive problems that prevent the module from behaving as
|
||||||
|
designed. The most serious is that **no store-and-forward delivery handler is ever
|
||||||
|
registered** for the `ExternalSystem` or `CachedDbWrite` categories, so cached calls
|
||||||
|
and cached writes are buffered but can never actually be delivered on retry — a silent
|
||||||
|
data-loss path. Two further high-impact issues are that the **per-system call timeout
|
||||||
|
is never applied** to the HTTP client (the design's central error-handling guarantee
|
||||||
|
is absent), and that **`CachedCall` double-dispatches the HTTP request** because
|
||||||
|
`StoreAndForwardService.EnqueueAsync` itself re-attempts immediate delivery, breaking
|
||||||
|
the idempotency expectations. A cluster of medium issues concern resource leaks,
|
||||||
|
classification gaps (cancellation conflation), and the dropped `StoreAndForwardResult`.
|
||||||
|
Test coverage is thin — `CachedCall` transient/buffering paths and `DatabaseGateway`
|
||||||
|
are entirely untested. Themes: incomplete wiring against the S&F engine, and design-doc
|
||||||
|
requirements (timeout, retry settings) that are declared but not implemented.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | `CachedCall` double-dispatch, URL building edge cases, dropped S&F result — findings 003, 006, 009. Classification gaps are tracked under category 4 (finding 008). |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | No actors in this module; `AddExternalSystemGatewayActors` is a no-op. Blocking-I/O isolation is delegated to Site Runtime. No issues found in this module. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Services are stateless and DI-scoped; `ExternalCallResult.Response` lazy-parse is not thread-safe but instances are single-use. No findings raised. |
|
||||||
|
| 4 | Error handling & resilience | ☑ | S&F handler never registered, double-dispatch, timeout not applied, cancellation conflation — findings 001, 002, 003, 008. |
|
||||||
|
| 5 | Security | ☑ | Auth secrets are kept out of logs, but external error bodies are echoed verbatim into script-visible messages — finding 007. |
|
||||||
|
| 6 | Performance & resource management | ☑ | `HttpRequestMessage`/`HttpResponseMessage` and failed `SqlConnection` not disposed; full repository scan per call — findings 005, 010, 011. |
|
||||||
|
| 7 | Design-document adherence | ☑ | Timeout, retry settings, audit logging gaps — findings 002, 004, 012. |
|
||||||
|
| 8 | Code organization & conventions | ☑ | Options class correctly owned by module; `MaxConcurrentConnectionsPerSystem` unused — finding 013. |
|
||||||
|
| 9 | Testing coverage | ☑ | CachedCall buffering and DatabaseGateway untested — finding 014. |
|
||||||
|
| 10 | Documentation & comments | ☑ | XML docs reference WP numbers; permanent-failure logging requirement unverified — folded into finding 012. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### ExternalSystemGateway-001 — No S&F delivery handler registered; cached calls and writes can never be delivered
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Critical |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:109`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:81` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CachedCallAsync` and `CachedWriteAsync` enqueue messages under
|
||||||
|
`StoreAndForwardCategory.ExternalSystem` and `StoreAndForwardCategory.CachedDbWrite`.
|
||||||
|
`StoreAndForwardService.RegisterDeliveryHandler` is the only mechanism that lets the
|
||||||
|
S&F engine actually deliver a buffered message, and a repository-wide search shows it
|
||||||
|
is **never called for either category** anywhere in the codebase. Consequences:
|
||||||
|
|
||||||
|
1. On a transient failure, `EnqueueAsync` falls through to the "No handler registered
|
||||||
|
— buffer for later" branch (`StoreAndForwardService.cs:163`) and the message is
|
||||||
|
persisted.
|
||||||
|
2. During the retry sweep, `AttemptDeliveryAsync` (`StoreAndForwardService.cs:201`)
|
||||||
|
logs `"No delivery handler for category {Category}"` and returns without ever
|
||||||
|
removing or delivering the message.
|
||||||
|
|
||||||
|
The result is that every cached external call and cached DB write is silently
|
||||||
|
buffered forever and never delivered — a data-loss path for the exact "deferred
|
||||||
|
delivery is acceptable" use cases the design doc calls out (posting production data,
|
||||||
|
quality reports). The script also receives `WasBuffered: true` / a successful
|
||||||
|
`CachedWriteAsync` completion, so the failure is completely invisible.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Register delivery handlers for `StoreAndForwardCategory.ExternalSystem` and
|
||||||
|
`StoreAndForwardCategory.CachedDbWrite` during host/site startup. The `ExternalSystem`
|
||||||
|
handler should deserialize the payload, re-resolve the system/method, and re-invoke
|
||||||
|
`InvokeHttpAsync`, returning `true`/`false`/throwing per the transient-vs-permanent
|
||||||
|
contract `EnqueueAsync` expects. The `CachedDbWrite` handler should execute the SQL
|
||||||
|
against the named connection. Add an integration test that buffers a message and
|
||||||
|
verifies it is delivered by a retry sweep.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. Delivery handlers for `StoreAndForwardCategory.ExternalSystem` and
|
||||||
|
`CachedDbWrite` are now registered at site startup in `AkkaHostedService`, after
|
||||||
|
`StoreAndForwardService.StartAsync()`. Each handler resolves its consumer in a fresh DI
|
||||||
|
scope and calls a new `DeliverBufferedAsync`: `ExternalSystemClient.DeliverBufferedAsync`
|
||||||
|
re-resolves the system/method and re-invokes `InvokeHttpAsync`, and
|
||||||
|
`DatabaseGateway.DeliverBufferedAsync` executes the buffered SQL — each returning `true`
|
||||||
|
on success, `false` (park) when the target no longer exists or fails permanently, and
|
||||||
|
throwing on transient failure so the engine retries. `EnqueueAsync` gained an
|
||||||
|
`attemptImmediateDelivery` parameter; `CachedCallAsync` passes `false` so registering the
|
||||||
|
handler does not dispatch the request twice (the double-dispatch noted in
|
||||||
|
`ExternalSystemGateway-003`). Regression tests cover the success, target-removed and
|
||||||
|
transient-retry paths. Fixed by the commit whose message references
|
||||||
|
`ExternalSystemGateway-001`.
|
||||||
|
|
||||||
|
### ExternalSystemGateway-002 — Per-system call timeout is never applied to HTTP requests
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:130`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc states each external system definition specifies a timeout that
|
||||||
|
"applies to all method calls on that system" and "applies to the HTTP request
|
||||||
|
round-trip", and `ExternalSystemGatewayOptions.DefaultHttpTimeout` exists as a
|
||||||
|
fallback. In practice no timeout is ever configured. `ServiceCollectionExtensions`
|
||||||
|
calls `services.AddHttpClient()` with no per-named-client configuration, and
|
||||||
|
`InvokeHttpAsync` calls `_httpClientFactory.CreateClient($"ExternalSystem_{system.Name}")`
|
||||||
|
without setting `client.Timeout` or passing a `CancellationToken` derived from a
|
||||||
|
timeout. `SendAsync` is therefore subject only to `HttpClient`'s default 100-second
|
||||||
|
timeout, regardless of the system definition or the configured `DefaultHttpTimeout`.
|
||||||
|
A slow or hung external system will block the calling Script Execution Actor far
|
||||||
|
longer than the operator configured, and the design's core error-handling guarantee
|
||||||
|
(timeout → transient classification) does not hold within the intended window.
|
||||||
|
|
||||||
|
There is also no `Timeout` field on `ExternalSystemDefinition` at all, so even a
|
||||||
|
correct implementation has nowhere to read the per-system value from — the entity is
|
||||||
|
missing the field the design requires.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `Timeout` (TimeSpan) field to `ExternalSystemDefinition` and have
|
||||||
|
`InvokeHttpAsync` enforce it — either by setting `client.Timeout` via a typed/named
|
||||||
|
`HttpClient` registration, or by linking a `CancellationTokenSource` with the
|
||||||
|
per-system (or `DefaultHttpTimeout`) timeout to the supplied `cancellationToken`
|
||||||
|
before `SendAsync`. Ensure the resulting `TaskCanceledException`/`TimeoutException`
|
||||||
|
is classified as transient.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). `InvokeHttpAsync` now enforces a call
|
||||||
|
timeout: `ExternalSystemClient` takes an `IOptions<ExternalSystemGatewayOptions>` and
|
||||||
|
links a `CancellationTokenSource(DefaultHttpTimeout)` with the caller's token before
|
||||||
|
`SendAsync` and the response-body read, so the design's "timeout applies to the HTTP
|
||||||
|
request round-trip" guarantee now holds within the configured window (default 30s)
|
||||||
|
instead of `HttpClient`'s default 100s. A timeout is reclassified as a
|
||||||
|
`TransientExternalSystemException`; a caller-initiated cancellation is distinguished
|
||||||
|
from a timeout and propagated as `OperationCanceledException` rather than being
|
||||||
|
swallowed as transient. Regression tests:
|
||||||
|
`Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow` and
|
||||||
|
`Call_CallerCancellation_IsNotMisreportedAsTimeout`.
|
||||||
|
|
||||||
|
Note (partial scope): the per-*system* `Timeout` field on `ExternalSystemDefinition`
|
||||||
|
remains unimplemented — adding it requires a change to `ScadaLink.Commons`, which is
|
||||||
|
outside this module's edit scope. Until that entity field exists, the configured
|
||||||
|
`DefaultHttpTimeout` is the effective per-call limit for every system. A follow-up
|
||||||
|
against the Commons module should add the `Timeout` field and have `InvokeHttpAsync`
|
||||||
|
prefer it over the default. This is a tracked follow-up, not a regression.
|
||||||
|
|
||||||
|
### ExternalSystemGateway-003 — `CachedCall` double-dispatches the HTTP request
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:84-117` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CachedCallAsync` first calls `InvokeHttpAsync` directly (line 86). On a
|
||||||
|
`TransientExternalSystemException` it then calls `_storeAndForward.EnqueueAsync(...)`
|
||||||
|
(line 109). `StoreAndForwardService.EnqueueAsync` is **not** a pure enqueue — it
|
||||||
|
"Attempts immediate delivery" by invoking the registered delivery handler
|
||||||
|
(`StoreAndForwardService.cs:128-159`). If a delivery handler for the `ExternalSystem`
|
||||||
|
category is registered (as finding 001 recommends), the HTTP request will be executed
|
||||||
|
a **second time** synchronously inside `EnqueueAsync`, immediately after the first
|
||||||
|
attempt failed. For a transient failure that is actually a slow/overloaded system,
|
||||||
|
this doubles the load and — critically — if the original request did reach the
|
||||||
|
external system, the immediate retry produces a duplicate delivery before the script
|
||||||
|
even returns, worsening the idempotency hazard the design doc explicitly warns about.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide on one dispatch path. Either (a) have `CachedCall` not pre-invoke
|
||||||
|
`InvokeHttpAsync` and instead let `EnqueueAsync`'s immediate-delivery attempt be the
|
||||||
|
single first attempt (requires the handler to exist and to surface permanent vs
|
||||||
|
transient correctly); or (b) add an enqueue-only entry point to
|
||||||
|
`StoreAndForwardService` that skips the immediate-delivery attempt, and have
|
||||||
|
`CachedCall` use it after its own first attempt. Approach (a) is cleaner and removes
|
||||||
|
the duplicated logic.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). Re-triage: this finding was already fixed in
|
||||||
|
the codebase as a side effect of the `ExternalSystemGateway-001` fix and is no longer
|
||||||
|
reproducible against the current source. `StoreAndForwardService.EnqueueAsync` gained an
|
||||||
|
`attemptImmediateDelivery` parameter (recommendation approach (b)), and
|
||||||
|
`CachedCallAsync` passes `attemptImmediateDelivery: false` after its own first HTTP
|
||||||
|
attempt — so `EnqueueAsync` buffers the message for the background retry sweep without
|
||||||
|
re-invoking the registered delivery handler, eliminating the duplicate dispatch. A
|
||||||
|
dedicated regression test, `CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler`,
|
||||||
|
was added in this module's test suite: it registers a counting delivery handler, drives
|
||||||
|
a `CachedCall` whose HTTP attempt fails transiently, and asserts the handler is invoked
|
||||||
|
zero times during enqueue. The test was verified to fail if `attemptImmediateDelivery`
|
||||||
|
is flipped back to `true`.
|
||||||
|
|
||||||
|
### ExternalSystemGateway-004 — System retry settings are not honoured for cached calls/writes
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:114-115`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:86-87` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CachedCallAsync` and `CachedWriteAsync` pass the definition's `MaxRetries` /
|
||||||
|
`RetryDelay` to `EnqueueAsync` only when they are non-default
|
||||||
|
(`MaxRetries > 0 ? ... : null`, `RetryDelay > TimeSpan.Zero ? ... : null`), otherwise
|
||||||
|
falling back to the S&F defaults. The site-side repository that supplies these
|
||||||
|
definitions, `SiteExternalSystemRepository.MapExternalSystem`
|
||||||
|
(`src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:194`), never
|
||||||
|
reads `MaxRetries`/`RetryDelay` from SQLite at all — the constructed entities always
|
||||||
|
have `MaxRetries == 0` and `RetryDelay == TimeSpan.Zero`. As a result, at sites the
|
||||||
|
per-system retry settings the design doc requires are *always* discarded and the
|
||||||
|
global S&F defaults are silently used instead. The `> 0` guard in the ESG also makes
|
||||||
|
a legitimately-configured `MaxRetries` of 0 ("never retry") indistinguishable from
|
||||||
|
"unset", so an operator cannot express "do not retry".
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Within this module, drop the `> 0` / `> Zero` guards and pass the definition values
|
||||||
|
through directly (or use nullable fields on the entity to distinguish "unset"). The
|
||||||
|
companion fix in `SiteExternalSystemRepository` to actually map the retry columns
|
||||||
|
should be tracked against the SiteRuntime module.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-005 — `HttpRequestMessage` and `HttpResponseMessage` are not disposed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:133-167` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`InvokeHttpAsync` creates an `HttpRequestMessage` (line 133) and receives an
|
||||||
|
`HttpResponseMessage` from `SendAsync` (line 155); neither is wrapped in a `using` nor
|
||||||
|
explicitly disposed. Both are `IDisposable` and own resources (the request's
|
||||||
|
`StringContent`, the response's content stream). Under the per-invocation call volume
|
||||||
|
of a busy site this produces avoidable pressure on the finalizer queue and can hold
|
||||||
|
socket/stream resources longer than necessary. The success path reads the content but
|
||||||
|
never disposes the response; the error path likewise reads `errorBody` and then throws
|
||||||
|
without disposing.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the request in `using var request = ...` and the response in
|
||||||
|
`using var response = ...` (or call `Dispose()` in a `finally`). Ensure disposal still
|
||||||
|
occurs on the exception paths.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-006 — `BuildUrl` ignores path templates and appends a trailing slash for empty paths
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:180-196` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`BuildUrl` does `baseUrl.TrimEnd('/') + "/" + path.TrimStart('/')`. When `method.Path`
|
||||||
|
is empty (a method that targets the base URL itself), this still appends a `/`,
|
||||||
|
producing `https://host/api/` which some servers treat as a different resource than
|
||||||
|
`https://host/api`. More importantly, the design doc shows method paths as templates
|
||||||
|
like `/recipes/{id}`, but `BuildUrl` performs no placeholder substitution — a `{id}`
|
||||||
|
token is sent literally in the URL and the corresponding parameter is instead appended
|
||||||
|
as a query-string entry (for GET/DELETE) or placed in the JSON body (POST/PUT). Either
|
||||||
|
the design's path-template feature is unimplemented, or the doc is stale; in the
|
||||||
|
current code a method defined as `/recipes/{id}` will never produce a correct URL.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide whether path templating is in scope. If yes, implement `{name}` substitution
|
||||||
|
from `parameters` in `BuildUrl` and exclude substituted parameters from the query
|
||||||
|
string/body. If no, update the component design doc to remove the `/recipes/{id}`
|
||||||
|
example and state that paths are literal. Also avoid appending a trailing `/` when
|
||||||
|
`path` is empty.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-007 — External error response bodies are echoed verbatim into script-visible error messages
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:167-177` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
On a non-success HTTP response, the full response body is read into `errorBody` and
|
||||||
|
embedded verbatim into the exception message (`$"HTTP {code} from {name}: {errorBody}"`),
|
||||||
|
which then flows into `ExternalCallResult.ErrorMessage` and back to the calling script,
|
||||||
|
and into Site Event Logging. An external system error page can be arbitrarily large
|
||||||
|
(an HTML stack trace, a multi-megabyte body) and may contain sensitive detail. There
|
||||||
|
is no size cap, so a hostile or misbehaving endpoint can inflate every error log entry
|
||||||
|
and error string returned to scripts. There is also no content-type check before
|
||||||
|
treating the body as text.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Truncate `errorBody` to a bounded length (e.g. 1–2 KB) before embedding it, and
|
||||||
|
consider logging the full body separately at debug level rather than returning it to
|
||||||
|
the script. Optionally only include the body when the content type is textual.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-008 — Cancellation is conflated with transient timeout failure
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ErrorClassifier.cs:24-30`, `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:157-159` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ErrorClassifier.IsTransient(Exception)` returns `true` for `TaskCanceledException`
|
||||||
|
and `OperationCanceledException`. `HttpClient.SendAsync` throws `TaskCanceledException`
|
||||||
|
both when its internal timeout elapses *and* when the supplied `CancellationToken` is
|
||||||
|
cancelled (e.g. the Script Execution Actor is stopped, or the actor system is shutting
|
||||||
|
down). Because `InvokeHttpAsync`'s `catch` filter treats all of these as transient, a
|
||||||
|
caller-initiated cancellation during a `CachedCall` will be misclassified as a
|
||||||
|
transient failure and the message will be buffered for retry — work the caller
|
||||||
|
explicitly asked to abandon. For a `Call`, a shutdown-time cancellation is reported to
|
||||||
|
the script as a "Transient error" rather than an `OperationCanceledException`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
In `InvokeHttpAsync`, check `cancellationToken.IsCancellationRequested` first and
|
||||||
|
rethrow `OperationCanceledException` (or let it propagate) before applying transient
|
||||||
|
classification. Only treat a cancellation as a timeout when the supplied token is
|
||||||
|
*not* the one that was cancelled.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
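_Unresolved._ Re-triage needed: the `ExternalSystemGateway-002` resolution states that `InvokeHttpAsync` now distinguishes caller-initiated cancellation from a timeout and propagates it as `OperationCanceledException`, which may already address this finding.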
|
||||||
|
|
||||||
|
### ExternalSystemGateway-009 — `StoreAndForwardResult` from `EnqueueAsync` is discarded; permanent failures during buffering are swallowed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:109-117` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CachedCallAsync` assigns the result of `_storeAndForward.EnqueueAsync(...)` to
|
||||||
|
`sfResult` and then never reads it — it unconditionally returns
|
||||||
|
`new ExternalCallResult(true, null, null, WasBuffered: true)`. `EnqueueAsync` can
|
||||||
|
return `Success == false` (a permanent failure encountered during its
|
||||||
|
immediate-delivery attempt — `StoreAndForwardService.cs:142`) or `Buffered == false`
|
||||||
|
(delivered immediately). In both cases the ESG still reports the call as buffered and
|
||||||
|
successful to the script. A permanent failure surfaced by the S&F immediate attempt is
|
||||||
|
therefore silently lost instead of being returned to the script as the design requires
|
||||||
|
("On permanent failure (HTTP 4xx), the error is returned synchronously").
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Inspect `sfResult`: if `Success == false` return an error `ExternalCallResult`; set
|
||||||
|
`WasBuffered` from `sfResult.Buffered` rather than hard-coding `true`. (This finding is
|
||||||
|
partly subsumed by the dispatch redesign in finding 003.)
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-010 — `GetConnectionAsync` leaks the `SqlConnection` when `OpenAsync` fails
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:48-50` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetConnectionAsync` constructs `new SqlConnection(...)` and calls `await
|
||||||
|
connection.OpenAsync(...)`. If `OpenAsync` throws (unreachable server, bad
|
||||||
|
credentials, cancellation) the just-created `SqlConnection` instance is never disposed
|
||||||
|
— the exception propagates and the local reference is lost. While an unopened
|
||||||
|
`SqlConnection` is lightweight, over many failing calls this is an avoidable leak. The
|
||||||
|
design doc says `Database.Connection()` failures return an error to the script; the
|
||||||
|
current code lets a raw `SqlException` escape, which is acceptable, but the leak is
|
||||||
|
not.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the open in a try/catch that disposes the connection before rethrowing:
|
||||||
|
`try { await connection.OpenAsync(ct); } catch { connection.Dispose(); throw; }`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-011 — Every call performs a full repository scan of all systems and methods
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:231-245`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:90-97` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ResolveSystemAndMethodAsync` calls `GetAllExternalSystemsAsync()` and then
|
||||||
|
`GetMethodsByExternalSystemIdAsync()` and filters in memory on every single call;
|
||||||
|
`ResolveConnectionAsync` calls `GetAllDatabaseConnectionsAsync()` and filters in memory
|
||||||
|
on every cached write / connection request. At sites this hits the SQLite repository,
|
||||||
|
and `SiteExternalSystemRepository` re-reads and re-parses the methods JSON each time.
|
||||||
|
For a hot script path this is unnecessary repeated I/O and allocation. Definitions only
|
||||||
|
change on deployment, so they are eminently cacheable.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add an in-memory cache of system/method/connection definitions keyed by name,
|
||||||
|
invalidated on artifact deployment. Alternatively use a name-keyed repository lookup
|
||||||
|
rather than fetch-all-then-filter.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-012 — Permanent-failure logging requirement is not met; `_logger` is injected but unused
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:24,169-177`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:22` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc states permanent failures are "Logged to Site Event Logging", but
|
||||||
|
`InvokeHttpAsync` performs no logging on the permanent-failure path. In fact the
|
||||||
|
injected `ILogger<ExternalSystemClient>` and `ILogger<DatabaseGateway>` fields are
|
||||||
|
never used at all in either class. Either the logging is expected to happen in the
|
||||||
|
caller (Script Execution Actor) — in which case the design doc is imprecise about
|
||||||
|
where — or it is missing. Separately, `IsTransient(HttpStatusCode)` treats any
|
||||||
|
non-success, non-(5xx/408/429) status as permanent without an explicit comment, which
|
||||||
|
is a reasonable default but undocumented.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `_logger.LogWarning` on the permanent-failure path (and a debug log on
|
||||||
|
transient), or clarify in the design doc that Site Event Logging capture is the
|
||||||
|
caller's responsibility and remove the unused `_logger` fields. Add a comment in
|
||||||
|
`ErrorClassifier` documenting the "default to permanent" behaviour.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ExternalSystemGateway-013 — `MaxConcurrentConnectionsPerSystem` and `DefaultHttpTimeout` options are defined but never used
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemGatewayOptions.cs:9,12`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ExternalSystemGatewayOptions.MaxConcurrentConnectionsPerSystem` (default 10) and
|
||||||
|
`DefaultHttpTimeout` (default 30s) are bound from configuration but neither is read
|
||||||
|
anywhere. `AddHttpClient()` registers the default factory with no
|
||||||
|
`ConfigurePrimaryHttpMessageHandler`/`SocketsHttpHandler` `MaxConnectionsPerServer` and
|
||||||
|
no `Timeout`, so both options have no effect. An operator setting these values gets
|
||||||
|
them silently ignored — a misleading configuration surface (`DefaultHttpTimeout` is
|
||||||
|
also referenced by finding 002).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either wire the options into a named/typed `HttpClient` registration (set
|
||||||
|
`MaxConnectionsPerServer` on the primary handler, set `Timeout`), or remove the unused
|
||||||
|
options to avoid implying behaviour that does not exist.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
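_Unresolved._ Re-triage needed: the `ExternalSystemGateway-002` resolution states that `DefaultHttpTimeout` is now enforced in `InvokeHttpAsync`, so only `MaxConcurrentConnectionsPerSystem` may remain unused.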
|
||||||
|
|
||||||
|
### ExternalSystemGateway-014 — Cached-call buffering path and `DatabaseGateway` are untested
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs:1`, (no `DatabaseGatewayTests.cs`) |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ExternalSystemClientTests` covers system/method not-found, success, transient 500 and
|
||||||
|
permanent 400 for `CallAsync`, plus `CachedCall` not-found and success. It does **not**
|
||||||
|
cover: the `CachedCall` transient-failure → S&F buffering branch (the most
|
||||||
|
behaviour-rich path, including the `_storeAndForward == null` fallback and `WasBuffered`
|
||||||
|
semantics), the `CachedCall` permanent-failure branch, connection-exception
|
||||||
|
classification (`HttpRequestException` thrown by the handler), `BuildUrl` query-string
|
||||||
|
construction, and `ApplyAuth` for the apikey/basic variants. There is **no test file
|
||||||
|
for `DatabaseGateway`** at all — `GetConnectionAsync` not-found, `CachedWriteAsync`
|
||||||
|
not-found, and the `_storeAndForward == null` guard are entirely uncovered. The
|
||||||
|
`MockHttpMessageHandler` also does not assert request URL/headers/body, so auth and
|
||||||
|
URL construction are unverified.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for the `CachedCall` transient/buffering paths (with a substituted S&F
|
||||||
|
service), `DatabaseGateway` not-found and null-S&F guards, and `BuildUrl`/`ApplyAuth`
|
||||||
|
by asserting on the captured `HttpRequestMessage` in the mock handler.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,445 @@
|
|||||||
|
# Code Review — HealthMonitoring
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.HealthMonitoring` |
|
||||||
|
| Design doc | `docs/requirements/Component-HealthMonitoring.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 10 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The HealthMonitoring module is small, readable, and broadly faithful to the design
|
||||||
|
intent: per-interval error counters with atomic read-and-reset, monotonic sequence
|
||||||
|
numbers with Unix-ms seeding to survive failover, sequence-guarded staleness
|
||||||
|
rejection, and a 60s offline timeout. However, the review surfaced two recurring
|
||||||
|
themes. First, **a documented metric is silently unimplemented** — store-and-forward
|
||||||
|
buffer depths are never populated (`SetStoreAndForwardDepths` has zero callers and a
|
||||||
|
test asserts the field is always empty), so the dashboard cannot show the buffer
|
||||||
|
depth metric the design doc requires. Second, **the central aggregator's in-memory
|
||||||
|
state model has unguarded shared mutable state**: `SiteHealthState` is a mutable
|
||||||
|
class whose fields are written by a background timer thread, by `ProcessReport`, and
|
||||||
|
by `MarkHeartbeat` with no synchronization, and the same live mutable objects are
|
||||||
|
handed straight to UI callers via `GetAllSiteStates`. The `ProcessReport` logic also
|
||||||
|
mutates shared state inside a `ConcurrentDictionary.AddOrUpdate` update delegate,
|
||||||
|
which the runtime may invoke more than once under contention. Additionally, there are
|
||||||
|
gaps around central self-report offline detection, heartbeats for not-yet-registered
|
||||||
|
sites being dropped, and missing test coverage for the central report loop,
|
||||||
|
heartbeat path, and most collector setters. None of the findings are crash-class,
|
||||||
|
but the concurrency issues are Medium/High and the missing S&F metric is a real
|
||||||
|
design-adherence gap.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | x | `MarkHeartbeat` drops heartbeats for unregistered sites (HealthMonitoring-007); central self-report has no heartbeat grace (HealthMonitoring-005). |
|
||||||
|
| 2 | Akka.NET conventions | x | Module itself contains no actors (transport abstracted via `IHealthReportTransport`); `AddHealthMonitoringActors` is a dead placeholder (HealthMonitoring-011). Actor-side wiring lives in Communication and is out of scope. |
|
||||||
|
| 3 | Concurrency & thread safety | x | Unguarded mutable `SiteHealthState` (HealthMonitoring-002); mutation inside `AddOrUpdate` delegate (HealthMonitoring-003); `GetAllSiteStates` leaks live mutable references (HealthMonitoring-008). Collector counters correctly use `Interlocked`. |
|
||||||
|
| 4 | Error handling & resilience | x | `HealthReportSender` silently swallows inner failures with bare `catch {}` (HealthMonitoring-010); top-level loop error handling is sound. |
|
||||||
|
| 5 | Security | x | No issues found. Module handles only numeric/string operational metrics, no secrets, no external input parsing, no auth surface. |
|
||||||
|
| 6 | Performance & resource management | x | `PeriodicTimer` instances correctly disposed via `using`. Dictionary snapshots per report are acceptable at the documented scale. No issues found. |
|
||||||
|
| 7 | Design-document adherence | x | Store-and-forward buffer depth metric unimplemented (HealthMonitoring-001); sequence seeding deviates from doc's "starting at 1" wording (HealthMonitoring-006). |
|
||||||
|
| 8 | Code organization & conventions | x | Options class correctly owned by the component; POCO/messages in Commons. Dead placeholder method noted (HealthMonitoring-011). |
|
||||||
|
| 9 | Testing coverage | x | No tests for `CentralHealthReportLoop`, `MarkHeartbeat`, offline-via-heartbeat, replica idempotency, or most collector setters (HealthMonitoring-009). |
|
||||||
|
| 10 | Documentation & comments | x | Heartbeat interval is described inconsistently (~2s vs ~5s) across XML docs (HealthMonitoring-004); `LatestReport = null!` misrepresents the contract (HealthMonitoring-012). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### HealthMonitoring-001 — Store-and-forward buffer depth metric is never populated
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs:104`, `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:79` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`Component-HealthMonitoring.md` lists "Store-and-forward buffer depth" (pending
|
||||||
|
messages by category) as a required monitored metric. `SiteHealthCollector` exposes
|
||||||
|
`SetStoreAndForwardDepths(...)` to receive it, but a codebase-wide search shows the
|
||||||
|
method has **no callers** — `_sfBufferDepths` always remains the empty dictionary it
|
||||||
|
is initialized to. `HealthReportSender` queries `GetParkedMessageCountAsync()` and
|
||||||
|
sets `ParkedMessageCount`, but parked count is a distinct metric from per-category
|
||||||
|
buffer depth. The test `SiteHealthCollectorTests.StoreAndForwardBufferDepths_IsEmptyPlaceholder`
|
||||||
|
even codifies the unimplemented state as expected behaviour. The result is that the
|
||||||
|
central dashboard cannot display buffer depth, a documented triage metric.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wire `SetStoreAndForwardDepths` into `HealthReportSender.ExecuteAsync` (alongside the
|
||||||
|
existing parked-count call) using the S&F engine's per-category depth API, or, if the
|
||||||
|
metric is intentionally deferred, record that decision in the design doc and remove
|
||||||
|
the dead setter. Update the placeholder test accordingly once implemented.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). `HealthReportSender.ExecuteAsync` now
|
||||||
|
queries the existing public `StoreAndForwardStorage.GetBufferDepthByCategoryAsync()`
|
||||||
|
API alongside the parked-count call and feeds the per-category depths into
|
||||||
|
`SiteHealthCollector.SetStoreAndForwardDepths` (category enum names as keys), so the
|
||||||
|
documented store-and-forward buffer depth metric is populated in every emitted
|
||||||
|
report. Regression test `HealthReportSenderTests.ReportsIncludeStoreAndForwardBufferDepthsFromStorage`
|
||||||
|
verifies populated per-category depths. The obsolete placeholder test
|
||||||
|
`SiteHealthCollectorTests.StoreAndForwardBufferDepths_IsEmptyPlaceholder` continues
|
||||||
|
to pass — it only exercises the collector with no setter call and still correctly
|
||||||
|
asserts the empty default; it was left in place as the collector-level default-state
|
||||||
|
test. No StoreAndForward source was modified (existing public API only).
|
||||||
|
|
||||||
|
### HealthMonitoring-002 — `SiteHealthState` mutable fields written from multiple threads without synchronization
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthState.cs:11`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:86`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:137` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SiteHealthState` is a plain mutable class. Its fields (`LatestReport`,
|
||||||
|
`LastReportReceivedAt`, `LastHeartbeatAt`, `LastSequenceNumber`, `IsOnline`) are
|
||||||
|
mutated from at least three concurrent contexts: `ProcessReport` (caller thread —
|
||||||
|
ClusterClient/PubSub message handlers), `MarkHeartbeat` (caller thread — heartbeat
|
||||||
|
handler), and `CheckForOfflineSites` (the `BackgroundService` timer thread). The
|
||||||
|
`ConcurrentDictionary` only protects the dictionary structure, not the objects it
|
||||||
|
stores. A heartbeat update and the offline-check can interleave on the same
|
||||||
|
`SiteHealthState` instance, and reads/writes of `DateTimeOffset` (a 16-byte struct)
|
||||||
|
and `long` fields are not guaranteed atomic on all platforms — producing torn reads
|
||||||
|
and lost updates of `IsOnline`/`LastHeartbeatAt`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Make state transitions atomic: either guard all reads/writes of a `SiteHealthState`
|
||||||
|
with a per-site lock, or replace `SiteHealthState` with an immutable record updated
|
||||||
|
via `ConcurrentDictionary` compare-and-swap (`TryUpdate`) so every transition is
|
||||||
|
a single atomic reference swap.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). `SiteHealthState` is now a `sealed record`
|
||||||
|
with `init`-only properties. `CentralHealthAggregator.ProcessReport`,
|
||||||
|
`MarkHeartbeat`, and `CheckForOfflineSites` were rewritten to perform every state
|
||||||
|
transition as an atomic compare-and-swap (`TryAdd`/`TryUpdate`) producing a new
|
||||||
|
record instance — no field of a stored state is ever mutated in place. `ProcessReport`
|
||||||
|
uses an explicit CAS retry loop instead of the `AddOrUpdate` update delegate so the
|
||||||
|
sequence-number guard and the field writes are evaluated against the value actually
|
||||||
|
installed (this also closes the root cause behind HealthMonitoring-003). Reads via
|
||||||
|
`GetAllSiteStates`/`GetSiteState` now hand out immutable snapshots, so a concurrent
|
||||||
|
reader can never observe a torn or half-applied state. `LatestReport` was changed
|
||||||
|
from `SiteHealthReport` (`null!`) to `SiteHealthReport?`, making the contract honest;
|
||||||
|
all existing consumers (CentralUI, integration/perf tests) already null-checked it
|
||||||
|
and continue to build clean. Regression test
|
||||||
|
`CentralHealthAggregatorTests.ProcessReport_ConcurrentUpdates_NeverLoseSequenceOrTearState`
|
||||||
|
exercises concurrent report/heartbeat/read threads and asserts snapshot consistency
|
||||||
|
and no lost updates.
|
||||||
|
|
||||||
|
### HealthMonitoring-003 — Shared state mutated inside `ConcurrentDictionary.AddOrUpdate` update delegate
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:55-78` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The update delegate passed to `AddOrUpdate` mutates the `existing` object in place
|
||||||
|
(`existing.LatestReport = report; existing.IsOnline = true; ...`). `AddOrUpdate`'s
|
||||||
|
contract explicitly allows the update delegate to be invoked **more than once** under
|
||||||
|
contention (when the CAS that installs the result loses a race and is retried). Each
|
||||||
|
invocation mutates the shared object, so a concurrent report for the same site can
|
||||||
|
observe a half-applied update, and the multi-field assignment is not atomic with
|
||||||
|
respect to readers in `GetAllSiteStates`/`CheckForOfflineSites`. The intended
|
||||||
|
"only replace if sequence is higher" guard can also be subverted because the
|
||||||
|
sequence comparison and the field writes are not a single atomic step.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Have the update delegate return a **new** `SiteHealthState` (record `with` copy)
|
||||||
|
rather than mutating `existing`, and treat the dictionary value as immutable.
|
||||||
|
Combined with HealthMonitoring-002, this makes every state transition an atomic
|
||||||
|
reference swap with no observable intermediate state.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-004 — Inconsistent heartbeat interval described across XML docs
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:146-148`, `src/ScadaLink.HealthMonitoring/SiteHealthState.cs:21`, `src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs:16` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The heartbeat cadence that offline detection relies on is documented inconsistently.
|
||||||
|
`CheckForOfflineSites` says "heartbeats arrive every ~5s"; `SiteHealthState.LastHeartbeatAt`
|
||||||
|
says "~5s heartbeat"; but `ICentralHealthAggregator.MarkHeartbeat` says "~2s
|
||||||
|
heartbeats are arriving". The actual cadence is set elsewhere (Cluster Infrastructure /
|
||||||
|
`SiteCommunicationActor`). Readers cannot reason about whether a 60s offline timeout
|
||||||
|
gives the intended grace without a single authoritative number.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Pick the correct interval (verify against the heartbeat scheduler in
|
||||||
|
`SiteCommunicationActor`/Cluster Infrastructure) and use it consistently in all three
|
||||||
|
comments, ideally referencing the owning component rather than restating a magic number.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-005 — Central self-report site can flap offline; no heartbeat grace like real sites
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:48-81`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:149` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CheckForOfflineSites` decides offline status purely from `LastHeartbeatAt`, and for
|
||||||
|
real sites that field is kept fresh by frequent (~2-5s) heartbeats so the 60s timeout
|
||||||
|
only fires on genuine total loss. The synthetic `central` site, however, has no
|
||||||
|
heartbeat source — `LastHeartbeatAt` is only bumped by `ProcessReport` from the
|
||||||
|
30s `CentralHealthReportLoop`. The loop also only runs on the cluster leader and
|
||||||
|
silently skips a cycle on any exception. Consequently, a single skipped/late central
|
||||||
|
self-report (leader GC pause, brief stall, mid-failover before the new leader's loop
|
||||||
|
spins up) leaves `central` with no signal for >60s and it is marked offline even
|
||||||
|
though the central cluster is healthy. The central card thus has no equivalent of
|
||||||
|
the "one missed report grace" the design doc grants real sites.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either feed `central` a heartbeat equivalent (e.g. have `MarkHeartbeat` called for
|
||||||
|
`CentralSiteId` on a fast timer independent of the leader-only report loop), or apply
|
||||||
|
a longer/distinct offline timeout to the `central` keyspace entry, and ensure the new
|
||||||
|
leader starts the report loop promptly on failover.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-006 — Sequence seeding contradicts the doc's "starting at 1" wording and is untestable
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:28`, `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:32` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The `HealthReportSender` class XML summary states "Sequence numbers are monotonic,
|
||||||
|
starting at 1, and reset on service restart." The implementation instead seeds
|
||||||
|
`_sequenceNumber` with `DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()` so the first
|
||||||
|
emitted sequence is a large epoch value, specifically to keep ordering correct across
|
||||||
|
failover. The summary is therefore stale and contradicts the code. Separately, the
|
||||||
|
seed reads `DateTimeOffset.UtcNow` directly at field initialization rather than
|
||||||
|
through an injected `TimeProvider` (which `CentralHealthAggregator` already uses),
|
||||||
|
making the seeding logic impossible to unit-test deterministically and dependent on
|
||||||
|
node wall-clock agreement — if one node's clock lags, its post-failover reports can
|
||||||
|
be silently rejected as stale by the aggregator.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Fix the `HealthReportSender` XML summary to describe the actual Unix-ms seeding
|
||||||
|
strategy, and inject `TimeProvider` for the seed so the behaviour is testable and the
|
||||||
|
clock dependency is explicit.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-007 — Heartbeats for not-yet-registered sites are silently dropped
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:86-99` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`MarkHeartbeat` returns immediately if the site is not already in `_siteStates`
|
||||||
|
("registration only happens on report"). Central health state is in-memory only and
|
||||||
|
not persisted. After a central restart or failover the aggregator starts empty, so
|
||||||
|
for up to one full report interval (default 30s) every site emits only heartbeats
|
||||||
|
that are all discarded — the site is reported as *unknown* (absent from
|
||||||
|
`GetAllSiteStates`) rather than *online*, even though heartbeats prove it is
|
||||||
|
reachable. This is a visible dashboard regression precisely during the failover
|
||||||
|
window, which is when operators most need accurate status.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Allow `MarkHeartbeat` to register a minimal `SiteHealthState` (online, no
|
||||||
|
`LatestReport` yet, with a UI-visible "awaiting first report" indication) when a
|
||||||
|
heartbeat arrives for an unknown site, so reachable sites show online immediately
|
||||||
|
after a central restart.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-008 — `GetAllSiteStates` / `GetSiteState` leak live mutable state objects to callers
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:104-116` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetAllSiteStates` copies the dictionary but the copy still holds references to the
|
||||||
|
same live mutable `SiteHealthState` instances; `GetSiteState` returns the live
|
||||||
|
instance directly. UI consumers (Blazor Server / SignalR circuits) read these objects
|
||||||
|
on their own threads while the aggregator's background timer and report handlers
|
||||||
|
concurrently mutate the very same instances (see HealthMonitoring-002). A UI render
|
||||||
|
can observe a `SiteHealthState` with, e.g., `IsOnline == true` but a `LatestReport`
|
||||||
|
from a different update, or a torn `DateTimeOffset`. Callers could also mutate the
|
||||||
|
shared state, corrupting aggregator state.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Return immutable snapshots: convert `SiteHealthState` to a record (per
|
||||||
|
HealthMonitoring-002/003) so handing out the reference is safe, or deep-copy each
|
||||||
|
state into an immutable DTO before returning.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-009 — Missing test coverage for central report loop, heartbeat path, replication, and collector setters
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.HealthMonitoring.Tests/` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Several behaviours have no automated coverage:
|
||||||
|
- `CentralHealthReportLoop` — leader-only gating (`SelfIsPrimary`), self-report
|
||||||
|
generation, sequence assignment: no test file at all.
|
||||||
|
- `CentralHealthAggregator.MarkHeartbeat` — keeping a site online between reports,
|
||||||
|
online recovery via heartbeat, and the unknown-site drop behaviour
|
||||||
|
(HealthMonitoring-007): untested.
|
||||||
|
- Offline detection driven by `LastHeartbeatAt` vs `LastReportReceivedAt` — the
|
||||||
|
existing offline tests only advance time after a report, never exercising the
|
||||||
|
heartbeat-keeps-alive path the design depends on.
|
||||||
|
- `SiteHealthCollector` — `SetClusterNodes`, `SetInstanceCounts`, `SetParkedMessageCount`,
|
||||||
|
`SetNodeHostname`, `SetActiveNode`/`NodeRole`, `UpdateTagQuality`,
|
||||||
|
`UpdateConnectionEndpoint`: no test verifies that these are reflected in the report.
|
||||||
|
- `SiteHealthReportReplica` idempotency under double delivery: untested.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for the central report loop (with a fake `IClusterNodeProvider`), the
|
||||||
|
heartbeat-keeps-online and unknown-site heartbeat paths, and the remaining collector
|
||||||
|
setters' presence in `CollectReport` output.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-010 — `HealthReportSender` silently swallows inner failures with bare `catch {}`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:70-87` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The cluster-nodes update and parked-message-count query are each wrapped in
|
||||||
|
`try { ... } catch { /* Non-fatal */ }` with no logging. A persistent failure (e.g.
|
||||||
|
the S&F SQLite store is permanently broken, or `GetClusterNodes()` always throws)
|
||||||
|
is then completely invisible — every report silently ships with stale cluster nodes
|
||||||
|
and a parked count of 0, with nothing in the logs to explain the wrong dashboard
|
||||||
|
values. Bare `catch` with no exception variable also catches `OperationCanceledException`
|
||||||
|
and would mask shutdown signalling if the awaited call observed the token.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Catch a specific exception type (or at least `Exception ex`) and `LogWarning`/`LogDebug`
|
||||||
|
the failure so persistent degradation is diagnosable; avoid swallowing
|
||||||
|
`OperationCanceledException`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-011 — `AddHealthMonitoringActors` is a dead no-op placeholder
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/ServiceCollectionExtensions.cs:42-46` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AddHealthMonitoringActors` does nothing but `return services` with a "Placeholder for
|
||||||
|
Phase 4+" comment. A public extension method that silently no-ops is a trap: a caller
|
||||||
|
who registers it will believe actor wiring is in place. No caller currently invokes it.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Remove the method until it has real behaviour, or throw `NotImplementedException` so
|
||||||
|
accidental use fails loudly. If the actor model for this component is genuinely
|
||||||
|
planned, track it in the design doc instead of a half-method.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### HealthMonitoring-012 — `SiteHealthState.LatestReport` initialized to `null!`, misrepresenting the contract
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthState.cs:11` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LatestReport` is declared `SiteHealthReport LatestReport { get; set; } = null!;`,
|
||||||
|
suppressing nullability. Today every code path that creates a `SiteHealthState` (only
|
||||||
|
`ProcessReport`) assigns `LatestReport`, so it is never actually null — but the
|
||||||
|
`null!` initializer suppresses the nullability check instead of enforcing the
|
||||||
|
invariant. If HealthMonitoring-007 is addressed by registering state from a heartbeat
|
||||||
|
(no report yet), this becomes a live `NullReferenceException` risk for UI code that
|
||||||
|
dereferences `LatestReport`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either make `LatestReport` `required` (matching how it is genuinely always set today)
|
||||||
|
or make it properly nullable `SiteHealthReport?` and have consumers handle the
|
||||||
|
"registered, no report yet" case explicitly — consistent with whatever is decided
|
||||||
|
for HealthMonitoring-007.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,407 @@
|
|||||||
|
# Code Review — Host
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.Host` |
|
||||||
|
| Design doc | `docs/requirements/Component-Host.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 10 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The Host module is the composition root for the entire ScadaLink system: a single
|
||||||
|
binary whose behaviour (`Central` vs `Site`) is driven entirely by configuration. The
|
||||||
|
implementation is generally faithful to `Component-Host.md` — startup validation,
|
||||||
|
role-based registration, Serilog enrichment, Windows Service support, dead-letter
|
||||||
|
monitoring, CoordinatedShutdown, and gRPC hosting on site nodes are all present and
|
||||||
|
backed by a solid test suite (`tests/ScadaLink.Host.Tests`).
|
||||||
|
|
||||||
|
The most significant problem is the readiness endpoint: `/health/ready` runs **all**
|
||||||
|
registered health checks, including the leader-only `active-node` check, so a fully
|
||||||
|
operational *standby* central node permanently reports `503` on `/health/ready` —
|
||||||
|
directly contradicting REQ-HOST-4a, which defines readiness as cluster membership +
|
||||||
|
DB connectivity (not leadership). Several other findings concern configuration that
|
||||||
|
is validated-but-never-consumed (`MachineDataDb`), design-doc drift (Akka.Persistence
|
||||||
|
is required by REQ-HOST-6 but the system uses no persistent actors), an incorrect
|
||||||
|
seed-node entry in the shipped site config, blocking sync-over-async during startup,
|
||||||
|
and unguarded string interpolation when building HOCON. None are crash/data-loss
|
||||||
|
class, but the readiness bug is High because it breaks load-balancer behaviour with
|
||||||
|
no safe workaround.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | `/health/ready` includes the leader-only check (Host-001); site seed-node config points at the gRPC port (Host-004). |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | CoordinatedShutdown, receptionist registration, singleton scoping all correct. HOCON built by raw string interpolation (Host-006); `StartAsync` returns before actors are confirmed running (Host-009). |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Blocking `GetAwaiter().GetResult()` on a hosted-service startup thread (Host-005). `DeadLetterMonitorActor` state is actor-confined — no issues. |
|
||||||
|
| 4 | Error handling & resilience | ☑ | Top-level try/catch logs fatal and rethrows. No retry around DB migration / readiness preconditions (Host-010). |
|
||||||
|
| 5 | Security | ☑ | Plaintext DB password, LDAP service-account password and dev JWT key checked into `appsettings.Central.json` (Host-003). |
|
||||||
|
| 6 | Performance & resource management | ☑ | No undisposed resources. Inbound API script compilation is a synchronous startup loop — acceptable. |
|
||||||
|
| 7 | Design-document adherence | ☑ | REQ-HOST-6 mandates Akka.Persistence config but none exists and no persistent actors exist — doc is stale (Host-002). REQ-HOST-4 GrpcPort-≠-RemotingPort rule not enforced (Host-007). |
|
||||||
|
| 8 | Code organization & conventions | ☑ | `MachineDataDb` validated/declared but never consumed (Host-008). `LoggingOptions.MinimumLevel` is dead (Host-011). |
|
||||||
|
| 9 | Testing coverage | ☑ | Strong suite; no test asserts `/health/ready` excludes `active-node`, which is why Host-001 slipped through (noted in Host-001). |
|
||||||
|
| 10 | Documentation & comments | ☑ | Comments are accurate. REQ-HOST-6 in the design doc is the main stale-doc item (Host-002). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### Host-001 — `/health/ready` includes the leader-only `active-node` check
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.Host/Program.cs:135-145` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`/health/ready` is mapped with `MapHealthChecks("/health/ready", ...)` and **no
|
||||||
|
`Predicate`**, so it executes every registered check: `database`, `akka-cluster`
|
||||||
|
*and* `active-node`. `ActiveNodeHealthCheck` (`Health/ActiveNodeHealthCheck.cs:38`)
|
||||||
|
returns `Unhealthy` on any node that is not the cluster leader. As a result a
|
||||||
|
standby central node that is fully operational (cluster member `Up`, database
|
||||||
|
reachable) still returns `503` on `/health/ready`. This contradicts REQ-HOST-4a,
|
||||||
|
which defines readiness as cluster membership + DB connectivity + singletons —
|
||||||
|
explicitly *not* leadership. `/health/active` is the endpoint intended to report
|
||||||
|
leadership. A load balancer using `/health/ready` to decide whether a node may
|
||||||
|
serve traffic will permanently treat the standby as unready, defeating failover
|
||||||
|
readiness. No test covers this: `HealthCheckTests.HealthReady_Endpoint_ReturnsResponse`
|
||||||
|
only asserts a response is returned, not the standby semantics.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a `Predicate` to the `/health/ready` mapping that excludes the `active-node`
|
||||||
|
check, e.g. `Predicate = check => check.Name != "active-node"` (or tag the readiness
|
||||||
|
checks and filter by tag). Add a regression test asserting a non-leader node returns
|
||||||
|
`200` on `/health/ready`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against
|
||||||
|
`Program.cs`: the `/health/ready` mapping had no `Predicate`, so it executed all
|
||||||
|
three registered checks including the leader-only `active-node` check, while
|
||||||
|
`ActiveNodeHealthCheck` returns `Unhealthy` on any non-leader node — making a fully
|
||||||
|
operational standby central node permanently report `503`. Fix: added
|
||||||
|
`Predicate = check => check.Name != "active-node"` to the `/health/ready`
|
||||||
|
`HealthCheckOptions`, so readiness now reflects cluster membership + DB connectivity
|
||||||
|
only (REQ-HOST-4a); leadership remains reported solely by `/health/active`.
|
||||||
|
Regression test `HealthCheckTests.HealthReady_Endpoint_ExcludesActiveNodeCheck`
|
||||||
|
asserts the `active-node` check name does not appear in the `/health/ready`
|
||||||
|
response body; it failed before the fix and passes after. Full Host suite green
|
||||||
|
(156 passed).
|
||||||
|
|
||||||
|
### Host-002 — Akka.Persistence required by REQ-HOST-6 is not configured and not used
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:70-108` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
REQ-HOST-6 states the Host "must configure the Akka.NET actor system using
|
||||||
|
Akka.Hosting with ... **Persistence**: Configured with the appropriate journal and
|
||||||
|
snapshot store (SQL for central, SQLite for site)." The HOCON built in
|
||||||
|
`AkkaHostedService.StartAsync` contains no `akka.persistence` section, no journal and
|
||||||
|
no snapshot-store plugin, and `ScadaLink.Host.csproj` references neither
|
||||||
|
`Akka.Persistence.Hosting` nor any persistence plugin (the design doc Dependencies
|
||||||
|
list `Akka.Persistence.Hosting`). A repo-wide search finds **no** `PersistentActor` /
|
||||||
|
`ReceivePersistentActor` subclasses — the system deliberately uses custom SQLite
|
||||||
|
storage services instead. The code is internally consistent, but the design document
|
||||||
|
is stale: it mandates a subsystem that does not exist. This is a documented-vs-actual
|
||||||
|
drift that will mislead future maintainers and any audit against REQ-HOST-6.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Update `Component-Host.md` REQ-HOST-6 and the Dependencies list to remove the
|
||||||
|
Akka.Persistence requirement (or explicitly state persistence is provided by
|
||||||
|
component-owned SQLite storage, not Akka.Persistence). If persistence *is* intended,
|
||||||
|
add the plugin packages and HOCON. Either way, code and doc must agree.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-003 — Secrets committed in plaintext in `appsettings.Central.json`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/appsettings.Central.json:20-31` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`appsettings.Central.json` contains real-looking secrets in plaintext, checked into
|
||||||
|
source control: SQL Server passwords in the `ConfigurationDb` / `MachineDataDb`
|
||||||
|
connection strings (`Password=ScadaLink_Dev1#`), an LDAP service-account password
|
||||||
|
(`LdapServiceAccountPassword: "password"`), and a JWT signing key
|
||||||
|
(`JwtSigningKey: "scadalink-dev-jwt-signing-key-..."`). Even though these are
|
||||||
|
intended as development defaults, shipping them in the default config invites them
|
||||||
|
being reused verbatim in production, and a committed JWT signing key allows anyone
|
||||||
|
with repo access to forge session tokens. `TrustServerCertificate=true` additionally
|
||||||
|
disables TLS validation for the SQL connection.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Move all secrets out of committed `appsettings*.json` into environment variables,
|
||||||
|
user-secrets, or a secret store. Keep only non-sensitive structural defaults in the
|
||||||
|
file and document the required environment variables. At minimum add a clear comment
|
||||||
|
that these values are dev-only and must be overridden, and rotate the JWT key per
|
||||||
|
environment.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-004 — Site seed-node list points at the gRPC port, not a remoting port
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/appsettings.Site.json:10-19` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The shipped site config sets `Node:RemotingPort = 8082` and `Node:GrpcPort = 8083`,
|
||||||
|
but `Cluster:SeedNodes` is `["akka.tcp://scadalink@localhost:8082",
|
||||||
|
"akka.tcp://scadalink@localhost:8083"]`. The second seed node targets `8083`, which
|
||||||
|
is the Kestrel HTTP/2 gRPC port — not an Akka remoting endpoint. A node attempting to
|
||||||
|
join via that seed will try to establish an Akka.Remote TCP association against the
|
||||||
|
gRPC listener and fail. `StartupValidator` only checks that ≥2 seed nodes exist
|
||||||
|
(`StartupValidator.cs:54-56`), so this misconfiguration passes validation silently.
|
||||||
|
For the single-node dev site it is harmless (the first seed succeeds), but it is an
|
||||||
|
incorrect example that will be copied into multi-node site configs.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Correct the site seed-node list to reference the two site nodes' *remoting* ports
|
||||||
|
(e.g. `8082` and `8084`), never the gRPC port. Consider extending `StartupValidator`
|
||||||
|
to reject a seed node whose port equals this node's `GrpcPort`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-005 — Blocking sync-over-async (`GetAwaiter().GetResult()`) inside `StartAsync`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:345` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`RegisterSiteActors` calls `storeAndForwardService.StartAsync().GetAwaiter().GetResult()`
|
||||||
|
synchronously, blocking inside the `IHostedService.StartAsync` path. `StartAsync` is
|
||||||
|
itself declared synchronous (returns `Task.CompletedTask`), so the work cannot be
|
||||||
|
awaited cleanly. Blocking on async work risks thread-pool starvation during startup
|
||||||
|
and, if the awaited operation captures a synchronization context, deadlock. It also
|
||||||
|
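passes no `CancellationToken` to the call. (Unlike `.Result`/`.Wait()`, `GetAwaiter().GetResult()` rethrows the original exception rather than wrapping it in an `AggregateException`.)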
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Make `AkkaHostedService.StartAsync` genuinely `async` and `await
|
||||||
|
storeAndForwardService.StartAsync(cancellationToken)`. Propagate the
|
||||||
|
`CancellationToken` and let exceptions surface as the original type.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-006 — HOCON assembled by unescaped string interpolation
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:70-108` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The Akka HOCON is built with an interpolated string that injects
|
||||||
|
`_nodeOptions.NodeHostname`, `_clusterOptions.SeedNodes`, the computed roles, and
|
||||||
|
`SplitBrainResolverStrategy` directly into the configuration text. Values are not
|
||||||
|
escaped. A hostname or seed-node string containing a quote, backslash, brace, or
|
||||||
|
comment sequence would corrupt the HOCON and produce a confusing parse error far from
|
||||||
|
the real cause; `SplitBrainResolverStrategy` is interpolated without quoting, so a
|
||||||
|
value with whitespace breaks the document. Building cluster configuration from raw
|
||||||
|
string concatenation is also harder to maintain than the typed Akka.Hosting builder
|
||||||
|
the design doc (REQ-HOST-6) actually calls for ("via Akka.Hosting").
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Prefer the `Akka.Hosting` `AddAkka(...)` builder with strongly-typed `WithRemoting`,
|
||||||
|
`WithClustering`, and split-brain-resolver configuration instead of hand-built HOCON.
|
||||||
|
If HOCON must be retained, validate/escape interpolated values (especially hostname
|
||||||
|
and seed nodes) before substitution.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-007 — REQ-HOST-4 rule "GrpcPort ≠ RemotingPort" is not enforced
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/StartupValidator.cs:43-47` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
REQ-HOST-4 requires: "Site nodes must have `GrpcPort` in valid port range (1–65535)
|
||||||
|
**and different from `RemotingPort`**." `StartupValidator` validates the GrpcPort
|
||||||
|
range but never compares it to `RemotingPort`. A site config that sets both ports to
|
||||||
|
the same value passes validation and then fails opaquely at runtime when Kestrel and
|
||||||
|
Akka.Remote both try to bind the port. The GrpcPort range check is also skipped
|
||||||
|
entirely when the key is absent (`grpcPortStr != null`), relying on the
|
||||||
|
`NodeOptions` default of 8083 — acceptable, but the `GrpcPort` ≠ `RemotingPort` check is the missing
|
||||||
|
piece.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a check in the `role == "Site"` block: if `GrpcPort` (resolved, including the
|
||||||
|
8083 default) equals `RemotingPort`, add an error
|
||||||
|
`"ScadaLink:Node:GrpcPort must differ from RemotingPort"`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-008 — `MachineDataDb` is validated and declared but never consumed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/StartupValidator.cs:33-34`, `src/ScadaLink.Host/DatabaseOptions.cs:6` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`StartupValidator` requires a non-empty `ScadaLink:Database:MachineDataDb` connection
|
||||||
|
string for Central nodes, and `DatabaseOptions` exposes a `MachineDataDb` property,
|
||||||
|
but a repo-wide search shows the value is never read anywhere outside the Host module
|
||||||
|
— only `ConfigurationDb` is passed to `AddConfigurationDatabase`
|
||||||
|
(`Program.cs:83-85`). The Host therefore fails startup if `MachineDataDb` is missing
|
||||||
|
even though nothing uses it. This is either dead configuration that should be removed
|
||||||
|
or a missing wiring (a machine-data DbContext that was never registered).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Determine whether a machine-data store is actually required. If yes, wire it into the
|
||||||
|
relevant component's DI registration. If no, remove the `MachineDataDb` validation
|
||||||
|
rule, the `DatabaseOptions` property, and the key from `appsettings.Central.json`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-009 — `StartAsync` reports success before role actors are confirmed running
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:127-141` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`StartAsync` creates actors with `ActorOf` (a fire-and-forget operation — the actor's
|
||||||
|
`PreStart` runs asynchronously on its own thread) and then returns
|
||||||
|
`Task.CompletedTask`. For site nodes, `grpcServer.SetReady(_actorSystem)` is called
|
||||||
|
synchronously at the end of `RegisterSiteActors`, marking the gRPC server ready even
|
||||||
|
though `SiteCommunicationActor`, the deployment-manager singleton, and the
|
||||||
|
`ClusterClient` may not yet have completed their `PreStart`/initial-contact handshake.
|
||||||
|
REQ-HOST-7 requires "Actor system and SiteStreamManager ... initialized before gRPC
|
||||||
|
begins accepting connections" — `SiteStreamManager.Initialize` has completed by the time `SetReady` is called,
|
||||||
|
but the broader actor graph is not. The window is small and the gRPC server still
|
||||||
|
rejects streams until `SetReady`, so impact is limited, but readiness is being
|
||||||
|
asserted optimistically.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
If strict ordering matters, gate `SetReady` on confirmation that
|
||||||
|
`SiteCommunicationActor` is fully initialized (e.g. an `Ask` round-trip or a
|
||||||
|
readiness message), or document explicitly that gRPC readiness only guarantees the
|
||||||
|
actor system exists, not that the cluster handshake has completed.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-010 — No retry/backoff around startup preconditions (DB migration, readiness)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/Program.cs:112-125` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
On Central startup the Host opens a DI scope and calls
|
||||||
|
`MigrationHelper.ApplyOrValidateMigrationsAsync` directly. If the SQL Server is not
|
||||||
|
yet reachable (common in container orchestration where the DB and app start
|
||||||
|
together), the call throws, the top-level `catch` logs `Fatal`, and the process
|
||||||
|
exits. There is no bounded retry/backoff to tolerate a database that is briefly
|
||||||
|
unavailable at boot. The design intent (REQ-HOST-4a, readiness gating, `503` until
|
||||||
|
ready) is about *serving traffic*, but the migration step happens before the host
|
||||||
|
even runs and has no such tolerance.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the migration/validation step in a bounded retry with exponential backoff (e.g.
|
||||||
|
Polly), or move schema apply behind the readiness gate so the process stays up and
|
||||||
|
reports `503` until the database becomes reachable.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Host-011 — `LoggingOptions.MinimumLevel` is dead configuration
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/LoggingOptions.cs:5`, `src/ScadaLink.Host/Program.cs:42-50` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LoggingOptions` exposes a `MinimumLevel` property bound from `ScadaLink:Logging`
|
||||||
|
(`SiteServiceRegistration.BindSharedOptions`), and both `appsettings.Central.json`
|
||||||
|
and `appsettings.Site.json` set `"Logging": { "MinimumLevel": "Information" }`.
|
||||||
|
However Serilog is configured purely via `ReadFrom.Configuration(configuration)`,
|
||||||
|
which reads the standard `Serilog` section — not `ScadaLink:Logging`. The
|
||||||
|
`LoggingOptions.MinimumLevel` value is never read by any code, so changing it has no
|
||||||
|
effect. This is misleading: an operator editing `ScadaLink:Logging:MinimumLevel`
|
||||||
|
expecting a log-level change will see nothing happen.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either consume `LoggingOptions.MinimumLevel` when configuring the Serilog
|
||||||
|
`LoggerConfiguration` (e.g. set `MinimumLevel.Is(...)` from it), or remove the option
|
||||||
|
class and the `ScadaLink:Logging` sections and rely solely on the `Serilog`
|
||||||
|
configuration section. Keep one mechanism, not two.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,442 @@
|
|||||||
|
# Code Review — InboundAPI
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.InboundAPI` |
|
||||||
|
| Design doc | `docs/requirements/Component-InboundAPI.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 13 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The InboundAPI module is small (8 source files) and the happy-path flow — extract
|
||||||
|
key, validate, deserialize parameters, execute script, serialize result — is clean
|
||||||
|
and readable. However the review surfaced several real problems concentrated in two
|
||||||
|
themes: **concurrency** and **security**. The `InboundScriptExecutor` is a singleton
|
||||||
|
that mutates a plain `Dictionary` from concurrent ASP.NET request threads with no
|
||||||
|
synchronization, which can corrupt the handler cache or crash the process under load.
|
||||||
|
On the security side, API-key comparison is a non-constant-time database string
|
||||||
|
match (timing oracle), compiled scripts run with no enforcement of the documented
|
||||||
|
script trust model (forbidden APIs such as `System.IO`/`Process`/`Reflection` are
|
||||||
|
fully reachable), there is no request-body size limit, and the executor's catch-all
|
||||||
|
swallows `OperationCanceledException` from genuine client disconnects as a "timeout".
|
||||||
|
Design-doc adherence is also incomplete: the `Database.Connection()` script API
|
||||||
|
described in the design doc is entirely absent from `InboundScriptContext`, and the
|
||||||
|
endpoint never enforces that the API is central-only. Testing covers the validators
|
||||||
|
well but there is no coverage of the HTTP endpoint, concurrency, or recompilation.
|
||||||
|
None of the findings are data-loss-class, but the concurrency and trust-model issues
|
||||||
|
are High severity and should be addressed before production use.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | `CoerceValue` returns `null` for legitimately-null/`String` values indistinguishably; parameter-definition edge cases noted. |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | Module is ASP.NET-hosted, no actors of its own; routes to actors via `CommunicationService`. No correlation-ID issues — IDs are set in `RouteHelper`. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Singleton `InboundScriptExecutor` mutates a non-thread-safe `Dictionary` from concurrent request threads — see InboundAPI-001/002. |
|
||||||
|
| 4 | Error handling & resilience | ☑ | Catch-all conflates client cancellation with timeout (InboundAPI-004); compilation-failure path repeats work on every request (InboundAPI-009). |
|
||||||
|
| 5 | Security | ☑ | Non-constant-time key comparison, no trust-model enforcement, no body-size limit, missing-method enumeration oracle — see InboundAPI-003/005/006/011. |
|
||||||
|
| 6 | Performance & resource management | ☑ | Up to 3 separate DB round-trips per request in `ApiKeyValidator`; uncapped lazy recompilation. |
|
||||||
|
| 7 | Design-document adherence | ☑ | `Database.Connection()` script API missing; central-only hosting not enforced; lazy-compile diverges from "compiled at startup". |
|
||||||
|
| 8 | Code organization & conventions | ☑ | `ParameterDefinition` is an API-shaped POCO declared in the component project rather than Commons; otherwise conventions followed. |
|
||||||
|
| 9 | Testing coverage | ☑ | Good unit coverage of the two validators; no endpoint, concurrency, recompilation, or timeout-vs-cancel tests. |
|
||||||
|
| 10 | Documentation & comments | ☑ | `ApiKeyValidationResult.NotFound` XML/name says "NotFound" but returns HTTP 400 — misleading (InboundAPI-013). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### InboundAPI-001 — Singleton script handler cache mutated without synchronization
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:17`, `:32`, `:40`, `:89`, `:123-128` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`InboundScriptExecutor` is registered as a singleton (`ServiceCollectionExtensions.cs:11`)
|
||||||
|
and its handler cache is a plain `Dictionary<string, Func<...>>` (`InboundScriptExecutor.cs:17`).
|
||||||
|
`RegisterHandler`, `RemoveHandler`, `CompileAndRegister`, and the lazy-compile path in
|
||||||
|
`ExecuteAsync` all read and write this dictionary with no lock. ASP.NET serves inbound
|
||||||
|
API requests on concurrent thread-pool threads, so two requests for an as-yet-uncompiled
|
||||||
|
method (or a request racing a CLI-triggered `CompileAndRegister`) can mutate the
|
||||||
|
dictionary concurrently. `Dictionary` is explicitly not safe for concurrent
|
||||||
|
read/write — this can corrupt internal buckets, throw `InvalidOperationException`,
|
||||||
|
or return a torn/`null` handler, crashing the request or the process.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Replace the `Dictionary` with a `ConcurrentDictionary<string, Func<...>>`, or guard all
|
||||||
|
access with a lock. For the lazy-compile path use `GetOrAdd` so concurrent first-callers
|
||||||
|
compile at most once.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-002 — Lazy compilation is a check-then-act race with no atomicity
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:123-129` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ExecuteAsync` does `if (!_scriptHandlers.TryGetValue(...)) { CompileAndRegister(method); handler = _scriptHandlers[method.Name]; }`.
|
||||||
|
Even setting aside the unsynchronized dictionary (InboundAPI-001), this is a
|
||||||
|
check-then-act sequence: between `TryGetValue` failing and the re-read on line 128,
|
||||||
|
another thread could `RemoveHandler` the entry, causing the indexer on line 128 to
|
||||||
|
throw `KeyNotFoundException`, which is not handled at the call site and is caught
|
||||||
|
only by the broad catch on line 143 and reported to the caller as "Internal script
|
||||||
|
error". Multiple concurrent first-callers will also each compile the same script
|
||||||
|
redundantly (wasted Roslyn work).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Make compile-and-fetch a single atomic operation (`ConcurrentDictionary.GetOrAdd`
|
||||||
|
with a lazily-evaluated factory, or a per-method lock), and have `CompileAndRegister`
|
||||||
|
return the handler it produced rather than requiring a separate dictionary read.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-003 — API key compared with non-constant-time string equality
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/InboundApiRepository.cs:22-23`, consumed by `src/ScadaLink.InboundAPI/ApiKeyValidator.cs:33` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
API-key authentication resolves the key with
|
||||||
|
`FirstOrDefaultAsync(k => k.KeyValue == keyValue)` — an ordinary equality match
|
||||||
|
translated to a SQL `WHERE KeyValue = @p` comparison. The secret is matched with
|
||||||
|
ordinary (early-exit) string/SQL comparison rather than a constant-time comparison,
|
||||||
|
which is a classic timing side-channel for secret material. Combined with the design's
|
||||||
|
explicit "no rate limiting" decision, an attacker with network access to the central
|
||||||
|
API can mount a timing attack to recover valid keys. The API key is the *sole*
|
||||||
|
credential for the inbound API, so this is the primary authentication path.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Look the key up by a non-secret indexed identifier (e.g. a key prefix/id) or fetch
|
||||||
|
candidate rows, then verify the secret in-process using
|
||||||
|
`CryptographicOperations.FixedTimeEquals` over the UTF-8 bytes. Preferably store only
|
||||||
|
a salted hash of the key value and compare hashes. Avoid leaking secret-length and
|
||||||
|
match-position timing.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-004 — Client disconnect is misreported as a script timeout
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:117-141` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ExecuteAsync` creates a linked CTS from `httpContext.RequestAborted` and the method
|
||||||
|
timeout, then catches `OperationCanceledException` and unconditionally returns
|
||||||
|
"Script execution timed out". When the *client* aborts the request (`RequestAborted`
|
||||||
|
fires), the same exception type is thrown, so a normal client disconnect is logged as
|
||||||
|
a timeout (`_logger.LogWarning("Script execution timed out ...")`) and an attempt is
|
||||||
|
made to write a 500 timeout body to an already-gone connection. This pollutes the
|
||||||
|
failure log (which the design says is reserved for genuine script errors) and obscures
|
||||||
|
real timeout incidents.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Distinguish the two cancellation sources: if `cancellationToken` (the request token)
|
||||||
|
is cancelled, treat it as a client abort — do not log a timeout and do not attempt to
|
||||||
|
write a response. Only when the timeout CTS fired should the result be "timed out".
|
||||||
|
Check `cts.Token.IsCancellationRequested && !cancellationToken.IsCancellationRequested`
|
||||||
|
or use a dedicated timeout `CancellationTokenSource` so the two are separable.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-005 — Compiled API scripts run with no script-trust-model enforcement
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:56-93` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
CLAUDE.md's Akka.NET conventions state the script trust model forbids `System.IO`,
|
||||||
|
`Process`, `Threading`, `Reflection`, and raw network access. `CompileAndRegister`
|
||||||
|
compiles arbitrary C# with `CSharpScript.Create` and only restricts the *default
|
||||||
|
imports* (`WithImports("System", ...)`). Imports are a convenience, not a sandbox — a
|
||||||
|
script can still fully-qualify any type (`System.IO.File.Delete(...)`,
|
||||||
|
`System.Diagnostics.Process.Start(...)`, `System.Reflection`, raw `Socket`) because
|
||||||
|
the core framework assemblies are referenced and Roslyn scripting performs no API
|
||||||
|
allow/deny-listing. Inbound API scripts execute on the central node with the host
|
||||||
|
process's privileges, so a malicious or buggy method definition has full host access.
|
||||||
|
Note the Design role authors these scripts (less trusted than Admin), making
|
||||||
|
enforcement material.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a compile-time analyzer/`SyntaxWalker` (as the Site Runtime does for instance
|
||||||
|
scripts) that rejects forbidden namespaces/types before registering a handler, and/or
|
||||||
|
run scripts under a constrained boundary. At minimum, share the Site Runtime's
|
||||||
|
forbidden-API checker so the trust model is enforced consistently. Reject the method
|
||||||
|
(and log) when a violation is found instead of registering it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-006 — No request body size limit on the inbound endpoint
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/EndpointExtensions.cs:54-62` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleInboundApiRequest` calls `JsonDocument.ParseAsync(httpContext.Request.Body, ...)`
|
||||||
|
with no explicit body-size cap and no `[RequestSizeLimit]`/endpoint metadata. Although
|
||||||
|
Kestrel has a default max request body size, this endpoint accepts arbitrary JSON from
|
||||||
|
external systems, fully buffers it into a `JsonDocument`, and then `Clone()`s the
|
||||||
|
root element (`:61`) which materializes the entire document on the heap. With no rate
|
||||||
|
limiting (a deliberate design choice) a single caller can drive large allocations.
|
||||||
|
Deep/wide JSON also makes the `CoerceValue` `object`/`list` deserialization
|
||||||
|
(`ParameterValidator.cs:113,117`) expensive.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Set an explicit, modest body-size limit on the endpoint
|
||||||
|
(`.WithMetadata(new RequestSizeLimitAttribute(...))` or
|
||||||
|
`IHttpMaxRequestBodySizeFeature`) and consider a `JsonDocumentOptions` `MaxDepth`.
|
||||||
|
Reject oversized bodies with 413 before buffering.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-007 — `Database.Connection()` script API from the design doc is not implemented
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:155-170` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`Component-InboundAPI.md` ("Script Runtime API -> Database Access") specifies
|
||||||
|
`Database.Connection("connectionName")` as an available script capability for
|
||||||
|
querying the configuration/machine-data databases. `InboundScriptContext` exposes only
|
||||||
|
`Parameters`, `Route`, and `CancellationToken` — there is no `Database` member. Any
|
||||||
|
method script that follows the documented API will fail to compile. Either the code
|
||||||
|
is incomplete or the design doc is stale; the two must be reconciled.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
If database access is in scope, add a `Database` property to `InboundScriptContext`
|
||||||
|
backed by a connection-factory service. If it is not, remove the "Database Access"
|
||||||
|
section from `Component-InboundAPI.md` so the design doc stops advertising an absent
|
||||||
|
API.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-008 — Inbound API endpoint not restricted to the active central node
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/EndpointExtensions.cs:19-23`, `src/ScadaLink.Host/Program.cs:149` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design states the Inbound API is "Central cluster only (active node)" and "fails
|
||||||
|
over with it". `MapInboundAPI` registers `POST /api/{methodName}` unconditionally, and
|
||||||
|
`Program.cs` maps it inside the central-role branch but with no active-node gating —
|
||||||
|
unlike `/health/active` which has an `active-node` predicate. A standby central node
|
||||||
|
will happily serve inbound API calls, executing scripts and `Route.To()` calls from a
|
||||||
|
non-leader, which can race the active node or run against stale singleton state.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Gate the endpoint on active-node status (reuse the cluster `active-node` health check
|
||||||
|
or a leader-state check) and return 503 on the standby, so Traefik/clients only reach
|
||||||
|
the live node — consistent with how the Management API and `/health/active` are
|
||||||
|
treated.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-009 — Failed compilation is retried on every subsequent request
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:123-128` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
When a method's script fails to compile, `CompileAndRegister` returns `false` and
|
||||||
|
nothing is stored in `_scriptHandlers`. Every subsequent call to that method re-enters
|
||||||
|
the lazy-compile branch and recompiles the broken script via Roslyn from scratch.
|
||||||
|
Roslyn compilation is expensive; a single broken method definition repeatedly invoked
|
||||||
|
by an external caller (no rate limiting) becomes a CPU amplification vector.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Cache the compilation *failure* (e.g. store a sentinel handler that immediately
|
||||||
|
returns the compile error, or keep a `HashSet` of known-bad method names with the
|
||||||
|
diagnostic) so a broken script is compiled at most once until the definition is
|
||||||
|
updated via `CompileAndRegister`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-010 — `ParameterValidator` ignores extra body fields and cannot validate Object/List element types
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ParameterValidator.cs:64-90`, `:112-118` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Two related correctness gaps: (1) The validator iterates only over *defined*
|
||||||
|
parameters; any extra top-level fields in the request body are silently ignored
|
||||||
|
rather than reported, so callers get no feedback on typo'd parameter names. (2) For
|
||||||
|
`Object` and `List` types the validator only checks the JSON *kind* (`Object`/`Array`)
|
||||||
|
and then blindly `JsonSerializer.Deserialize`s the raw text — the design's extended
|
||||||
|
type system describes Objects as "named structure with typed fields" and Lists as
|
||||||
|
collections "of objects or primitive types", but no field-level or element-level type
|
||||||
|
validation is performed. Invalid nested structures pass validation and surface only
|
||||||
|
as runtime script errors.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Optionally warn/400 on unexpected body fields. For the extended types, either parse a
|
||||||
|
richer `ParameterDefinition` (with nested field definitions / element type) and
|
||||||
|
validate recursively, or document explicitly that Object/List are validated only for
|
||||||
|
shape — and update the design doc to match.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-011 — Method-existence check leaks to unapproved callers (enumeration oracle)
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ApiKeyValidator.cs:39-52` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ValidateAsync` returns 400 `Method '{methodName}' not found` when the method does not
|
||||||
|
exist, but 403 `API key not approved for this method` when it exists but the key is
|
||||||
|
not approved. A caller holding any valid enabled key can therefore enumerate which
|
||||||
|
method names exist on the central API by observing 400-vs-403 responses. The error
|
||||||
|
message also echoes the caller-supplied `methodName` back verbatim into the JSON
|
||||||
|
response (`EndpointExtensions.cs:47`), a minor reflected-input concern.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Return an indistinguishable response (e.g. 403/404) for both "method not found" and
|
||||||
|
"key not approved" so existence is not observable to unapproved callers. Avoid echoing
|
||||||
|
raw caller input in error bodies, or sanitize it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-012 — `ParameterDefinition` POCO declared in the component project, not Commons
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ParameterValidator.cs:128-133` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ParameterDefinition` is a persistence-/contract-shaped POCO: it is the deserialized
|
||||||
|
form of `ApiMethod.ParameterDefinitions` (a column in the configuration database) and
|
||||||
|
describes the public API contract. CLAUDE.md's code-organization rules place
|
||||||
|
persistence-ignorant entity/contract types in `ScadaLink.Commons`. Defining it inside
|
||||||
|
the InboundAPI project means any other component that needs to read or produce method
|
||||||
|
parameter definitions (e.g. Central UI's method editor, CLI, Management Service)
|
||||||
|
cannot share the type and will duplicate it.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Move `ParameterDefinition` (and a matching return-definition type, if added) to
|
||||||
|
`ScadaLink.Commons` under the InboundApi entity/types namespace so it is shared by all
|
||||||
|
components that work with method definitions.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### InboundAPI-013 — `ApiKeyValidationResult.NotFound` factory returns HTTP 400, contradicting its name
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ApiKeyValidator.cs:78-79` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The static factory is named `NotFound` and is used for the "method not found" case,
|
||||||
|
but it builds a result with `StatusCode = 400` (Bad Request), not 404. The name
|
||||||
|
strongly implies 404 and will mislead future maintainers; `EndpointExtensions`
|
||||||
|
faithfully propagates whatever status code the factory sets, so the misnaming directly
|
||||||
|
affects the wire contract.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Rename the factory to match its behaviour (e.g. `BadRequest`) or change the status
|
||||||
|
code to 404 if that is the intended contract — and document the chosen "method not
|
||||||
|
found" status in `Component-InboundAPI.md`'s Error Handling section, which currently
|
||||||
|
does not list it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,432 @@
|
|||||||
|
# Code Review — ManagementService
|
||||||
|
|
||||||
|
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.ManagementService` |
| Design doc | `docs/requirements/Component-ManagementService.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 13 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The ManagementService module is a thin command-dispatch layer: a single `ManagementActor`
|
||||||
|
fronts every administrative operation, an HTTP `POST /management` endpoint authenticates and
|
||||||
|
forwards to it, and a SignalR `DebugStreamHub` provides real-time debug streaming. The code
|
||||||
|
is consistently structured and the role-based authorization gate (`GetRequiredRole`) is
|
||||||
|
broadly correct and well tested. However, the review surfaced a significant **security
|
||||||
|
theme**: site-scope enforcement, which the design document requires for instance- and
|
||||||
|
site-targeted Deployment operations, is applied inconsistently — several query handlers and
|
||||||
|
all remote-query/debug handlers perform no site-scope check at all, allowing a site-scoped
|
||||||
|
Deployment user to read or act on sites outside their scope. A second theme is **Akka.NET
|
||||||
|
convention drift**: the actor offloads all work to `Task.Run` instead of using `PipeTo`,
|
||||||
|
declares no supervision strategy, and the contract messages carry a loosely-typed `object`
|
||||||
|
payload. There are also resource-management defects in the HTTP endpoint (`JsonDocument`
|
||||||
|
instances never disposed) and dead/unused configuration. None of the findings are
|
||||||
|
crash-class, but the site-scope gaps are High severity because they are a real
|
||||||
|
authorization bypass with no workaround.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | + | `HandleResolveRoles` builds `RoleMapper` by hand; `ResolveRolesCommand` is a stale dispatch path. See 008, 011. |
| 2 | Akka.NET conventions | + | `Task.Run` instead of `PipeTo`, no supervision strategy, `object`-typed message payload. See 004, 005, 012. |
| 3 | Concurrency & thread safety | + | Actor is stateless so `Task.Run` does not corrupt state, but it defeats actor-thread serialization (004). `Sender` correctly captured to a local before the closure. |
| 4 | Error handling & resilience | + | Exceptions are caught and mapped uniformly; `SiteScopeViolationException` mapped to `Unauthorized`. Audit-logging consistency issue noted in 009. |
| 5 | Security | + | Site-scope enforcement missing on query/remote/debug paths. See 001, 002, 003. |
| 6 | Performance & resource management | + | `JsonDocument` instances never disposed in the HTTP endpoint. See 006. |
| 7 | Design-document adherence | + | Design doc states remote queries enforce site scoping; code does not. `ManagementServiceOptions` reserved-for-future config is unused. See 001, 010. |
| 8 | Code organization & conventions | + | Mixed serializers (Newtonsoft in actor, System.Text.Json in endpoint); inconsistent audit logging across mutations. See 007, 009. |
| 9 | Testing coverage | + | Authorization is well covered; site-scope enforcement, the HTTP endpoint, `DebugStreamHub`, and remote-query handlers have no tests. See 013. |
| 10 | Documentation & comments | + | XML docs are accurate where present; `ManagementServiceOptions` and `ResolveRolesCommand` paths are undocumented dead code (010, 011). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### ManagementService-001 — Remote-query and debug-snapshot handlers bypass site-scope enforcement
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:1465`, `:1481`, `:1493`, `:641`, `:649` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design document (`Component-ManagementService.md`, Authorization section) states that for
|
||||||
|
Deployment users "Site scoping is enforced for site-scoped Deployment users" and lists
|
||||||
|
"debug snapshot, parked message queries, site event log queries" among the Deployment-role
|
||||||
|
operations. `HandleQueryEventLogs`, `HandleQueryParkedMessages`, `HandleDebugSnapshot`,
|
||||||
|
`HandleRetryParkedMessage`, and `HandleDiscardParkedMessage` make no call to `EnforceSiteScope`
|
||||||
|
or `EnforceSiteScopeForInstance`. A Deployment user scoped to site A can therefore query event
|
||||||
|
logs / parked messages of site B, retry or discard another site's parked messages, and pull a
|
||||||
|
debug snapshot of any instance simply by supplying a different `SiteIdentifier` or `InstanceId`.
|
||||||
|
This is an authorization bypass with no workaround.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
In each of these handlers resolve the target site and call site-scope enforcement before
|
||||||
|
delegating to `CommunicationService`. For the `SiteIdentifier`-keyed handlers, look up the
|
||||||
|
`Site` by identifier and enforce against `Site.Id`; for `DebugSnapshotCommand` the instance
|
||||||
|
is already loaded — call `EnforceSiteScope(user, instance.SiteId)` (which requires threading
|
||||||
|
`AuthenticatedUser` into these handlers, currently dropped).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-002 — Single-entity query handlers leak data across site scope
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:510`, `:673`, `:733`, `:774`, `:631`, `:624` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleListInstances` and `HandleListSites` correctly filter their results by the user's
|
||||||
|
`PermittedSiteIds`, but the single-entity query handlers do not. `HandleGetInstance`,
|
||||||
|
`HandleGetSite`, `HandleListAreas`, and `HandleGetDataConnection` fetch by ID with no
|
||||||
|
site-scope check, so a site-scoped Deployment user can read any instance, site, area tree,
|
||||||
|
or data connection by ID even though that site is excluded from their scope. The list
|
||||||
|
endpoints having a filter while the get-by-id endpoints do not is an inconsistency that
|
||||||
|
undermines the scoping model. (`HandleGetDeploymentDiff` and `HandleListInstanceAlarmOverrides`
|
||||||
|
do enforce scope, confirming the omission elsewhere is unintentional.)
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Apply `EnforceSiteScopeForInstance` in `HandleGetInstance`, and `EnforceSiteScope` against
|
||||||
|
the resolved site ID in `HandleGetSite`, `HandleListAreas`, and `HandleGetDataConnection`
|
||||||
|
(for data connections, scope by the connection's `SiteId`).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-003 — DebugStreamHub.SubscribeInstance performs no per-instance authorization
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/DebugStreamHub.cs:104` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`OnConnectedAsync` authenticates the WebSocket connection and verifies the caller holds the
|
||||||
|
`Deployment` role, but `SubscribeInstance(int instanceId)` accepts any instance ID and starts
|
||||||
|
a stream without checking that the authenticated user is scoped to that instance's site. A
|
||||||
|
site-scoped Deployment user can therefore subscribe to the live debug stream (attribute
|
||||||
|
values, alarm states) of an instance belonging to a site outside their scope. This is the
|
||||||
|
streaming equivalent of finding 001/002.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Resolve the instance's site inside `SubscribeInstance` and reject the subscription if the
|
||||||
|
authenticated user's permitted-site set does not include it. The authenticated identity
|
||||||
|
established in `OnConnectedAsync` must be persisted on the connection (e.g. in
|
||||||
|
`Context.Items`) so it is available to `SubscribeInstance`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-004 — Actor offloads work to Task.Run instead of using PipeTo
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:61` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleEnvelope` runs every command on a thread-pool thread via `Task.Run(async () => ...)`
|
||||||
|
and replies from inside the continuation. This is the anti-pattern the project's Akka.NET
|
||||||
|
conventions warn against — the canonical approach is to start the async work and `PipeTo`
|
||||||
|
its result back to `Self`/`Sender`. Although `Sender` is correctly copied to a local before
|
||||||
|
the closure, the current code: (a) lets multiple commands execute fully concurrently with no
|
||||||
|
actor-thread serialization, so the actor provides no ordering or back-pressure guarantees
|
||||||
|
and is an actor in name only; (b) cannot be paused, supervised, or made to honour a mailbox
|
||||||
|
bound; (c) is shielded from synchronous faults only because every path is inside the
|
||||||
|
try/catch — any future code path that throws synchronously before the `Task.Run` body would
|
||||||
|
escape it.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Replace `Task.Run` with a method that returns the `Task` and `PipeTo` the mapped result
|
||||||
|
(`ManagementSuccess`/`ManagementError`/`ManagementUnauthorized`) back to the captured sender,
|
||||||
|
mapping faults in the `PipeTo` failure continuation. If genuine parallelism is desired, make
|
||||||
|
that explicit with a router/dispatcher rather than ad-hoc `Task.Run`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-005 — ManagementActor declares no supervision strategy
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:33` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The project conventions call for explicit supervision strategies (Resume for coordinator
|
||||||
|
actors). `ManagementActor` is a long-lived coordinator-style actor but overrides no
|
||||||
|
`SupervisorStrategy` and defines no `PreRestart`/`PostRestart` behaviour. In practice it
|
||||||
|
spawns no children so the default strategy is rarely exercised, but an explicit strategy
|
||||||
|
should still be declared for clarity and to match the documented convention; it also matters
|
||||||
|
if children are added later (e.g. if finding 004 introduces worker actors).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add an explicit `protected override SupervisorStrategy SupervisorStrategy()` returning a
|
||||||
|
Resume-based strategy, consistent with other central coordinator actors.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-006 — JsonDocument instances never disposed in the HTTP endpoint
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementEndpoints.cs:83`, `:112` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`JsonDocument` is `IDisposable` (it rents buffers from the shared `ArrayPool`). `HandleRequest`
|
||||||
|
parses the request body into `doc` at line 83 and never disposes it, and line 112
|
||||||
|
(`JsonDocument.Parse("{}")`) allocates a second document inline that is also never disposed.
|
||||||
|
Every management HTTP call therefore never returns its rented buffers to the pool, increasing GC pressure and pool
|
||||||
|
churn under load.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap the parsed document in `using var doc = ...`. For the empty-payload fallback, avoid
|
||||||
|
allocating a `JsonDocument` entirely — deserialize from the literal string `"{}"`/an empty
|
||||||
|
object, or restructure so the fallback path does not parse a throwaway document.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-007 — Inconsistent and cycle-prone serialization of repository entities
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:67`; `src/ScadaLink.ManagementService/ManagementEndpoints.cs:113` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The actor serializes every command result with `Newtonsoft.Json` (`JsonConvert.SerializeObject`)
|
||||||
|
while the HTTP endpoint deserializes payloads with `System.Text.Json`. Beyond the
|
||||||
|
inconsistency, `JsonConvert.SerializeObject` is applied directly to EF-backed entities
|
||||||
|
returned by repositories (e.g. `Site`, `DataConnection`, `NotificationList` with a
|
||||||
|
`Recipients` collection, `Template` with children). With default Newtonsoft settings any
|
||||||
|
bidirectional navigation property produces a `JsonSerializationException` for self-referencing
|
||||||
|
loops, and even without cycles this serializes lazy/navigation state the CLI does not expect.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Standardise on one serializer (the rest of the HTTP path uses `System.Text.Json`). Serialize
|
||||||
|
explicit DTOs / projections rather than EF entities, or configure
|
||||||
|
`ReferenceLoopHandling.Ignore` and ignore navigation properties. Verify that handlers
|
||||||
|
returning rich entity graphs (`HandleGetTemplate`, `HandleUpdateNotificationList`) round-trip
|
||||||
|
correctly.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-008 — HandleResolveRoles constructs RoleMapper manually instead of via DI
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:285` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Every other handler resolves its collaborators from the scoped `IServiceProvider`.
|
||||||
|
`HandleResolveRoles` instead does `new RoleMapper(sp.GetRequiredService<ISecurityRepository>())`,
|
||||||
|
bypassing DI. If `RoleMapper` ever gains a dependency, caching, or options, this hand-built
|
||||||
|
instance silently diverges from the DI-registered one. It is also inconsistent with
|
||||||
|
`ManagementEndpoints`, which resolves `RoleMapper` from DI.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Resolve `RoleMapper` via `sp.GetRequiredService<RoleMapper>()` like every other dependency.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-009 — Audit logging applied inconsistently across mutating handlers
|
||||||
|
|
||||||
|
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:357`, `:1134`, `:1085`, `:526`, `:1275` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc states "All mutating operations are audit logged." Some handlers call
|
||||||
|
`AuditAsync` explicitly (`HandleCreateInstance`, `HandleCreateSite`, all repository-direct
|
||||||
|
external-system/notification/security/area mutations), but the handlers that delegate to a
|
||||||
|
domain service do **not** — `HandleCreateTemplate`/`HandleUpdateTemplate`/`HandleDeleteTemplate`,
|
||||||
|
all template-member handlers (`HandleAddAttribute` ... `HandleDeleteComposition`), template-folder
|
||||||
|
handlers, shared-script handlers, `HandleDeployArtifacts`, `HandleDeployInstance`,
|
||||||
|
`HandleEnableInstance`/`Disable`/`Delete`, and the instance-binding/override handlers. This is
|
||||||
|
correct only if every one of those services performs its own audit logging internally; the
|
||||||
|
mixed pattern makes that impossible to verify by reading this module and creates a real risk
|
||||||
|
of silent audit gaps for template authoring and deployment operations.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide on one layer that owns auditing. Either route all mutations through services that audit
|
||||||
|
internally (and remove the explicit `AuditAsync` calls here), or audit uniformly in the actor
|
||||||
|
after every successful mutation. Document the chosen contract so the inconsistency cannot
|
||||||
|
recur, and confirm template/deployment services actually audit.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-010 — ManagementServiceOptions.CommandTimeout is defined but never used
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ManagementService/ManagementServiceOptions.cs:5`; `src/ScadaLink.ManagementService/ManagementEndpoints.cs:16` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ManagementServiceOptions.CommandTimeout` is bound from configuration in
|
||||||
|
`ServiceCollectionExtensions`, but no code reads it. The HTTP endpoint instead hard-codes
|
||||||
|
`AskTimeout = TimeSpan.FromSeconds(30)`. The design doc describes the options section as
|
||||||
|
"Reserved for future configuration — e.g., command timeout overrides", yet a concrete
|
||||||
|
`CommandTimeout` property already exists and is silently ignored, so an operator who sets it
|
||||||
|
in `appsettings.json` gets no effect.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either consume `ManagementServiceOptions.CommandTimeout` in `ManagementEndpoints.HandleRequest`
|
||||||
|
(inject `IOptions<ManagementServiceOptions>`), or remove the property until it is wired up so
|
||||||
|
configuration cannot be set with no effect.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-011 — ResolveRolesCommand dispatch path is stale dead code
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:273`, `:283` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc states the HTTP endpoint "collapses the CLI's previous two-step flow
|
||||||
|
(ResolveRoles + actual command) into a single HTTP round-trip", and indeed `ManagementEndpoints`
|
||||||
|
performs LDAP auth and role resolution itself before dispatching. The `ResolveRolesCommand`
|
||||||
|
case in `DispatchCommand` is therefore unreachable from the HTTP path. It remains reachable
|
||||||
|
only via a raw ClusterClient sender, but a caller able to send `ResolveRolesCommand` could
|
||||||
|
enumerate role mappings for arbitrary LDAP groups with no role requirement
|
||||||
|
(`GetRequiredRole` returns null for it) — a minor information-disclosure surface for a path
|
||||||
|
the design says no longer exists.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
If the two-step flow is genuinely retired, remove `ResolveRolesCommand`, its handler, and the
|
||||||
|
class. If it must remain for non-HTTP clients, document why and confirm exposing role-mapping
|
||||||
|
data to unauthenticated callers is intended.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-012 — ManagementEnvelope carries a loosely-typed object payload
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Messages/Management/ManagementEnvelope.cs:7`; `src/ScadaLink.ManagementService/ManagementActor.cs:132` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ManagementEnvelope.Command` is typed `object`, so the actor relies on a large open-ended
|
||||||
|
`switch` with a `NotSupportedException` default for unknown types. While the individual
|
||||||
|
command records are immutable, `object` defeats compile-time exhaustiveness — adding a new
|
||||||
|
command record produces no compiler signal that `DispatchCommand` (and `GetRequiredRole`)
|
||||||
|
need updating, and a typo or unregistered command surfaces only as a runtime exception. The
|
||||||
|
message contract is also harder to evolve safely under the additive-only rule.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Introduce a marker interface (e.g. `IManagementCommand`) implemented by every command record
|
||||||
|
and type the envelope payload as that interface. This documents the contract, lets analyzers
|
||||||
|
flag unhandled cases, and keeps `ManagementCommandRegistry`'s reflection scan precise.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### ManagementService-013 — No tests for site-scope enforcement, the HTTP endpoint, or DebugStreamHub
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.ManagementService.Tests/ManagementActorTests.cs:1` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ManagementActorTests` covers role-based authorization, success/error mapping, and correlation
|
||||||
|
IDs thoroughly, but several critical paths are untested: (a) site-scope enforcement —
|
||||||
|
`EnforceSiteScope`/`EnforceSiteScopeForInstance` and `SiteScopeViolationException` -> `Unauthorized`
|
||||||
|
mapping have no test, which is why the gaps in findings 001/002 went unnoticed; (b)
|
||||||
|
`ManagementEndpoints` — Basic Auth decoding, malformed-header handling, LDAP/role resolution,
|
||||||
|
command deserialization, and HTTP status mapping have zero coverage; (c) `DebugStreamHub`
|
||||||
|
authentication, subscribe/unsubscribe lifecycle, and `ManagementCommandRegistry.Resolve` are
|
||||||
|
untested. The `Envelope` test helper always passes `Array.Empty<string>()` for permitted
|
||||||
|
sites, so no test ever exercises a site-scoped user.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests that exercise a site-scoped Deployment user against in-scope and out-of-scope
|
||||||
|
targets for instance and site operations, asserting `ManagementUnauthorized` on violations.
|
||||||
|
Add `WebApplicationFactory`-based tests for `ManagementEndpoints` covering auth failures,
|
||||||
|
malformed bodies, unknown commands, and the 200/400/403/401/504 mappings.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,314 @@
|
|||||||
|
# Code Review — NotificationService
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.NotificationService` |
|
||||||
|
| Design doc | `docs/requirements/Component-NotificationService.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 11 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The NotificationService module is small (6 source files) and structurally clean: it
|
||||||
|
abstracts the SMTP client behind an interface, isolates the OAuth2 token lifecycle,
|
||||||
|
and integrates with the Store-and-Forward Engine for transient-failure buffering.
|
||||||
|
However, the review surfaced several substantive defects. The most serious is that
|
||||||
|
**no Store-and-Forward delivery handler is ever registered for the `Notification`
|
||||||
|
category** — buffered notifications are persisted but never retried or delivered,
|
||||||
|
silently losing every notification that hit a transient SMTP failure. Error
|
||||||
|
classification is fragile (substring matching on exception messages) and is
|
||||||
|
applied inconsistently between `SendAsync` and `DeliverAsync`. `DeliverAsync` also
|
||||||
|
contains a resource-management bug that constructs and leaks two SMTP clients per
|
||||||
|
call. Secondary themes: the `OAuth2TokenService` singleton caches a single token
|
||||||
|
keyed to no credential identity (incorrect if multiple SMTP configs exist), several
|
||||||
|
design-doc requirements are unimplemented (connection timeout, max concurrent
|
||||||
|
connections, TLS `SSL`/`None` modes), and credentials are stored and passed as
|
||||||
|
plaintext `string` values. Test coverage exercises the happy path and the main
|
||||||
|
error branches but misses the OAuth2 delivery path, the permanent-classification
|
||||||
|
fallback in `DeliverAsync`, and concurrency on the token cache.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | Double SMTP client construction; `Auto` socket option for non-TLS; `TimeoutException`/`OperationCanceledException` misclassified. |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | No actors in this module (`AddNotificationServiceActors` is a no-op); delivery is a plain DI service. No Akka-specific issues. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | `OAuth2TokenService` is a singleton with a shared mutable token cache; double-checked locking present but cache key is wrong (NS-006). |
|
||||||
|
| 4 | Error handling & resilience | ☑ | Critical: no S&F delivery handler registered for `Notification` (NS-001). Fragile substring error classification (NS-002, NS-003). |
|
||||||
|
| 5 | Security | ☑ | Credentials handled as plaintext strings; OAuth2 client secret in DB credential blob; no recipient address validation. |
|
||||||
|
| 6 | Performance & resource management | ☑ | Two `ISmtpClientWrapper` instances created per send, one leaked; connection not pooled; `MaxConcurrentConnections` unenforced. |
|
||||||
|
| 7 | Design-document adherence | ☑ | Connection timeout, max concurrent connections, and TLS `SSL`/`None` modes from the design doc are not implemented. |
|
||||||
|
| 8 | Code organization & conventions | ☑ | `SmtpPermanentException` in the wrong file; `SmtpConfiguration` POCO has non-nullable strings with no initializer (compiler-warning risk). |
|
||||||
|
| 9 | Testing coverage | ☑ | Happy path and main error branches covered; OAuth2 delivery path, `DeliverAsync` permanent fallback, and token-cache concurrency untested. |
|
||||||
|
| 10 | Documentation & comments | ☑ | XML comment on `DeliverAsync` ("Throws on failure") and the misleading "OAuth2 token refresh if needed" comment do not match behaviour. |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### NotificationService-001 — Buffered notifications are never retried (no S&F delivery handler)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Critical |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:96`, `src/ScadaLink.NotificationService/ServiceCollectionExtensions.cs:8` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
On a transient SMTP failure the service calls `_storeAndForward.EnqueueAsync(StoreAndForwardCategory.Notification, ...)`. The Store-and-Forward Engine only delivers (immediately or on retry sweep) a category for which a delivery handler has been registered via `StoreAndForwardService.RegisterDeliveryHandler`. A repo-wide search shows the `Notification` category handler is never registered anywhere — `StoreAndForwardCategory.Notification` appears only in this module's `EnqueueAsync` call. As a result, every buffered notification falls into the `RetryMessageAsync` "No delivery handler for category" branch (`StoreAndForwardService.cs:201-204`), which logs a warning and returns without ever delivering or removing the message. Buffered notifications accumulate in SQLite forever and are never sent. This silently loses every notification that hit a transient failure, while `SendAsync` returns `Success=true, WasBuffered=true`, telling the caller the notification is safely queued. This directly violates the design doc's "integrates with the Store-and-Forward Engine for reliable delivery" guarantee.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Register a delivery handler for `StoreAndForwardCategory.Notification` during startup that deserializes the buffered payload (`ListName`, `Subject`, `Message`), re-resolves the list/recipients/SMTP config, and re-attempts `DeliverAsync`, returning `true` on success, `false` on permanent failure, and throwing on transient failure. Wire it in `AddNotificationService` or the host bootstrap. Add an integration test covering the buffer-then-retry-then-deliver round trip.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. A delivery handler for `StoreAndForwardCategory.Notification` is now
|
||||||
|
registered at site startup in `AkkaHostedService`. The handler resolves
|
||||||
|
`NotificationDeliveryService` in a fresh DI scope and calls the new `DeliverBufferedAsync`,
|
||||||
|
which re-resolves the list, recipients and SMTP config and re-attempts delivery —
|
||||||
|
returning `true` on success, `false` (park) on permanent failure or missing
|
||||||
|
configuration, and throwing on transient failure so the engine retries. `SendAsync` now
|
||||||
|
buffers with `attemptImmediateDelivery: false` so registering the handler does not send
|
||||||
|
the notification twice. Regression tests cover the happy path and the list-removed park
|
||||||
|
path. Fixed by the commit whose message references `NotificationService-001`.
|
||||||
|
|
||||||
|
### NotificationService-002 — `TimeoutException`/`OperationCanceledException` misclassified as transient
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:157-167` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`IsTransientSmtpError` treats `OperationCanceledException` (and its subtype `TaskCanceledException`) as a transient SMTP error. When the caller passes a `CancellationToken` that is cancelled — e.g. the Script Execution Actor is stopped, or the script times out — the resulting `OperationCanceledException` is caught by the `catch ... when (IsTransientSmtpError(ex))` clause and the notification is buffered as if SMTP had failed. A deliberate cancellation should propagate, not be silently buffered for retry. The same clause classifies any `IOException` as transient even though `IOException` covers unrelated failures (e.g. a serialization stream error). Additionally, `OperationCanceledException` raised by token cancellation in the OAuth2 path would be miscategorised the same way.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Re-throw `OperationCanceledException`/`TaskCanceledException` when `cancellationToken.IsCancellationRequested` is true rather than classifying it as transient. Narrow `IOException` handling to SMTP-specific I/O failures, or rely on MailKit's typed exceptions (`SmtpCommandException`, `SmtpProtocolException`, `ServiceNotConnectedException`) instead of broad base types.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-003 — Error classification by substring matching on exception messages is fragile
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:144-147`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:163-166` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Transient/permanent classification depends on `ex.Message.Contains("5.")`, `Contains("4.")`, `Contains("550")`, `Contains("421")`, etc. This is unreliable: (a) `Message.Contains("5.")` matches any message containing the literal "5." anywhere — e.g. a host name `smtp5.example.com`, a version string, or a path — producing false permanent classification; (b) `Contains("4.")` likewise matches `"v4.0"` or an IP address octet; (c) MailKit exposes the actual SMTP status code on `SmtpCommandException.StatusCode`, which is the correct, locale-independent source of truth and is being ignored; (d) message text is culture/version-dependent and not part of any stable contract. Misclassification has real consequences: a permanent failure misread as transient floods the S&F buffer (which the design doc explicitly says must be prevented), and a transient failure misread as permanent loses the notification.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Classify on MailKit's typed exceptions and `SmtpCommandException.StatusCode` (4xx → transient, 5xx → permanent), and `SocketException`/`SmtpProtocolException`/connection-refused → transient. Remove all `Message.Contains` checks.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-004 — `DeliverAsync` constructs two SMTP clients and leaks the used one
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:118-119` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
using var client = _smtpClientFactory() as IDisposable;
|
||||||
|
var smtp = _smtpClientFactory();
|
||||||
|
```
|
||||||
|
|
||||||
|
The factory is invoked twice, creating two separate `MailKitSmtpClientWrapper` instances (each owning a real `SmtpClient` with a socket). The first instance is assigned to `client` and disposed by the `using`, but it is never used. The second instance, `smtp`, is the one that is actually connected, authenticated, used to send, and disconnected via `DisconnectAsync` — but it is never disposed. `MailKitSmtpClientWrapper` implements `IDisposable` and wraps an unmanaged socket; the connected client is leaked on every send. `DisconnectAsync` closes the connection but does not dispose the `SmtpClient`. Over time this leaks sockets/handles.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Create exactly one client and dispose the one that is actually used:
|
||||||
|
`using var smtp = _smtpClientFactory();` then cast to `IDisposable` only if needed (the factory's `Func<ISmtpClientWrapper>` should ideally return a type that the `using` can dispose directly — consider having `ISmtpClientWrapper` extend `IAsyncDisposable`/`IDisposable`).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-005 — Non-TLS path uses `SecureSocketOptions.Auto`, contradicting the requested mode
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:18`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:123` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ConnectAsync` maps `useTls` to either `SecureSocketOptions.StartTls` or `SecureSocketOptions.Auto`. `useTls` is computed in `DeliverAsync` as `TlsMode == "starttls"`. So a configuration of `TlsMode = "none"` produces `useTls = false` → `SecureSocketOptions.Auto`, which lets MailKit opportunistically negotiate TLS — the opposite of "None". Worse, the design doc defines three TLS modes — `None`, `StartTLS`, `SSL` — but the code collapses them to a single boolean, so `SSL` (implicit TLS, typically port 465) is treated identically to `None`/`Auto` and the SSL mode is effectively unsupported. The `bool useTls` parameter cannot represent the three-state requirement.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Pass the `TlsMode` string (or a `TlsMode` enum) through to the wrapper and map explicitly: `None` → `SecureSocketOptions.None`, `StartTLS` → `SecureSocketOptions.StartTls`, `SSL` → `SecureSocketOptions.SslOnConnect`. Validate the configured value and reject unknown modes.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-006 — OAuth2 token cache is keyed to nothing; wrong token returned when multiple SMTP configs exist
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/OAuth2TokenService.cs:14-15`, `src/ScadaLink.NotificationService/OAuth2TokenService.cs:30-35` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`OAuth2TokenService` is registered as a singleton and stores a single `_cachedToken`/`_tokenExpiry` pair. `GetTokenAsync` ignores the `credentials` argument when deciding whether the cache is valid — it only checks expiry. If two SMTP configurations with different tenant/client credentials are ever used (the repository's `GetAllSmtpConfigurationsAsync` returns a list, implying multiple configs are possible), the second caller receives the first caller's token, which will fail authentication against the second tenant. Even with a single config today this is a latent correctness bug and makes the service's behaviour depend on call order.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Key the cache by the credential identity (e.g. a dictionary keyed by `tenantId:clientId`, or by a hash of the credential string), or document and enforce the single-SMTP-config invariant. Given the design doc says one SMTP config is deployed per site, enforcing the invariant is acceptable but should be explicit.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-007 — Connection timeout and max-concurrent-connections from the design doc are not implemented
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationOptions.cs:11-14`, `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:16-20`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:111-140` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc specifies an SMTP "Connection timeout (default 30s)" and "Max concurrent connections (default 5)", and `NotificationOptions`/`SmtpConfiguration` both carry these fields. Neither is enforced: `MailKitSmtpClientWrapper.ConnectAsync` never sets `SmtpClient.Timeout`, so the connection relies on MailKit's default timeout rather than the configured value (only the caller's `CancellationToken` bounds it, and callers may pass `default`). There is no semaphore or other throttle limiting concurrent SMTP connections per site, so `MaxConcurrentConnections` has no effect. Both options exist but are dead configuration.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Set `SmtpClient.Timeout` from `ConnectionTimeoutSeconds` in `ConnectAsync` (and/or derive a linked `CancellationTokenSource`). Introduce a `SemaphoreSlim(MaxConcurrentConnections)` gating `DeliverAsync`. If these limits are intentionally deferred, mark the options `[Obsolete]`/document them as not-yet-enforced and note the gap in the design doc.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-008 — Recipient email addresses are not validated before send
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:136-137`, `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:50-53` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SendAsync` builds `bccAddresses` directly from `recipient.EmailAddress` and passes them to `MailboxAddress.Parse`. If any recipient row has a malformed address, `MailboxAddress.Parse` throws `ParseException`. `ParseException` is not a `TimeoutException`/`SocketException`/`IOException` and its message will not generally contain "4." or "5.", so it falls through `DeliverAsync`'s outer `catch ... when (... && !IsTransientSmtpError(ex))` filter, which re-throws it (`:153`); it then escapes `SendAsync` entirely as an unhandled exception (the `SendAsync` catch blocks only cover `SmtpPermanentException` and transient errors). A single bad address in a list therefore crashes the send with an exception type the calling script is not told to expect, instead of producing a clean `NotificationResult` error. The same applies to a malformed `FromAddress`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Validate addresses up front (e.g. `MailboxAddress.TryParse`) and return a `NotificationResult(false, ...)` listing invalid recipients, or wrap `DeliverAsync` so any non-classified exception becomes a permanent `NotificationResult` failure rather than escaping. Consider validating addresses at definition time in the Central UI as well.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-009 — Credentials handled as plaintext strings; risk of OAuth2 client secret being logged
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:127-134`, `src/ScadaLink.NotificationService/OAuth2TokenService.cs:30-65`, `src/ScadaLink.Commons/Entities/Notifications/SmtpConfiguration.cs:9` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
SMTP credentials — Basic Auth `user:pass` and OAuth2 `tenantId:clientId:clientSecret` — are stored and passed as a single colon-delimited plaintext `string` (`SmtpConfiguration.Credentials`). There is no indication the value is encrypted at rest in SQLite or in the central config DB. The colon-delimited packing is also brittle: a password or client secret containing a `:` will be split incorrectly (`Split(':', 2)` / `Split(':', 3)`), silently corrupting the secret. Separately, while the current code does not log the secret directly, the substring-based error classification logs full exception messages (`_logger.LogWarning(ex, ...)`, `LogError(ex, ...)`) and MailKit exceptions can echo back server responses; an authentication failure message could surface credential fragments into logs. There is no defensive scrubbing.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Store credentials encrypted at rest (DPAPI/Data Protection or a secret store) and model them as structured fields rather than a colon-packed string, so secrets containing `:` are safe. Ensure credential values are never written to logs; consider a redaction step on exception messages before logging.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-010 — `DeliverAsync` does not disconnect the SMTP client on failure
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:121-154` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`DisconnectAsync` is only called at `:139`, on the success path inside the `try` block. If `AuthenticateAsync` or `SendAsync` throws, control jumps to the `catch` filter at `:141` and the method exits (re-throwing or wrapping) without ever calling `DisconnectAsync`. Combined with NS-004 (the client is never disposed either), a failed send leaves an open, authenticated SMTP connection until the socket is eventually reclaimed by finalization. Under sustained transient failures this can exhaust the SMTP server's connection slots.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Move disconnect/dispose into a `finally` block (or use `await using` once `ISmtpClientWrapper` supports `IAsyncDisposable`) so the connection is always torn down regardless of outcome.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-011 — `SmtpPermanentException` declared in the wrong file; module conventions
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:173-177`, `src/ScadaLink.Commons/Entities/Notifications/SmtpConfiguration.cs:5-15` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Two minor convention issues. (1) `SmtpPermanentException` is a public exception type declared at the bottom of `NotificationDeliveryService.cs` rather than in its own file (`SmtpPermanentException.cs`), which is inconsistent with the one-type-per-file layout used elsewhere and makes it harder to locate. (2) `SmtpConfiguration` (a Commons POCO) declares non-nullable `string` properties (`Host`, `AuthType`, `FromAddress`) that are only guaranteed by the constructor; EF Core materialization or object-initializer use can leave them null while the type system says otherwise. These are persistence-ignorant POCO concerns but worth flagging because the delivery service dereferences `config.Host`, `config.AuthType`, `config.FromAddress` without null checks.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Move `SmtpPermanentException` to its own file. For `SmtpConfiguration`, either keep the constructor as the only path and document it, or use `required` members so the compiler enforces initialization.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### NotificationService-012 — Test coverage gaps: OAuth2 delivery path, permanent-classification fallback, token-cache concurrency
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.NotificationService.Tests/NotificationDeliveryServiceTests.cs`, `tests/ScadaLink.NotificationService.Tests/OAuth2TokenServiceTests.cs` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The tests cover the happy path, list-not-found, no-recipients, no-SMTP-config, permanent failure, transient-without-S&F, and transient-with-S&F buffering. Notable untested paths: (1) the OAuth2 delivery branch in `DeliverAsync:128-132` — every test uses `tokenService: null` and Basic Auth, so OAuth2 token resolution during a send is never exercised; (2) `DeliverAsync`'s permanent-classification fallback (`:144-149`) that promotes a generic exception whose message contains "550"/"553"/"554" to `SmtpPermanentException` is never tested; (3) `OAuth2TokenServiceTests` never tests concurrent `GetTokenAsync` calls (the double-checked-locking path) or token expiry/refresh — the cache test uses a 3600s token so refresh never triggers; (4) no test covers the transient-with-S&F path actually delivering after retry (which would also have caught NS-001). Given NS-001 is a critical defect, the absence of an end-to-end buffer-and-retry test is significant.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for: OAuth2-authenticated send with a mocked `OAuth2TokenService`; the `DeliverAsync` 5xx-message permanent fallback; token expiry/refresh (short `expires_in`); concurrent token acquisition; and an end-to-end buffered-notification retry once a `Notification` S&F handler is registered.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,313 @@
|
|||||||
|
# Code Reviews
|
||||||
|
|
||||||
|
Comprehensive, per-module code reviews of the ScadaLink codebase. Each module (one
|
||||||
|
buildable project under `src/`) has its own folder containing a `findings.md`. This
|
||||||
|
README is the aggregated index — the single place to see all outstanding work.
|
||||||
|
|
||||||
|
> Generated by `regen-readme.py` from the per-module `findings.md` files. Do not
|
||||||
|
> edit by hand — edit the findings files and re-run the script.
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
- Reviews are performed one module at a time against a fixed checklist.
|
||||||
|
- Every finding is recorded in the module's `findings.md` with a severity and status.
|
||||||
|
- Findings are **never deleted** — they are closed by changing their status, keeping
|
||||||
|
a full audit trail.
|
||||||
|
- This README aggregates every **pending** finding (`Open` / `In Progress`) across all
|
||||||
|
modules.
|
||||||
|
|
||||||
|
See **[REVIEW-PROCESS.md](REVIEW-PROCESS.md)** for the full procedure: the review
|
||||||
|
checklist, severity definitions, finding format, and how to mark items resolved.
|
||||||
|
|
||||||
|
## Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
code-reviews/
|
||||||
|
├── README.md # this file — process overview + pending findings
|
||||||
|
├── REVIEW-PROCESS.md # how to perform a review and track findings
|
||||||
|
├── regen-readme.py # regenerates this README from the findings files
|
||||||
|
├── _template/findings.md # copy-this template for a module review
|
||||||
|
└── <Module>/findings.md # one folder per src/ project
|
||||||
|
```
|
||||||
|
|
||||||
|
## Baseline review — 2026-05-16
|
||||||
|
|
||||||
|
All 19 modules were reviewed at commit `9c60592` (241 findings: 6 Critical, 46 High,
|
||||||
|
100 Medium, 89 Low). The tables below track what remains **open** as findings are
|
||||||
|
resolved and re-triaged; findings discovered after the baseline are appended to their
|
||||||
|
module file and counted in **Total**.
|
||||||
|
|
||||||
|
| Severity | Open findings |
|
||||||
|
|----------|---------------|
|
||||||
|
| Critical | 0 |
|
||||||
|
| High | 28 |
|
||||||
|
| Medium | 100 |
|
||||||
|
| Low | 89 |
|
||||||
|
| **Total** | **217** |
|
||||||
|
|
||||||
|
## Module Status
|
||||||
|
|
||||||
|
| Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total |
|
||||||
|
|--------|---------------|--------|----------------|------|-------|
|
||||||
|
| [CLI](CLI/findings.md) | 2026-05-16 | `9c60592` | 0/0/6/6 | 12 | 13 |
|
||||||
|
| [CentralUI](CentralUI/findings.md) | 2026-05-16 | `9c60592` | 0/0/10/5 | 15 | 19 |
|
||||||
|
| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-16 | `9c60592` | 0/1/4/3 | 8 | 8 |
|
||||||
|
| [Commons](Commons/findings.md) | 2026-05-16 | `9c60592` | 0/0/4/8 | 12 | 12 |
|
||||||
|
| [Communication](Communication/findings.md) | 2026-05-16 | `9c60592` | 0/0/5/3 | 8 | 11 |
|
||||||
|
| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-16 | `9c60592` | 0/0/4/6 | 10 | 11 |
|
||||||
|
| [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-16 | `9c60592` | 0/0/6/2 | 8 | 13 |
|
||||||
|
| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-16 | `9c60592` | 0/1/6/5 | 12 | 14 |
|
||||||
|
| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-16 | `9c60592` | 0/0/7/4 | 11 | 14 |
|
||||||
|
| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-16 | `9c60592` | 0/0/5/5 | 10 | 12 |
|
||||||
|
| [Host](Host/findings.md) | 2026-05-16 | `9c60592` | 0/0/3/7 | 10 | 11 |
|
||||||
|
| [InboundAPI](InboundAPI/findings.md) | 2026-05-16 | `9c60592` | 0/3/5/5 | 13 | 13 |
|
||||||
|
| [ManagementService](ManagementService/findings.md) | 2026-05-16 | `9c60592` | 0/3/5/5 | 13 | 13 |
|
||||||
|
| [NotificationService](NotificationService/findings.md) | 2026-05-16 | `9c60592` | 0/3/5/3 | 11 | 12 |
|
||||||
|
| [Security](Security/findings.md) | 2026-05-16 | `9c60592` | 0/3/4/4 | 11 | 11 |
|
||||||
|
| [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-16 | `9c60592` | 0/4/4/3 | 11 | 11 |
|
||||||
|
| [SiteRuntime](SiteRuntime/findings.md) | 2026-05-16 | `9c60592` | 0/3/8/5 | 16 | 16 |
|
||||||
|
| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-16 | `9c60592` | 0/2/4/6 | 12 | 14 |
|
||||||
|
| [TemplateEngine](TemplateEngine/findings.md) | 2026-05-16 | `9c60592` | 0/5/5/4 | 14 | 14 |
|
||||||
|
|
||||||
|
## Pending Findings
|
||||||
|
|
||||||
|
Every `Open` / `In Progress` finding across all modules, highest severity first.
|
||||||
|
Resolved findings drop off this list but remain recorded in their module's
|
||||||
|
`findings.md` (see [REVIEW-PROCESS.md](REVIEW-PROCESS.md) §4–§5). Full detail —
|
||||||
|
description, location, recommendation — lives in the module's `findings.md`.
|
||||||
|
|
||||||
|
### Critical (0)
|
||||||
|
|
||||||
|
_None open._
|
||||||
|
|
||||||
|
### High (28)
|
||||||
|
|
||||||
|
| ID | Module | Title |
|
||||||
|
|----|--------|-------|
|
||||||
|
| ClusterInfrastructure-001 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | Module implements none of its documented responsibilities |
|
||||||
|
| DeploymentManager-006 | [DeploymentManager](DeploymentManager/findings.md) | Query-the-site-before-redeploy idempotency requirement not implemented |
|
||||||
|
| InboundAPI-001 | [InboundAPI](InboundAPI/findings.md) | Singleton script handler cache mutated without synchronization |
|
||||||
|
| InboundAPI-003 | [InboundAPI](InboundAPI/findings.md) | API key compared with non-constant-time string equality |
|
||||||
|
| InboundAPI-005 | [InboundAPI](InboundAPI/findings.md) | Compiled API scripts run with no script-trust-model enforcement |
|
||||||
|
| ManagementService-001 | [ManagementService](ManagementService/findings.md) | Remote-query and debug-snapshot handlers bypass site-scope enforcement |
|
||||||
|
| ManagementService-002 | [ManagementService](ManagementService/findings.md) | Single-entity query handlers leak data across site scope |
|
||||||
|
| ManagementService-003 | [ManagementService](ManagementService/findings.md) | DebugStreamHub.SubscribeInstance performs no per-instance authorization |
|
||||||
|
| NotificationService-002 | [NotificationService](NotificationService/findings.md) | `TimeoutException`/`OperationCanceledException` misclassified as transient |
|
||||||
|
| NotificationService-003 | [NotificationService](NotificationService/findings.md) | Error classification by substring matching on exception messages is fragile |
|
||||||
|
| NotificationService-004 | [NotificationService](NotificationService/findings.md) | `DeliverAsync` constructs two SMTP clients and leaks the used one |
|
||||||
|
| Security-001 | [Security](Security/findings.md) | StartTLS upgrade path is unreachable dead code |
|
||||||
|
| Security-002 | [Security](Security/findings.md) | Authentication cookie is not marked `Secure` |
|
||||||
|
| Security-003 | [Security](Security/findings.md) | JWT signing key length is never validated |
|
||||||
|
| SiteEventLogging-001 | [SiteEventLogging](SiteEventLogging/findings.md) | `PRAGMA incremental_vacuum` is a no-op; storage cap cannot reclaim space |
|
||||||
|
| SiteEventLogging-002 | [SiteEventLogging](SiteEventLogging/findings.md) | Storage-cap purge deletes the entire table when space is not reclaimed |
|
||||||
|
| SiteEventLogging-003 | [SiteEventLogging](SiteEventLogging/findings.md) | Shared `SqliteConnection` used by purge and query without the write lock |
|
||||||
|
| SiteEventLogging-004 | [SiteEventLogging](SiteEventLogging/findings.md) | Event-log handler runs as a cluster singleton that can land on the standby node |
|
||||||
|
| SiteRuntime-001 | [SiteRuntime](SiteRuntime/findings.md) | `Instance.SetAttribute` never writes to the Data Connection Layer |
|
||||||
|
| SiteRuntime-002 | [SiteRuntime](SiteRuntime/findings.md) | `RouteInboundApiSetAttributes` always treats writes as static overrides |
|
||||||
|
| SiteRuntime-003 | [SiteRuntime](SiteRuntime/findings.md) | Redeployment relies on a fixed 500 ms reschedule and can collide on the child actor name |
|
||||||
|
| StoreAndForward-002 | [StoreAndForward](StoreAndForward/findings.md) | Messages enqueued with no registered handler are buffered but never deliverable |
|
||||||
|
| StoreAndForward-003 | [StoreAndForward](StoreAndForward/findings.md) | Off-by-one in retry accounting: immediate failure pre-counts as retry 1 |
|
||||||
|
| TemplateEngine-001 | [TemplateEngine](TemplateEngine/findings.md) | Deeply nested composed members are dropped during flattening |
|
||||||
|
| TemplateEngine-002 | [TemplateEngine](TemplateEngine/findings.md) | Derived templates omit all base alarms; composed alarms cannot be overridden per slot |
|
||||||
|
| TemplateEngine-003 | [TemplateEngine](TemplateEngine/findings.md) | `UpdateAttributeAsync` lets a non-locked attribute change its fixed DataType / DataSourceReference |
|
||||||
|
| TemplateEngine-004 | [TemplateEngine](TemplateEngine/findings.md) | Alarm on-trigger script references are never resolved (empty placeholder) |
|
||||||
|
| TemplateEngine-005 | [TemplateEngine](TemplateEngine/findings.md) | Collision validation is skipped when creating a child template |
|
||||||
|
|
||||||
|
### Medium (100)
|
||||||
|
|
||||||
|
| ID | Module | Title |
|
||||||
|
|----|--------|-------|
|
||||||
|
| CLI-002 | [CLI](CLI/findings.md) | Empty success body crashes table rendering with an unhandled exception |
|
||||||
|
| CLI-003 | [CLI](CLI/findings.md) | Non-JSON success body crashes table rendering |
|
||||||
|
| CLI-004 | [CLI](CLI/findings.md) | Malformed `--url` throws an unhandled `UriFormatException` |
|
||||||
|
| CLI-005 | [CLI](CLI/findings.md) | Malformed `--bindings` / `--overrides` JSON throws unhandled exceptions |
|
||||||
|
| CLI-006 | [CLI](CLI/findings.md) | Password is passed as a command-line argument with no safer alternative |
|
||||||
|
| CLI-007 | [CLI](CLI/findings.md) | `Component-CLI.md` command surface is substantially stale |
|
||||||
|
| CentralUI-005 | [CentralUI](CentralUI/findings.md) | Session expiry implementation diverges from the documented policy |
|
||||||
|
| CentralUI-006 | [CentralUI](CentralUI/findings.md) | Deployment status page polls every 10s despite the documented SignalR-push design |
|
||||||
|
| CentralUI-007 | [CentralUI](CentralUI/findings.md) | Monitoring nav links to Deployment-only pages are shown to all roles |
|
||||||
|
| CentralUI-008 | [CentralUI](CentralUI/findings.md) | Audit-log date filters treat browser-local datetimes as UTC |
|
||||||
|
| CentralUI-009 | [CentralUI](CentralUI/findings.md) | `DebugView` stream callbacks touch a possibly-disposed `ToastNotification` |
|
||||||
|
| CentralUI-010 | [CentralUI](CentralUI/findings.md) | `ToastNotification` auto-dismiss continuation runs after component disposal |
|
||||||
|
| CentralUI-011 | [CentralUI](CentralUI/findings.md) | `DiffDialog` leaves a dangling `TaskCompletionSource` when disposed while open |
|
||||||
|
| CentralUI-012 | [CentralUI](CentralUI/findings.md) | N+1 query loading data connections for the Sites page |
|
||||||
|
| CentralUI-013 | [CentralUI](CentralUI/findings.md) | `ScriptAnalysisService` blocks on async shared-script lookups |
|
||||||
|
| CentralUI-014 | [CentralUI](CentralUI/findings.md) | Test Run side effects (HTTP/SQL/SMTP) fire against production services |
|
||||||
|
| ClusterInfrastructure-002 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | No-op DI extension methods report success while doing nothing |
|
||||||
|
| ClusterInfrastructure-003 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | ClusterOptions omits several documented node-configuration settings |
|
||||||
|
| ClusterInfrastructure-004 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | ClusterOptions has no validation despite safety-critical values |
|
||||||
|
| ClusterInfrastructure-006 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | No tests for any cluster behaviour; only the options POCO is covered |
|
||||||
|
| Commons-001 | [Commons](Commons/findings.md) | `StaleTagMonitor` stale-fire race between timer and `OnValueReceived` |
|
||||||
|
| Commons-002 | [Commons](Commons/findings.md) | `DynamicJsonElement` retains a `JsonElement` whose `JsonDocument` lifetime it does not own |
|
||||||
|
| Commons-003 | [Commons](Commons/findings.md) | `ScriptParameters.GetNullable` silently swallows conversion failures |
|
||||||
|
| Commons-004 | [Commons](Commons/findings.md) | `ManagementCommandRegistry` name mapping is asymmetric and namespace-scoped |
|
||||||
|
| Communication-004 | [Communication](Communication/findings.md) | Coordinator actors declare no SupervisorStrategy (design requires Resume) |
|
||||||
|
| Communication-005 | [Communication](Communication/findings.md) | gRPC keepalive and max-stream-lifetime options are defined but never applied |
|
||||||
|
| Communication-006 | [Communication](Communication/findings.md) | Site address load failures are silently swallowed, leaving a stale cache |
|
||||||
|
| Communication-007 | [Communication](Communication/findings.md) | `SiteStreamGrpcClientFactory.Dispose` blocks on async work (sync-over-async) |
|
||||||
|
| Communication-008 | [Communication](Communication/findings.md) | Reconnect retry-count reset can mask a flapping stream indefinitely |
|
||||||
|
| ConfigurationDatabase-002 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Hardcoded `sa` connection string with embedded password literal |
|
||||||
|
| ConfigurationDatabase-003 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | No-arg `AddConfigurationDatabase()` silently registers nothing |
|
||||||
|
| ConfigurationDatabase-004 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Secret-bearing columns stored in plaintext with no protection |
|
||||||
|
| ConfigurationDatabase-007 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `AuditService` does not handle JSON-serialization failure of arbitrary `afterState` |
|
||||||
|
| DataConnectionLayer-006 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Health quality counters not reset/recomputed after failover or re-subscribe |
|
||||||
|
| DataConnectionLayer-007 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `ReadBatchAsync` aborts the whole batch on the first failing tag |
|
||||||
|
| DataConnectionLayer-009 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Implemented failover heuristic diverges from the documented state machine |
|
||||||
|
| DataConnectionLayer-010 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Tag-resolution retry can issue duplicate concurrent subscribe attempts |
|
||||||
|
| DataConnectionLayer-011 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Stale subscription callbacks from disposed adapters can still reach the actor |
|
||||||
|
| DataConnectionLayer-012 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `AutoAcceptUntrustedCerts` defaults to `true`, accepting any server certificate |
|
||||||
|
| DeploymentManager-003 | [DeploymentManager](DeploymentManager/findings.md) | Successful-deployment cleanup is not atomic with the status write |
|
||||||
|
| DeploymentManager-004 | [DeploymentManager](DeploymentManager/findings.md) | Site-success but central-delete-failure leaves orphaned site config |
|
||||||
|
| DeploymentManager-005 | [DeploymentManager](DeploymentManager/findings.md) | `OperationLockManager` leaks a `SemaphoreSlim` per instance name |
|
||||||
|
| DeploymentManager-007 | [DeploymentManager](DeploymentManager/findings.md) | "Diff View" reduced to a hash comparison with no diff detail |
|
||||||
|
| DeploymentManager-008 | [DeploymentManager](DeploymentManager/findings.md) | `DeploymentManagerOptions` is never bound to configuration |
|
||||||
|
| DeploymentManager-011 | [DeploymentManager](DeploymentManager/findings.md) | Tests never exercise a successful deployment or lifecycle success path |
|
||||||
|
| ExternalSystemGateway-004 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | System retry settings are not honoured for cached calls/writes |
|
||||||
|
| ExternalSystemGateway-005 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `HttpRequestMessage` and `HttpResponseMessage` are not disposed |
|
||||||
|
| ExternalSystemGateway-006 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `BuildUrl` ignores path templates and appends a trailing slash for empty paths |
|
||||||
|
| ExternalSystemGateway-007 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | External error response bodies are echoed verbatim into script-visible error messages |
|
||||||
|
| ExternalSystemGateway-008 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Cancellation is conflated with transient timeout failure |
|
||||||
|
| ExternalSystemGateway-009 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `StoreAndForwardResult` from `EnqueueAsync` is discarded; permanent failures during buffering are swallowed |
|
||||||
|
| ExternalSystemGateway-010 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `GetConnectionAsync` leaks the `SqlConnection` when `OpenAsync` fails |
|
||||||
|
| HealthMonitoring-003 | [HealthMonitoring](HealthMonitoring/findings.md) | Shared state mutated inside `ConcurrentDictionary.AddOrUpdate` update delegate |
|
||||||
|
| HealthMonitoring-005 | [HealthMonitoring](HealthMonitoring/findings.md) | Central self-report site can flap offline; no heartbeat grace like real sites |
|
||||||
|
| HealthMonitoring-007 | [HealthMonitoring](HealthMonitoring/findings.md) | Heartbeats for not-yet-registered sites are silently dropped |
|
||||||
|
| HealthMonitoring-008 | [HealthMonitoring](HealthMonitoring/findings.md) | `GetAllSiteStates` / `GetSiteState` leak live mutable state objects to callers |
|
||||||
|
| HealthMonitoring-009 | [HealthMonitoring](HealthMonitoring/findings.md) | Missing test coverage for central report loop, heartbeat path, replication, and collector setters |
|
||||||
|
| Host-002 | [Host](Host/findings.md) | Akka.Persistence required by REQ-HOST-6 is not configured and not used |
|
||||||
|
| Host-003 | [Host](Host/findings.md) | Secrets committed in plaintext in `appsettings.Central.json` |
|
||||||
|
| Host-004 | [Host](Host/findings.md) | Site seed-node list points at the gRPC port, not a remoting port |
|
||||||
|
| InboundAPI-002 | [InboundAPI](InboundAPI/findings.md) | Lazy compilation is a check-then-act race with no atomicity |
|
||||||
|
| InboundAPI-004 | [InboundAPI](InboundAPI/findings.md) | Client disconnect is misreported as a script timeout |
|
||||||
|
| InboundAPI-006 | [InboundAPI](InboundAPI/findings.md) | No request body size limit on the inbound endpoint |
|
||||||
|
| InboundAPI-007 | [InboundAPI](InboundAPI/findings.md) | `Database.Connection()` script API from the design doc is not implemented |
|
||||||
|
| InboundAPI-008 | [InboundAPI](InboundAPI/findings.md) | Inbound API endpoint not restricted to the active central node |
|
||||||
|
| ManagementService-004 | [ManagementService](ManagementService/findings.md) | Actor offloads work to Task.Run instead of using PipeTo |
|
||||||
|
| ManagementService-006 | [ManagementService](ManagementService/findings.md) | JsonDocument instances never disposed in the HTTP endpoint |
|
||||||
|
| ManagementService-007 | [ManagementService](ManagementService/findings.md) | Inconsistent and cycle-prone serialization of repository entities |
|
||||||
|
| ManagementService-009 | [ManagementService](ManagementService/findings.md) | Audit logging applied inconsistently across mutating handlers |
|
||||||
|
| ManagementService-013 | [ManagementService](ManagementService/findings.md) | No tests for site-scope enforcement, the HTTP endpoint, or DebugStreamHub |
|
||||||
|
| NotificationService-005 | [NotificationService](NotificationService/findings.md) | Non-TLS path uses `SecureSocketOptions.Auto`, contradicting the requested mode |
|
||||||
|
| NotificationService-006 | [NotificationService](NotificationService/findings.md) | OAuth2 token cache is keyed to nothing; wrong token returned when multiple SMTP configs exist |
|
||||||
|
| NotificationService-007 | [NotificationService](NotificationService/findings.md) | Connection timeout and max-concurrent-connections from the design doc are not implemented |
|
||||||
|
| NotificationService-008 | [NotificationService](NotificationService/findings.md) | Recipient email addresses are not validated before send |
|
||||||
|
| NotificationService-009 | [NotificationService](NotificationService/findings.md) | Credentials handled as plaintext strings; risk of OAuth2 client secret being logged |
|
||||||
|
| Security-004 | [Security](Security/findings.md) | Search filter uses `uid=` while fallback DN construction uses `cn=` |
|
||||||
|
| Security-005 | [Security](Security/findings.md) | DN injection in the no-service-account bind fallback |
|
||||||
|
| Security-006 | [Security](Security/findings.md) | JWT validation disables issuer and audience checks |
|
||||||
|
| Security-007 | [Security](Security/findings.md) | Idle-timeout claim is reset on every token refresh |
|
||||||
|
| SiteEventLogging-005 | [SiteEventLogging](SiteEventLogging/findings.md) | `LogEventAsync` performs synchronous disk I/O on the caller's thread |
|
||||||
|
| SiteEventLogging-007 | [SiteEventLogging](SiteEventLogging/findings.md) | `ISiteEventLogger` consumers downcast to the concrete type and reach into the DB connection |
|
||||||
|
| SiteEventLogging-008 | [SiteEventLogging](SiteEventLogging/findings.md) | Event-recording write failures are silently swallowed |
|
||||||
|
| SiteEventLogging-010 | [SiteEventLogging](SiteEventLogging/findings.md) | Test coverage gaps: actor bridge, purge/write concurrency, vacuum effectiveness, query error path |
|
||||||
|
| SiteRuntime-004 | [SiteRuntime](SiteRuntime/findings.md) | `_totalDeployedCount` is incremented on redeployment of an existing instance |
|
||||||
|
| SiteRuntime-005 | [SiteRuntime](SiteRuntime/findings.md) | Deployment reports `Success` to central before persistence completes |
|
||||||
|
| SiteRuntime-006 | [SiteRuntime](SiteRuntime/findings.md) | Site-local repositories read `SiteStorageService` private field via reflection |
|
||||||
|
| SiteRuntime-007 | [SiteRuntime](SiteRuntime/findings.md) | Synthetic entity IDs use the non-deterministic `string.GetHashCode()` |
|
||||||
|
| SiteRuntime-008 | [SiteRuntime](SiteRuntime/findings.md) | Blocking `.GetAwaiter().GetResult()` on the actor thread during startup |
|
||||||
|
| SiteRuntime-009 | [SiteRuntime](SiteRuntime/findings.md) | Script execution actors run scripts on the default thread pool, not a dedicated dispatcher |
|
||||||
|
| SiteRuntime-010 | [SiteRuntime](SiteRuntime/findings.md) | `EnsureDclConnections` never updates a connection whose configuration changed |
|
||||||
|
| SiteRuntime-011 | [SiteRuntime](SiteRuntime/findings.md) | Trust-model validation is a substring scan and is both over- and under-inclusive |
|
||||||
|
| StoreAndForward-004 | [StoreAndForward](StoreAndForward/findings.md) | `RegisterDeliveryHandler` XML doc contradicts the implemented contract |
|
||||||
|
| StoreAndForward-005 | [StoreAndForward](StoreAndForward/findings.md) | Parked-message retry/discard can race with the in-progress retry sweep |
|
||||||
|
| StoreAndForward-010 | [StoreAndForward](StoreAndForward/findings.md) | Retry of a parked message does not reset `LastAttemptAt`, so its retry timing is unspecified |
|
||||||
|
| StoreAndForward-013 | [StoreAndForward](StoreAndForward/findings.md) | Critical paths lack test coverage: retry-due timing, replication-from-active, and the actor bridge |
|
||||||
|
| TemplateEngine-006 | [TemplateEngine](TemplateEngine/findings.md) | Forbidden-API enforcement is a naive substring scan (bypassable and false-positive prone) |
|
||||||
|
| TemplateEngine-007 | [TemplateEngine](TemplateEngine/findings.md) | Brace-balance "compilation" misjudges verbatim / interpolated / raw strings |
|
||||||
|
| TemplateEngine-008 | [TemplateEngine](TemplateEngine/findings.md) | `SetAlarmOverrideAsync` accepts overrides for unknown / composed alarms with no validation |
|
||||||
|
| TemplateEngine-009 | [TemplateEngine](TemplateEngine/findings.md) | N+1 query in `TemplateDeletionService.CanDeleteTemplateAsync` |
|
||||||
|
| TemplateEngine-010 | [TemplateEngine](TemplateEngine/findings.md) | `InstanceService` documents optimistic concurrency that is not implemented |
|
||||||
|
|
||||||
|
### Low (89)
|
||||||
|
|
||||||
|
| ID | Module | Title |
|
||||||
|
|----|--------|-------|
|
||||||
|
| CLI-008 | [CLI](CLI/findings.md) | `--format` value is not validated |
|
||||||
|
| CLI-009 | [CLI](CLI/findings.md) | Exit-code documentation does not match `HandleResponse` behaviour |
|
||||||
|
| CLI-010 | [CLI](CLI/findings.md) | `debug stream` reports Ctrl+C during connect as a connection failure |
|
||||||
|
| CLI-011 | [CLI](CLI/findings.md) | `CancellationTokenSource` in `debug stream` is never disposed |
|
||||||
|
| CLI-012 | [CLI](CLI/findings.md) | `debug stream` exit code is unreliable after stream termination |
|
||||||
|
| CLI-013 | [CLI](CLI/findings.md) | HTTP client, `debug stream`, and JSON-argument parsing are untested |
|
||||||
|
| CentralUI-015 | [CentralUI](CentralUI/findings.md) | `DialogService` continuations resolve off the render thread |
|
||||||
|
| CentralUI-016 | [CentralUI](CentralUI/findings.md) | Pagers render one button per page with no windowing |
|
||||||
|
| CentralUI-017 | [CentralUI](CentralUI/findings.md) | `/auth/logout` POST disables antiforgery, enabling logout CSRF |
|
||||||
|
| CentralUI-018 | [CentralUI](CentralUI/findings.md) | Broad `catch {}` blocks swallow JS interop and storage errors silently |
|
||||||
|
| CentralUI-019 | [CentralUI](CentralUI/findings.md) | Sparse unit-test coverage for a large module; critical paths untested |
|
||||||
|
| ClusterInfrastructure-005 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | No configuration section name constant for the Options pattern binding |
|
||||||
|
| ClusterInfrastructure-007 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | ClusterOptions lacks XML documentation comments |
|
||||||
|
| ClusterInfrastructure-008 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | "Phase 0 skeleton" status is undocumented at the module level |
|
||||||
|
| Commons-005 | [Commons](Commons/findings.md) | `OpcUaEndpointConfigSerializer.Deserialize` discards malformed legacy input and over-reports `IsLegacy` |
|
||||||
|
| Commons-006 | [Commons](Commons/findings.md) | `DynamicJsonElement.TryConvert` reports success for unconvertible target types |
|
||||||
|
| Commons-007 | [Commons](Commons/findings.md) | Several Commons types carry non-trivial logic, stretching REQ-COM-6 |
|
||||||
|
| Commons-008 | [Commons](Commons/findings.md) | `SetConnectionBindingsCommand` uses `ValueTuple` in a wire message contract |
|
||||||
|
| Commons-009 | [Commons](Commons/findings.md) | `Component-Commons.md` is stale relative to the actual file set |
|
||||||
|
| Commons-010 | [Commons](Commons/findings.md) | Behavior-bearing Commons types have no unit tests |
|
||||||
|
| Commons-011 | [Commons](Commons/findings.md) | `Result<T>.Failure` accepts a null error string |
|
||||||
|
| Commons-012 | [Commons](Commons/findings.md) | `ValueFormatter` uses current-culture formatting without documenting it |
|
||||||
|
| Communication-009 | [Communication](Communication/findings.md) | `_siteClients` field is mutable and reassignable; cache update is not atomic on failure |
|
||||||
|
| Communication-010 | [Communication](Communication/findings.md) | `DebugStreamBridgeActor` XML doc incorrectly describes it as a "Persistent actor" |
|
||||||
|
| Communication-011 | [Communication](Communication/findings.md) | No test coverage for snapshot-timeout cleanup, address-cache failure, or gRPC reconnect leak |
|
||||||
|
| ConfigurationDatabase-005 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Audit `Id` type disagrees with the design doc |
|
||||||
|
| ConfigurationDatabase-006 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `Site.GrpcNodeAAddress` / `GrpcNodeBAddress` columns are unbounded |
|
||||||
|
| ConfigurationDatabase-008 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `GetApprovedKeysForMethodAsync` CSV parsing silently drops malformed ids |
|
||||||
|
| ConfigurationDatabase-009 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Multi-collection eager loads issue cartesian-product queries |
|
||||||
|
| ConfigurationDatabase-010 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Several repositories and `InstanceLocator` lack direct test coverage |
|
||||||
|
| ConfigurationDatabase-011 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Inconsistent constructor null-guarding across repositories/services |
|
||||||
|
| DataConnectionLayer-008 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `HandleUnsubscribe` is O(n^2) over instances and rechecks `_unresolvedTags` redundantly |
|
||||||
|
| DataConnectionLayer-013 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Misleading XML comment: `RaiseDisconnected` claims thread safety it does not provide |
|
||||||
|
| DeploymentManager-009 | [DeploymentManager](DeploymentManager/findings.md) | Misleading timeout comment on `DeleteInstanceAsync` |
|
||||||
|
| DeploymentManager-010 | [DeploymentManager](DeploymentManager/findings.md) | `SystemArtifactDeploymentRecord` does not persist the deployment ID |
|
||||||
|
| DeploymentManager-012 | [DeploymentManager](DeploymentManager/findings.md) | `LifecycleCommandTimeout` option is dead code |
|
||||||
|
| DeploymentManager-013 | [DeploymentManager](DeploymentManager/findings.md) | SMTP credentials serialized and broadcast to all sites |
|
||||||
|
| DeploymentManager-014 | [DeploymentManager](DeploymentManager/findings.md) | Dead `CreateCommand` helper in artifact tests |
|
||||||
|
| ExternalSystemGateway-011 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Every call performs a full repository scan of all systems and methods |
|
||||||
|
| ExternalSystemGateway-012 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Permanent-failure logging requirement is not met; `_logger` is injected but unused |
|
||||||
|
| ExternalSystemGateway-013 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `MaxConcurrentConnectionsPerSystem` and `DefaultHttpTimeout` options are defined but never used |
|
||||||
|
| ExternalSystemGateway-014 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Cached-call buffering path and `DatabaseGateway` are untested |
|
||||||
|
| HealthMonitoring-004 | [HealthMonitoring](HealthMonitoring/findings.md) | Inconsistent heartbeat interval described across XML docs |
|
||||||
|
| HealthMonitoring-006 | [HealthMonitoring](HealthMonitoring/findings.md) | Sequence seeding contradicts the doc's "starting at 1" wording and is untestable |
|
||||||
|
| HealthMonitoring-010 | [HealthMonitoring](HealthMonitoring/findings.md) | `HealthReportSender` silently swallows inner failures with bare `catch {}` |
|
||||||
|
| HealthMonitoring-011 | [HealthMonitoring](HealthMonitoring/findings.md) | `AddHealthMonitoringActors` is a dead no-op placeholder |
|
||||||
|
| HealthMonitoring-012 | [HealthMonitoring](HealthMonitoring/findings.md) | `SiteHealthState.LatestReport` initialized to `null!`, misrepresenting the contract |
|
||||||
|
| Host-005 | [Host](Host/findings.md) | Blocking sync-over-async (`GetAwaiter().GetResult()`) inside `StartAsync` |
|
||||||
|
| Host-006 | [Host](Host/findings.md) | HOCON assembled by unescaped string interpolation |
|
||||||
|
| Host-007 | [Host](Host/findings.md) | REQ-HOST-4 rule "GrpcPort ≠ RemotingPort" is not enforced |
|
||||||
|
| Host-008 | [Host](Host/findings.md) | `MachineDataDb` is validated and declared but never consumed |
|
||||||
|
| Host-009 | [Host](Host/findings.md) | `StartAsync` reports success before role actors are confirmed running |
|
||||||
|
| Host-010 | [Host](Host/findings.md) | No retry/backoff around startup preconditions (DB migration, readiness) |
|
||||||
|
| Host-011 | [Host](Host/findings.md) | `LoggingOptions.MinimumLevel` is dead configuration |
|
||||||
|
| InboundAPI-009 | [InboundAPI](InboundAPI/findings.md) | Failed compilation is retried on every subsequent request |
|
||||||
|
| InboundAPI-010 | [InboundAPI](InboundAPI/findings.md) | `ParameterValidator` ignores extra body fields and cannot validate Object/List element types |
|
||||||
|
| InboundAPI-011 | [InboundAPI](InboundAPI/findings.md) | Method-existence check leaks to unapproved callers (enumeration oracle) |
|
||||||
|
| InboundAPI-012 | [InboundAPI](InboundAPI/findings.md) | `ParameterDefinition` POCO declared in the component project, not Commons |
|
||||||
|
| InboundAPI-013 | [InboundAPI](InboundAPI/findings.md) | `ApiKeyValidationResult.NotFound` factory returns HTTP 400, contradicting its name |
|
||||||
|
| ManagementService-005 | [ManagementService](ManagementService/findings.md) | ManagementActor declares no supervision strategy |
|
||||||
|
| ManagementService-008 | [ManagementService](ManagementService/findings.md) | HandleResolveRoles constructs RoleMapper manually instead of via DI |
|
||||||
|
| ManagementService-010 | [ManagementService](ManagementService/findings.md) | ManagementServiceOptions.CommandTimeout is defined but never used |
|
||||||
|
| ManagementService-011 | [ManagementService](ManagementService/findings.md) | ResolveRolesCommand dispatch path is stale dead code |
|
||||||
|
| ManagementService-012 | [ManagementService](ManagementService/findings.md) | ManagementEnvelope carries a loosely-typed object payload |
|
||||||
|
| NotificationService-010 | [NotificationService](NotificationService/findings.md) | `DeliverAsync` does not disconnect the SMTP client on failure |
|
||||||
|
| NotificationService-011 | [NotificationService](NotificationService/findings.md) | `SmtpPermanentException` declared in the wrong file; module conventions |
|
||||||
|
| NotificationService-012 | [NotificationService](NotificationService/findings.md) | Test coverage gaps: OAuth2 delivery path, permanent-classification fallback, token-cache concurrency |
|
||||||
|
| Security-008 | [Security](Security/findings.md) | N+1 query loading site-scope rules in `RoleMapper` |
|
||||||
|
| Security-009 | [Security](Security/findings.md) | CancellationToken not honored inside `Task.Run` LDAP calls |
|
||||||
|
| Security-010 | [Security](Security/findings.md) | Design doc contradicts itself on Windows Integrated Authentication |
|
||||||
|
| Security-011 | [Security](Security/findings.md) | Missing tests for security-critical paths |
|
||||||
|
| SiteEventLogging-006 | [SiteEventLogging](SiteEventLogging/findings.md) | Missing indexes for severity and keyword-search query paths |
|
||||||
|
| SiteEventLogging-009 | [SiteEventLogging](SiteEventLogging/findings.md) | XML doc on `LogEventAsync` claims asynchronous behaviour |
|
||||||
|
| SiteEventLogging-011 | [SiteEventLogging](SiteEventLogging/findings.md) | Stale "Phase 4+" placeholder in `ServiceCollectionExtensions` |
|
||||||
|
| SiteRuntime-012 | [SiteRuntime](SiteRuntime/findings.md) | `AttributeAccessor`/`ScopeAccessors` block the script on a synchronous Ask |
|
||||||
|
| SiteRuntime-013 | [SiteRuntime](SiteRuntime/findings.md) | `HandleUnsubscribeDebugView` does nothing despite documented behaviour |
|
||||||
|
| SiteRuntime-014 | [SiteRuntime](SiteRuntime/findings.md) | Trigger-expression evaluation blocks the coordinator actor thread |
|
||||||
|
| SiteRuntime-015 | [SiteRuntime](SiteRuntime/findings.md) | `LoggerFactory` created per Instance Actor and never disposed |
|
||||||
|
| SiteRuntime-016 | [SiteRuntime](SiteRuntime/findings.md) | Short-lived execution actors, replication actor, and repositories are untested |
|
||||||
|
| StoreAndForward-006 | [StoreAndForward](StoreAndForward/findings.md) | `GetParkedMessagesAsync` count and page run without a transaction |
|
||||||
|
| StoreAndForward-007 | [StoreAndForward](StoreAndForward/findings.md) | Async work in `ParkedMessageHandlerActor` uses `ContinueWith` without scheduler/affinity guarantees |
|
||||||
|
| StoreAndForward-008 | [StoreAndForward](StoreAndForward/findings.md) | A SQLite connection is opened and torn down on every storage call |
|
||||||
|
| StoreAndForward-009 | [StoreAndForward](StoreAndForward/findings.md) | `OnActivity` event invocation is not thread-safe against concurrent subscribe/unsubscribe |
|
||||||
|
| StoreAndForward-011 | [StoreAndForward](StoreAndForward/findings.md) | `StoreAndForwardMessageStatus.InFlight` is unused and the doc's "retrying" status is unmodelled |
|
||||||
|
| StoreAndForward-012 | [StoreAndForward](StoreAndForward/findings.md) | `StoreAndForwardMessage` is a persistence entity but lives in the component, not Commons |
|
||||||
|
| TemplateEngine-011 | [TemplateEngine](TemplateEngine/findings.md) | `SortedPropertiesConverterFactory` is dead code with a misleading comment |
|
||||||
|
| TemplateEngine-012 | [TemplateEngine](TemplateEngine/findings.md) | `DataType` enum naming diverges from the design doc |
|
||||||
|
| TemplateEngine-013 | [TemplateEngine](TemplateEngine/findings.md) | `ToDictionary(t => t.Id)` throws on duplicate IDs; cycle detectors overload Id 0 as a sentinel |
|
||||||
|
| TemplateEngine-014 | [TemplateEngine](TemplateEngine/findings.md) | Template-deletion constraint logic is duplicated and divergent |
|
||||||
@@ -0,0 +1,113 @@
|
|||||||
|
# Code Review Process
|
||||||
|
|
||||||
|
This document describes how to perform a comprehensive, per-module code review of
|
||||||
|
the ScadaLink codebase and how to track findings to resolution.
|
||||||
|
|
||||||
|
A **module** is one buildable project under `src/` (e.g. `src/ScadaLink.TemplateEngine`).
|
||||||
|
Each module has its own folder under `code-reviews/` containing a single `findings.md`.
|
||||||
|
|
||||||
|
## 1. Before you start
|
||||||
|
|
||||||
|
1. Pick the module to review. Its folder is `code-reviews/<Module>/` where `<Module>`
|
||||||
|
is the project name with the `ScadaLink.` prefix stripped.
|
||||||
|
2. Identify the design context for the module:
|
||||||
|
- Its component design doc: `docs/requirements/Component-<Name>.md`.
|
||||||
|
- The relevant **Key Design Decisions** in `CLAUDE.md`.
|
||||||
|
- `docs/requirements/HighLevelReqs.md` for cross-cutting requirements.
|
||||||
|
3. Record the exact commit being reviewed: `git rev-parse --short HEAD`. Every review
|
||||||
|
is a snapshot — a finding only means something relative to a known commit.
|
||||||
|
4. Open `code-reviews/<Module>/findings.md` and fill in the header table
|
||||||
|
(reviewer, date, commit SHA).
|
||||||
|
|
||||||
|
## 2. Review checklist
|
||||||
|
|
||||||
|
Work through **every** category below for the module. A comprehensive review means
|
||||||
|
the checklist is completed even where it produces no findings — record "No issues
|
||||||
|
found" for a category rather than leaving it ambiguous.
|
||||||
|
|
||||||
|
1. **Correctness & logic bugs** — off-by-one, null handling, incorrect conditionals,
|
||||||
|
misuse of APIs, broken edge cases.
|
||||||
|
2. **Akka.NET conventions** — supervision strategies (Resume for coordinators, Stop
|
||||||
|
for short-lived actors), `Tell` for hot paths / `Ask` only at system boundaries,
|
||||||
|
message immutability, no blocking on non-blocking dispatchers, no `Sender`/`this`
|
||||||
|
captured in closures (`PipeTo` instead), correlation IDs on request/response.
|
||||||
|
3. **Concurrency & thread safety** — shared mutable state, actor state mutated only
|
||||||
|
on the actor thread, race conditions, correct use of async/await.
|
||||||
|
4. **Error handling & resilience** — exception paths, store-and-forward integration,
|
||||||
|
reconnect/retry logic, failover behaviour, transient vs permanent error
|
||||||
|
classification, graceful degradation.
|
||||||
|
5. **Security** — authentication/authorization checks, input validation, the script
|
||||||
|
trust model (forbidden APIs: `System.IO`, `Process`, `Threading`, `Reflection`,
|
||||||
|
raw network), secret handling, SQL/LDAP injection, logging of sensitive data.
|
||||||
|
6. **Performance & resource management** — `IDisposable` disposal, stream/connection
|
||||||
|
lifetimes, buffering and back-pressure, unnecessary allocations, N+1 queries.
|
||||||
|
7. **Design-document adherence** — does the code match `Component-<Name>.md` and the
|
||||||
|
relevant CLAUDE.md decisions? Flag both code that drifts from the design and design
|
||||||
|
docs that are now stale.
|
||||||
|
8. **Code organization & conventions** — persistence-ignorant POCO entities in
|
||||||
|
Commons, repository interfaces in Commons / implementations in ConfigurationDatabase,
|
||||||
|
namespace hierarchy, Options pattern (options classes owned by component projects),
|
||||||
|
additive-only message contract evolution.
|
||||||
|
9. **Testing coverage** — are the module's behaviours covered by tests in `tests/`?
|
||||||
|
Note untested critical paths and missing edge-case tests.
|
||||||
|
10. **Documentation & comments** — XML doc accuracy, misleading or stale comments,
|
||||||
|
undocumented non-obvious behaviour.
|
||||||
|
|
||||||
|
## 3. Recording findings
|
||||||
|
|
||||||
|
Add one entry per finding to the `## Findings` section of the module's `findings.md`,
|
||||||
|
using the entry format in [`_template/findings.md`](_template/findings.md).
|
||||||
|
|
||||||
|
- **Finding ID** — `<Module>-NNN`, numbered sequentially within the module and never
|
||||||
|
reused (e.g. `TemplateEngine-001`). IDs are permanent even after resolution.
|
||||||
|
- **Severity:**
|
||||||
|
- **Critical** — data loss, security breach, crash/deadlock, or cluster-wide outage.
|
||||||
|
- **High** — incorrect behaviour with significant impact; no safe workaround.
|
||||||
|
- **Medium** — incorrect or risky behaviour with limited impact or a workaround.
|
||||||
|
- **Low** — minor issues, style, maintainability, documentation.
|
||||||
|
- **Category** — one of the 10 checklist categories above.
|
||||||
|
- **Location** — `file:line` (clickable), or a list of locations.
|
||||||
|
- **Description** — what is wrong and why it matters.
|
||||||
|
- **Recommendation** — concrete suggested fix.
|
||||||
|
|
||||||
|
After recording findings, update the module header table (status, open-finding count)
|
||||||
|
and refresh the base README (section 5).
|
||||||
|
|
||||||
|
## 4. Marking an item resolved
|
||||||
|
|
||||||
|
Findings are **never deleted** — they are an audit trail. To close one, change its
|
||||||
|
**Status** and complete the **Resolution** field:
|
||||||
|
|
||||||
|
- `Open` — newly recorded, not yet addressed.
|
||||||
|
- `In Progress` — a fix is actively being worked on.
|
||||||
|
- `Resolved` — fixed. The Resolution field must state the fixing commit SHA, the
|
||||||
|
date, and a one-line description of the fix.
|
||||||
|
- `Won't Fix` — intentionally not fixed. The Resolution field must justify why.
|
||||||
|
- `Deferred` — valid but postponed. The Resolution field must say what it is waiting
|
||||||
|
on (e.g. a tracked issue or a later milestone).
|
||||||
|
|
||||||
|
`Resolved`, `Won't Fix`, and `Deferred` findings are all considered **closed** and
|
||||||
|
drop off the base README's pending list. `Open` and `In Progress` are **pending**.
|
||||||
|
|
||||||
|
## 5. Updating the base README
|
||||||
|
|
||||||
|
`code-reviews/README.md` holds the single cross-module view (process overview, the
|
||||||
|
Pending Findings tables, and the Module Status table). It is **generated** from the
|
||||||
|
per-module `findings.md` files — do not edit it by hand.
|
||||||
|
|
||||||
|
After any review or status change, regenerate it:
|
||||||
|
|
||||||
|
```
|
||||||
|
python3 code-reviews/regen-readme.py
|
||||||
|
```
|
||||||
|
|
||||||
|
`regen-readme.py --check` exits non-zero if `README.md` is stale, for use in CI.
|
||||||
|
|
||||||
|
The per-module `findings.md` files are the source of truth; `README.md` is the
|
||||||
|
aggregated index and must always agree with them — which the script guarantees.
|
||||||
|
|
||||||
|
## 6. Re-reviewing a module
|
||||||
|
|
||||||
|
Re-reviews append to the same `findings.md`. Update the header to the new commit and
|
||||||
|
date, continue the finding numbering from the last used ID, and leave prior findings
|
||||||
|
(including closed ones) in place as history.
|
||||||
@@ -0,0 +1,365 @@
|
|||||||
|
# Code Review — Security
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.Security` |
|
||||||
|
| Design doc | `docs/requirements/Component-Security.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 11 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The Security module is small and reasonably structured: a stateless `LdapAuthService`
|
||||||
|
for search-then-bind authentication, a `JwtTokenService` for HMAC-signed cookie tokens,
|
||||||
|
a `RoleMapper` that resolves LDAP groups to roles, and ASP.NET Core authorization
|
||||||
|
policies plus a site-scope handler. Unit-test coverage of the happy paths is decent.
|
||||||
|
However, the review surfaced several real security weaknesses, the most serious being
|
||||||
|
that **StartTLS is dead code** (the design's "LDAPS or StartTLS" requirement is only
|
||||||
|
half met), that **the authentication cookie is not marked `Secure`** despite the design
|
||||||
|
mandating it, and that **the JWT signing key is never length-validated** so a weak or
|
||||||
|
empty key is silently accepted. There is also a genuine **DN-injection** gap in the
|
||||||
|
no-service-account fallback path, a filter/DN attribute mismatch (`uid=` vs `cn=`) that
|
||||||
|
makes that fallback path internally inconsistent, and an N+1 query in `RoleMapper`.
|
||||||
|
JWT validation also disables issuer/audience checks and the idle-timeout claim is reset
|
||||||
|
on every refresh, weakening the documented 30-minute idle policy. None of these are
|
||||||
|
crash/data-loss bugs, but the TLS, cookie, and key-validation items are security
|
||||||
|
defects that should be fixed before any production deployment.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | `uid=`/`cn=` attribute mismatch between search filter and fallback DN construction (Security-004); StartTLS branch is unreachable (Security-001). |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | No actors in this module — `AddSecurityActors` is an empty placeholder. Nothing to assess. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Services are stateless and DI-scoped; synchronous LDAP calls are wrapped in `Task.Run`. No shared mutable state. No issues found. |
|
||||||
|
| 4 | Error handling & resilience | ☑ | LDAP failure paths return structured `LdapAuthResult`; group-lookup failure is tolerated per design. `ct` not honored inside `Task.Run` bodies (Security-009). |
|
||||||
|
| 5 | Security | ☑ | StartTLS dead code (Security-001), cookie not `Secure` (Security-002), JWT key unvalidated (Security-003), DN injection (Security-005), no issuer/audience validation (Security-006), idle-timeout reset on refresh (Security-007). |
|
||||||
|
| 6 | Performance & resource management | ☑ | N+1 scope-rule query in `RoleMapper` (Security-008). `LdapConnection` correctly disposed via `using`. |
|
||||||
|
| 7 | Design-document adherence | ☑ | StartTLS unsupported and Secure cookie missing both contradict the design doc; design also says "Windows Integrated Authentication" in Responsibilities, contradicting its own Authentication section (Security-010). |
|
||||||
|
| 8 | Code organization & conventions | ☑ | `SecurityOptions` correctly owned by the component; repository interface in Commons. No issues found. |
|
||||||
|
| 9 | Testing coverage | ☑ | No tests for `RoleMapper` N+1 behavior, DN-injection inputs, StartTLS path, or idle-timeout-after-refresh. Insecure-config combinations under-tested (Security-011). |
|
||||||
|
| 10 | Documentation & comments | ☑ | `SecurityOptions` XML docs say direct bind uses `cn={username}` while the search filter uses `uid=` — comment is misleading (covered under Security-004). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### Security-001 — StartTLS upgrade path is unreachable dead code
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/LdapAuthService.cs:37-47` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
When `LdapUseTls` is true the code sets `connection.SecureSocketLayer = true` (LDAPS).
|
||||||
|
The subsequent StartTLS block is guarded by `if (_options.LdapUseTls && !connection.SecureSocketLayer)`.
|
||||||
|
Because `SecureSocketLayer` was just set to `true`, the second condition `!connection.SecureSocketLayer`
|
||||||
|
is always false, so `connection.StartTls()` is never called. The design doc explicitly
|
||||||
|
states LDAP connections must use **"LDAPS (port 636) or StartTLS"** — StartTLS is in
|
||||||
|
practice unsupported. A deployment that intends to use StartTLS on port 389 would get a
|
||||||
|
failing LDAPS-mode (implicit-TLS) handshake against the plaintext port, or worse, an operator may disable
|
||||||
|
TLS entirely to make it work, sending credentials in cleartext.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Introduce an explicit transport mode (e.g. `LdapTransport { Ldaps, StartTls, None }`)
|
||||||
|
or a separate `LdapUseStartTls` flag. For StartTLS, leave `SecureSocketLayer` false,
|
||||||
|
call `connection.Connect`, then call `connection.StartTls()` and verify the negotiated
|
||||||
|
session is encrypted before binding. Remove the unreachable conditional.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-002 — Authentication cookie is not marked `Secure`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/ServiceCollectionExtensions.cs:16-23` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AddCookie` sets `HttpOnly = true` and `SameSite = Strict` but never sets
|
||||||
|
`options.Cookie.SecurePolicy`. The ASP.NET Core default is `CookieSecurePolicy.SameAsRequest`,
|
||||||
|
which permits the cookie (carrying the embedded JWT — a bearer credential) to be sent
|
||||||
|
over plain HTTP. The design doc states the cookie is **"HttpOnly and Secure (requires
|
||||||
|
HTTPS)"**. As written, the module does not enforce that requirement; a misconfigured or
|
||||||
|
HTTP-fronted deployment would transmit the session token in cleartext.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Set `options.Cookie.SecurePolicy = CookieSecurePolicy.Always` in `AddCookie`. Consider
|
||||||
|
also setting `ExpireTimeSpan` and `SlidingExpiration` to align the cookie lifetime with
|
||||||
|
the documented 15-minute JWT / 30-minute idle policy.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-003 — JWT signing key length is never validated
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/JwtTokenService.cs:33`, `src/ScadaLink.Security/SecurityOptions.cs:42` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SecurityOptions.JwtSigningKey` defaults to `string.Empty` and is fed directly into
|
||||||
|
`new SymmetricSecurityKey(Encoding.UTF8.GetBytes(_options.JwtSigningKey))` with no
|
||||||
|
validation. For HS256, RFC 7518 §3.2 requires a key of at least 256 bits (32 bytes); a short or empty
|
||||||
|
key produces a trivially forgeable token. The `SecurityHardeningTests` comment claims a
|
||||||
|
minimum length is "enforced", but no code in this module enforces it — the test only
|
||||||
|
asserts that a 32+ char key works. A deployment with a missing or short `JwtSigningKey`
|
||||||
|
would start successfully and issue weakly-signed tokens.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Validate `JwtSigningKey` at startup — fail fast if it is empty or shorter than 32 bytes.
|
||||||
|
Use an `IValidateOptions<SecurityOptions>` validator or guard in the `JwtTokenService`
|
||||||
|
constructor so a weak key is rejected before any token is issued.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-004 — Search filter uses `uid=` while fallback DN construction uses `cn=`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/LdapAuthService.cs:66`, `:138`, `:157-159` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AuthenticateAsync` and `ResolveUserDnAsync` build the search filter as
|
||||||
|
`(uid={username})`, but the no-service-account fallback in `ResolveUserDnAsync`
|
||||||
|
constructs the bind DN as `cn={username},{LdapSearchBase}`. The `SecurityOptions.LdapServiceAccountDn`
|
||||||
|
XML comment also documents the fallback as `cn={username},{LdapSearchBase}`. A directory
|
||||||
|
keyed on `uid` will succeed via search-then-bind but fail via the direct-bind fallback
|
||||||
|
(and vice versa). The attribute used for lookup is hard-coded and inconsistent across
|
||||||
|
the two code paths, so the two configuration modes are not interchangeable.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Introduce a single configurable `LdapUserIdAttribute` (default `uid`) and use it
|
||||||
|
consistently in both the search filter and the fallback DN. Update the XML doc to match.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-005 — DN injection in the no-service-account bind fallback
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/LdapAuthService.cs:157-159` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
When no service account is configured, the user-supplied `username` is interpolated
|
||||||
|
directly into a distinguished name: `$"cn={username},{LdapSearchBase}"`. `EscapeLdapFilter`
|
||||||
|
escapes *search-filter* metacharacters, but DN construction requires a different
|
||||||
|
escaping scheme (RFC 4514 — `,`, `+`, `"`, `\`, `<`, `>`, `;`, a leading `#` or space, a trailing space).
|
||||||
|
No DN escaping is applied here. A username such as `victim,ou=admins` alters the DN
|
||||||
|
structure, allowing a caller to attempt a bind as a different DN than intended. Combined
|
||||||
|
with the `username.Contains('=')` shortcut at line 129 — which lets a caller supply a
|
||||||
|
full arbitrary DN — the fallback path gives the client undue control over the bind
|
||||||
|
identity.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Apply RFC 4514 DN-component escaping to `username` before interpolation, or use the
|
||||||
|
LDAP library's DN-builder API. Reconsider the `Contains('=')` shortcut — accepting a
|
||||||
|
raw DN from untrusted input is risky; restrict it or remove it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-006 — JWT validation disables issuer and audience checks
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/JwtTokenService.cs:67-75`, `:56-59` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ValidateToken` sets `ValidateIssuer = false` and `ValidateAudience = false`, and
|
||||||
|
`GenerateToken` never sets an `iss` or `aud`. With a shared symmetric HMAC key, any
|
||||||
|
other system or component that signs JWTs with the same key would produce tokens this
|
||||||
|
service accepts. While the design states the key is shared only between the two central
|
||||||
|
nodes, omitting issuer/audience binding removes a cheap defense-in-depth control and
|
||||||
|
makes accidental key reuse (e.g. the same secret used for another internal token)
|
||||||
|
silently exploitable.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Set a fixed `Issuer` and `Audience` (e.g. `"scadalink-central"`) when generating tokens
|
||||||
|
and enable `ValidateIssuer`/`ValidateAudience` with the matching expected values during
|
||||||
|
validation.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-007 — Idle-timeout claim is reset on every token refresh
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/JwtTokenService.cs:40`, `:111-123` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design states the 30-minute idle timeout is tracked via a "last-activity timestamp
|
||||||
|
in the token", and `IsIdleTimedOut` reads the `LastActivity` claim. But `RefreshToken`
|
||||||
|
calls `GenerateToken`, which unconditionally writes `LastActivity = DateTimeOffset.UtcNow`.
|
||||||
|
Token refresh fires whenever a request arrives within ~5 minutes of expiry. The result
|
||||||
|
is that `LastActivity` reflects *token issuance time*, not genuine user activity — and
|
||||||
|
since refresh itself is a request, the timestamp keeps moving forward. A more subtle
|
||||||
|
consequence: the idle window is effectively measured from the last refresh, not the
|
||||||
|
last real interaction, so the documented "no requests within the idle window" semantics
|
||||||
|
are not faithfully implemented. The claim name `LastActivity` is also misleading.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide explicitly how activity is tracked. Either (a) carry the original `LastActivity`
|
||||||
|
forward across refreshes and update it only on real request handling in the middleware,
|
||||||
|
or (b) rename the claim to `IssuedAt`/`TokenCreated` and document that the idle window
|
||||||
|
is measured from issuance. Whichever is chosen, ensure `IsIdleTimedOut` and the refresh
|
||||||
|
path agree on the semantics.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-008 — N+1 query loading site-scope rules in `RoleMapper`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/RoleMapper.cs:25-48` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`MapGroupsToRolesAsync` first calls `GetAllMappingsAsync`, then inside the per-mapping
|
||||||
|
loop calls `GetScopeRulesForMappingAsync(mapping.Id, ct)` once for every matched
|
||||||
|
Deployment mapping. This is an N+1 query pattern executed on the login hot path and on
|
||||||
|
every 15-minute token refresh. With multiple site-scoped Deployment groups it issues a
|
||||||
|
round-trip per group.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add a repository method that loads scope rules for a set of mapping IDs in one query
|
||||||
|
(or eager-loads them with the mappings), and resolve all scope rules with a single call.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-009 — CancellationToken not honored inside `Task.Run` LDAP calls
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Security/LdapAuthService.cs:42`, `:46`, `:51`, `:56-57`, `:67-73`, `:135`, `:139-145` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The synchronous Novell LDAP calls are wrapped in `Task.Run(() => ..., ct)`. The `ct`
|
||||||
|
argument only prevents the work item from *starting* if cancellation is already
|
||||||
|
signaled; once a `connection.Connect`/`Bind`/`Search` call is in progress it cannot be
|
||||||
|
cancelled. A cancelled or timed-out login request will continue to occupy a thread-pool
|
||||||
|
thread and an LDAP connection until the blocking call returns on its own. There is also
|
||||||
|
no explicit network/operation timeout configured on the `LdapConnection`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Configure `LdapConnection.ConnectionTimeout` and search/operation time limits so a
|
||||||
|
hung LDAP server cannot pin a thread indefinitely. Document that `ct` only guards
|
||||||
|
work-item scheduling, or implement a timeout-with-disconnect fallback.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-010 — Design doc contradicts itself on Windows Integrated Authentication
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `docs/requirements/Component-Security.md:13` (vs. `:23`) |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The Responsibilities section states the component authenticates "using Windows
|
||||||
|
Integrated Authentication", but the Authentication section (line 23) and CLAUDE.md
|
||||||
|
explicitly state **"No Windows Integrated Authentication ... authenticates directly
|
||||||
|
against LDAP/AD, not via Kerberos/NTLM"** — which is what the code actually does
|
||||||
|
(direct LDAP bind). The Responsibilities line is stale and contradicts both the rest of
|
||||||
|
the doc and the implementation.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Fix `Component-Security.md:13` to say "using a direct LDAP/Active Directory bind"
|
||||||
|
to match the implemented behavior and the rest of the document.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### Security-011 — Missing tests for security-critical paths
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.Security.Tests/UnitTest1.cs` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test suite covers happy paths well but omits several security-relevant cases:
|
||||||
|
no test exercises the StartTLS path (Security-001), the DN-injection / `Contains('=')`
|
||||||
|
fallback inputs (Security-005), JWT validation with a too-short or empty signing key
|
||||||
|
(Security-003), `IsIdleTimedOut` returning true after a token has been refreshed
|
||||||
|
(Security-007), or the `uid`/`cn` mismatch in the no-service-account path (Security-004).
|
||||||
|
The integration `SecurityHardeningTests` only asserts default option values, not
|
||||||
|
enforcement. The test file is still named `UnitTest1.cs`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add negative/edge-case tests for the items above, particularly key-length rejection,
|
||||||
|
DN-escaping of hostile usernames, and idle-timeout behavior across a refresh. Rename
|
||||||
|
`UnitTest1.cs` to a descriptive name.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,402 @@
|
|||||||
|
# Code Review — SiteEventLogging
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.SiteEventLogging` |
|
||||||
|
| Design doc | `docs/requirements/Component-SiteEventLogging.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 11 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The SiteEventLogging module is small and broadly well-structured: a SQLite-backed
|
||||||
|
recorder (`SiteEventLogger`), a query service with keyset pagination, a background
|
||||||
|
purge service, and a thin Akka actor bridge. The query path is parameterised
|
||||||
|
correctly (no SQL injection) and reasonably well tested. However, the storage-cap
|
||||||
|
enforcement is functionally broken: `PRAGMA incremental_vacuum` is a no-op because
|
||||||
|
`auto_vacuum = INCREMENTAL` is never set, so the cap-purge loop never sees the
|
||||||
|
database shrink and over-deletes the entire table when triggered. There is also a
|
||||||
|
genuine concurrency hazard: the purge service and query service share the single
|
||||||
|
`SqliteConnection` owned by `SiteEventLogger` but bypass its `_writeLock`, so a purge
|
||||||
|
running on the background thread can collide with a write or a query on another
|
||||||
|
thread. The `LogEventAsync` API is synchronous despite its name and `Task` return,
|
||||||
|
which silently blocks Akka actor threads on disk I/O. Other findings concern the
|
||||||
|
cluster-singleton placement of the handler actor (which can pin to the standby
|
||||||
|
node), missing indexes for common query filters, retention/cap purge not enforcing
|
||||||
|
the requirement strictly, and several documentation/maintainability issues.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | `incremental_vacuum` no-op breaks cap purge (-001); over-delete on cap (-002). |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | Handler actor has no supervision/correlation concerns of its own; singleton placement issue (-004). `Ask` boundary is appropriate. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Shared `SqliteConnection` used by purge/query without the write lock (-003). |
|
||||||
|
| 4 | Error handling & resilience | ☑ | `LogEventAsync` swallows write failures, leaving only a log line (-008); purge catches exceptions broadly. |
|
||||||
|
| 5 | Security | ☑ | Queries fully parameterised. No authz in module (delegated to caller) — noted, not a finding. |
|
||||||
|
| 6 | Performance & resource management | ☑ | Synchronous I/O on actor threads (-005); missing indexes for severity/source/message (-006). |
|
||||||
|
| 7 | Design-document adherence | ☑ | Singleton placement contradicts "active node" model (-004); cap purge does not honour "oldest first within budget" cleanly (-002). |
|
||||||
|
| 8 | Code organization & conventions | ☑ | Concrete-type downcast of `ISiteEventLogger` (-007); `internal Connection` leaks DB handle (-007). |
|
||||||
|
| 9 | Testing coverage | ☑ | No tests for purge interaction with live writes, vacuum effectiveness, the actor bridge, or query error path (-010). |
|
||||||
|
| 10 | Documentation & comments | ☑ | `LogEventAsync` XML doc says "asynchronously" but is synchronous (-009); stale "Phase 4+" placeholder (-011). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### SiteEventLogging-001 — `PRAGMA incremental_vacuum` is a no-op; storage cap cannot reclaim space
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:100-102`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:36-55` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`PurgeByStorageCap` issues `PRAGMA incremental_vacuum` after each delete batch to
|
||||||
|
reclaim space, then re-measures the database size via `page_count * page_size`.
|
||||||
|
`incremental_vacuum` only has any effect when the database was created with
|
||||||
|
`auto_vacuum = INCREMENTAL`. `InitializeSchema` never sets `auto_vacuum`, so the
|
||||||
|
database uses the SQLite default (`auto_vacuum = NONE`). With `NONE`,
|
||||||
|
`incremental_vacuum` is silently ignored and `page_count` does not decrease when
|
||||||
|
rows are deleted (free pages are retained in the file). Consequently the
|
||||||
|
`while (currentSizeBytes > capBytes)` loop never observes the size dropping. The
|
||||||
|
storage-cap feature required by the design ("configurable maximum database size...
|
||||||
|
oldest events are purged first") is therefore non-functional — it cannot bring the
|
||||||
|
file back under the cap.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Set `PRAGMA auto_vacuum = INCREMENTAL` in `InitializeSchema` before any tables are
|
||||||
|
created (it must be set before table creation or followed by a full `VACUUM` to take
|
||||||
|
effect on an existing database). Alternatively, run a full `VACUUM` after cap-purge
|
||||||
|
deletes, or measure logical data size (e.g. `page_count - freelist_count` times
|
||||||
|
`page_size`) instead of relying on `incremental_vacuum`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-002 — Storage-cap purge deletes the entire table when space is not reclaimed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:87-105` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Because of SiteEventLogging-001 the on-disk size never shrinks after a delete batch,
|
||||||
|
so `currentSizeBytes` stays above `capBytes`. The loop then keeps deleting 1000-row
|
||||||
|
batches on every iteration until `ExecuteNonQuery` returns 0 — i.e. until the table
|
||||||
|
is completely empty. The design states the cap should purge "the oldest events...
|
||||||
|
first" to stay within budget, not wipe the whole log. When the cap is hit (e.g.
|
||||||
|
during an alarm storm) this destroys all retained diagnostic history rather than
|
||||||
|
trimming it to the budget. The unit test `PurgeByStorageCap_DeletesOldestWhenOverCap`
|
||||||
|
masks the problem because it uses `MaxStorageMb = 0`, for which the correct result is an
|
||||||
|
empty table, so the over-delete behaviour is never exercised against a realistic cap.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Fix the size measurement / vacuum (SiteEventLogging-001) so the loop terminates when
|
||||||
|
the file is genuinely under the cap. Add a guard so the loop stops once
|
||||||
|
`currentSizeBytes` has stopped decreasing across iterations, and add a test with a
|
||||||
|
non-zero cap and a known oversized dataset to assert that only the oldest events are
|
||||||
|
removed.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-003 — Shared `SqliteConnection` used by purge and query without the write lock
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:64,90,100,110,114`, `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:36`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:34,72` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SiteEventLogger` owns a single `SqliteConnection` and serialises its own writes via
|
||||||
|
`lock (_writeLock)`. `EventLogPurgeService` and `EventLogQueryService` both reach
|
||||||
|
into `_eventLogger.Connection` and execute commands directly, without acquiring
|
||||||
|
`_writeLock`. The purge runs on a `BackgroundService` thread (a different thread from
|
||||||
|
event-recording callers and from the actor that drives the query service). A single
|
||||||
|
`SqliteConnection` / `SqliteCommand` is not thread-safe; concurrent use from the
|
||||||
|
purge thread and a recording thread (or query thread) can throw
|
||||||
|
`SqliteException`/`InvalidOperationException` ("DataReader already open",
|
||||||
|
"connection busy") or corrupt command state. The purge `DELETE` and the recorder
|
||||||
|
`INSERT` racing is the most likely collision because event recording is continuous.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Funnel all access to the connection through a single synchronisation point: either
|
||||||
|
expose lock-guarded methods on `SiteEventLogger` for purge/query to call, or give the
|
||||||
|
purge and query services their own dedicated `SqliteConnection` instances (SQLite
|
||||||
|
supports multiple connections to the same file; WAL journal mode plus a `busy_timeout`
|
||||||
|
makes this safer). Do not share one `SqliteConnection` across threads.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-004 — Event-log handler runs as a cluster singleton that can land on the standby node
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:313-336`, `src/ScadaLink.SiteEventLogging/EventLogHandlerActor.cs:21-25` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`EventLogHandlerActor` is hosted as a `ClusterSingletonManager` singleton with the
|
||||||
|
stated intent that "queries always reach the active node". However, an Akka.NET
|
||||||
|
cluster singleton is pinned to the *oldest* member of the role, which is not the
|
||||||
|
same concept as the SCADA "active node" (the node currently running the Deployment
|
||||||
|
Manager singleton / serving live traffic). The design doc is explicit: "Only the
|
||||||
|
active node generates and stores events... the new active node starts logging to its
|
||||||
|
own SQLite database." The event-log SQLite file is node-local and unreplicated.
|
||||||
|
Nothing guarantees the event-log singleton co-locates with the active node, so a
|
||||||
|
remote query can be served by the standby node and read that node's near-empty
|
||||||
|
database, returning no events even though the active node has a full log. The
|
||||||
|
explanatory comment in `AkkaHostedService.cs` asserts the opposite of what actually
|
||||||
|
happens.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either (a) host the query handler as a normal per-node actor and route queries to
|
||||||
|
the active node explicitly (the node owning the Deployment Manager singleton), or
|
||||||
|
(b) make the event-log writer follow the same singleton so the writer and the query
|
||||||
|
handler are guaranteed co-located. Reconcile the design doc and the inline comment
|
||||||
|
with whichever model is chosen.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-005 — `LogEventAsync` performs synchronous disk I/O on the caller's thread
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:57-99` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LogEventAsync` looks asynchronous (it returns `Task` and carries the `Async` suffix) but its
|
||||||
|
body is entirely synchronous: it takes `lock (_writeLock)`, runs
|
||||||
|
`cmd.ExecuteNonQuery()` (a blocking SQLite write), then returns `Task.CompletedTask`.
|
||||||
|
Callers across the codebase invoke it fire-and-forget as `_ = LogEventAsync(...)`
|
||||||
|
(e.g. `ScriptExecutionActor.cs:133`, `DataConnectionActor.cs:292`,
|
||||||
|
`ScriptActor.cs:250`) expecting it to be non-blocking. In reality the SQLite write,
|
||||||
|
and any contention on `_writeLock`, executes inline on the Akka actor thread of the
|
||||||
|
calling subsystem. Under an event burst (alarm storm, script failure loop) this
|
||||||
|
serialises actor threads on disk I/O and the global write lock, degrading the
|
||||||
|
hot-path subsystems the design intends to keep responsive.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either make recording genuinely asynchronous (offload to a dedicated single-threaded
|
||||||
|
writer / `Channel<T>` consumer so callers truly fire-and-forget), or rename the
|
||||||
|
method to `LogEvent` and document that it blocks, so callers can decide. Given the
|
||||||
|
design's emphasis on not impacting runtime subsystems, an internal queue with a
|
||||||
|
background flush is preferable.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-006 — Missing indexes for severity and keyword-search query paths
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:50-52`, `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:65-81` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`InitializeSchema` creates indexes on `timestamp`, `event_type`, and `instance_id`.
|
||||||
|
The query service also filters on `severity` (`severity = $severity`) and performs
|
||||||
|
`message LIKE '%...%'` / `source LIKE '%...%'` keyword search. `severity` has no
|
||||||
|
index, and a leading-wildcard `LIKE` cannot use a normal index at all. With up to a
|
||||||
|
1 GB database and a 500-row page size, severity-filtered and keyword queries do full
|
||||||
|
table scans on every page. The design explicitly lists keyword search as a supported,
|
||||||
|
expected query type.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add an index on `severity` (or a composite index aligned with common filter
|
||||||
|
combinations such as `(event_type, severity, id)`). For keyword search, consider an
|
||||||
|
FTS5 virtual table over `message` and `source`, or accept the scan but document the
|
||||||
|
cost.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-007 — `ISiteEventLogger` consumers downcast to the concrete type and reach into the DB connection
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:25`, `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:26`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:34` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Both `EventLogPurgeService` and `EventLogQueryService` take `ISiteEventLogger` via
|
||||||
|
DI and immediately downcast it: `_eventLogger = (SiteEventLogger)eventLogger;`. They
|
||||||
|
then access the `internal SqliteConnection Connection` property to run arbitrary SQL.
|
||||||
|
This defeats the purpose of the interface abstraction, makes the registration
|
||||||
|
fragile (any `ISiteEventLogger` that is not exactly `SiteEventLogger` causes an
|
||||||
|
`InvalidCastException` at construction), and leaks the database handle and raw SQL
|
||||||
|
surface out of the recorder. It is also the root cause of the unsynchronised
|
||||||
|
connection sharing in SiteEventLogging-003.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Introduce a proper data-access abstraction (e.g. an `IEventLogStore` with
|
||||||
|
`Insert`, `Query`, `PurgeOlderThan`, `PurgeToSize`, `GetSizeBytes`) that owns the
|
||||||
|
connection and its locking, and inject that into the recorder, query, and purge
|
||||||
|
services. Remove the `internal Connection` property and the concrete-type downcasts.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-008 — Event-recording write failures are silently swallowed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:92-95` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
If `ExecuteNonQuery` throws (disk full, database locked, file corruption), the
|
||||||
|
exception is caught, written to `ILogger`, and discarded; `LogEventAsync` still
|
||||||
|
returns `Task.CompletedTask` as if successful. Callers fire-and-forget the result so
|
||||||
|
they cannot detect failure. The event log is the site's diagnostic audit trail; a
|
||||||
|
sustained write failure (for example a locked-database storm caused by the
|
||||||
|
unsynchronised purge in SiteEventLogging-003) means events vanish with no signal to
|
||||||
|
operators except a local log line that nobody is watching. There is no failure
|
||||||
|
counter, no health-metric hook, and no retry.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Expose a failure signal: increment a counter that the Health Monitoring component
|
||||||
|
can surface (the design notes script/alarm error rates are derived from the event
|
||||||
|
log — a logging outage should be visible). At minimum, escalate repeated failures to
|
||||||
|
a Warning/Error health metric rather than only a local log line.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-009 — XML doc on `LogEventAsync` claims asynchronous behaviour
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/ISiteEventLogger.cs:8-10`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:57` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The interface XML doc states "Record an event asynchronously." and the method is
|
||||||
|
named `LogEventAsync`, but the implementation is fully synchronous (see
|
||||||
|
SiteEventLogging-005). The documentation and naming are misleading: a reader will
|
||||||
|
reasonably assume the write is offloaded and the caller's thread is not blocked,
|
||||||
|
which is false. The `details` parameter doc says "Optional JSON details" but nothing
|
||||||
|
validates or requires JSON, so callers may pass arbitrary text.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Align the name, signature, and documentation with the actual behaviour — either make
|
||||||
|
the method genuinely asynchronous or rename to `LogEvent` and correct the doc.
|
||||||
|
Clarify that `details` is free-form text unless JSON is actually enforced.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-010 — Test coverage gaps: actor bridge, purge/write concurrency, vacuum effectiveness, query error path
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.SiteEventLogging.Tests/` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test suite covers recording, query filtering/pagination, and basic purge, but
|
||||||
|
several critical behaviours are untested:
|
||||||
|
|
||||||
|
- `EventLogHandlerActor` has no test — the actor message contract
|
||||||
|
(`EventLogQueryRequest` -> `EventLogQueryResponse`, `Sender.Tell`) is unverified.
|
||||||
|
- No test exercises purge running concurrently with active writes/queries, so the
|
||||||
|
connection-sharing race (SiteEventLogging-003) is invisible to CI.
|
||||||
|
- `PurgeByStorageCap` is only tested with `MaxStorageMb = 0`, which hides the
|
||||||
|
no-op-vacuum / over-delete bug (SiteEventLogging-001, -002). No test asserts the
|
||||||
|
file shrinks or that only oldest events are removed under a realistic cap.
|
||||||
|
- `EventLogQueryService.ExecuteQuery`'s catch block (`Success: false`,
|
||||||
|
`ErrorMessage`) has no test.
|
||||||
|
- `SiteEventLogger.Dispose` semantics (logging after dispose returns
|
||||||
|
`Task.CompletedTask`) and re-entrant dispose are untested.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for the actor bridge, a concurrency stress test (purge + write + query in
|
||||||
|
parallel), a realistic non-zero-cap purge test asserting size reduction and
|
||||||
|
oldest-first deletion, and a query-error-path test (e.g. corrupt/closed connection).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteEventLogging-011 — Stale "Phase 4+" placeholder in `ServiceCollectionExtensions`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteEventLogging/ServiceCollectionExtensions.cs:18-22` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AddSiteEventLoggingActors` is an empty method with a comment "Placeholder for Akka
|
||||||
|
actor registration (Phase 4+)". The actor (`EventLogHandlerActor`) is in fact already
|
||||||
|
implemented and is registered directly in `AkkaHostedService.cs:313-336`, not through
|
||||||
|
this method. The placeholder is dead code: it is either never called or called with
|
||||||
|
no effect, and the comment is stale. A reader looking for where the event-log actor
|
||||||
|
is wired up will be misdirected.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either implement the actor registration here and have `AkkaHostedService` call it
|
||||||
|
(centralising the wiring), or delete `AddSiteEventLoggingActors` entirely and remove
|
||||||
|
the misleading comment.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,564 @@
|
|||||||
|
# Code Review — SiteRuntime
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.SiteRuntime` |
|
||||||
|
| Design doc | `docs/requirements/Component-SiteRuntime.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 16 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The SiteRuntime module is broadly well-structured: the actor hierarchy matches the
|
||||||
|
design doc, supervision strategies are explicit, and the trigger/alarm evaluation
|
||||||
|
logic is thorough. However, the review surfaced one genuinely serious correctness
|
||||||
|
defect — `Instance.SetAttribute` never routes writes to the Data Connection Layer
|
||||||
|
for data-sourced attributes, contradicting a core design decision and silently
|
||||||
|
turning device writes into local-only static overrides. Several other findings
|
||||||
|
cluster around two themes: (1) actor-thread discipline is violated in a few hot
|
||||||
|
paths (blocking `.GetAwaiter().GetResult()` calls on the actor thread, a fragile
|
||||||
|
fixed-delay reschedule for redeployment), and (2) the site-local repositories reach
|
||||||
|
into `SiteStorageService` private state via reflection and mint entity IDs with the
|
||||||
|
non-deterministic `string.GetHashCode()`. Script execution runs on the default
|
||||||
|
thread pool rather than a dedicated blocking dispatcher (the code acknowledges this
|
||||||
|
in a comment but ships it anyway). Test coverage exists for the coordinator actors,
|
||||||
|
persistence and scripting, but the short-lived execution actors, the replication
|
||||||
|
actor, and the repositories are untested.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ✓ | SetAttribute mis-routing, deploy double-count, redeploy reschedule race. |
|
||||||
|
| 2 | Akka.NET conventions | ✓ | Blocking on actor thread, script execution not on a dedicated dispatcher, premature success reply. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | `_attributes` dictionary shared with child actors by reference; `_executionCounter` is actor-confined (OK). |
|
||||||
|
| 4 | Error handling & resilience | ✓ | Deploy reports Success before persistence; replicated artifact/S&F failures only logged (matches best-effort design). |
|
||||||
|
| 5 | Security | ✓ | Trust-model validation is substring-based and weak; reflection used to read private fields. |
|
||||||
|
| 6 | Performance & resource management | ✓ | Per-call SQLite connections (acceptable); CPU-bound scripts not interruptible by timeout. |
|
||||||
|
| 7 | Design-document adherence | ✓ | SetAttribute DCL routing missing; staggered-startup and supervision otherwise conform. |
|
||||||
|
| 8 | Code organization & conventions | ✓ | Repositories reflect into another class; synthetic IDs non-deterministic. |
|
||||||
|
| 9 | Testing coverage | ✓ | No tests for ScriptExecutionActor, AlarmExecutionActor, SiteReplicationActor, or the two repositories. |
|
||||||
|
| 10 | Documentation & comments | ✓ | Several XML comments describe behaviour the code does not implement (see findings). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### SiteRuntime-001 — `Instance.SetAttribute` never writes to the Data Connection Layer
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs:106`, `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:204` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc (Component-SiteRuntime.md, "GetAttribute / SetAttribute" and
|
||||||
|
"Script Runtime API") states that `Instance.SetAttribute` on a *data-connected*
|
||||||
|
attribute must send a write request to the DCL, which writes to the physical
|
||||||
|
device, and that the in-memory value is **not** optimistically updated. For *static*
|
||||||
|
attributes it updates memory and persists an override.
|
||||||
|
|
||||||
|
The implementation makes no such distinction. `ScriptRuntimeContext.SetAttribute`
|
||||||
|
unconditionally sends a `SetStaticAttributeCommand`, and `InstanceActor.HandleSetStaticAttribute`
|
||||||
|
unconditionally treats every write as a static override: it mutates `_attributes`,
|
||||||
|
publishes an `AttributeValueChanged` with hard-coded `"Good"` quality, notifies
|
||||||
|
children, and persists a SQLite override. A script writing a data-sourced attribute
|
||||||
|
therefore never reaches the device, the write failure can never be returned
|
||||||
|
synchronously to the script, and the in-memory value diverges from the device
|
||||||
|
until the next subscription update overwrites it. The persisted override is also
|
||||||
|
wrong: data-sourced attributes should not have static overrides.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
In `InstanceActor`, look up the target attribute in `_configuration.Attributes`. If
|
||||||
|
it has a non-empty `DataSourceReference`, issue a DCL write (e.g. a `WriteTagRequest`
|
||||||
|
to `_dclManager`) and surface success/failure to the caller; do not persist an
|
||||||
|
override and do not optimistically mutate `_attributes`. Only attributes with no
|
||||||
|
data source reference should follow the current static-override path. Consider
|
||||||
|
splitting the message into `SetStaticAttributeCommand` vs `SetDataAttributeCommand`,
|
||||||
|
or branching inside the handler.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-002 — `RouteInboundApiSetAttributes` always treats writes as static overrides
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:632` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`RouteInboundApiSetAttributes` (handling `Route.To().SetAttribute(s)` from the
|
||||||
|
Inbound API) emits a `SetStaticAttributeCommand` for every attribute, so it inherits
|
||||||
|
the same defect as SiteRuntime-001: writes to data-sourced attributes never reach
|
||||||
|
the device and are instead persisted as static overrides. In addition the response
|
||||||
|
is sent back as unconditionally successful (`true`) before the Instance Actor has
|
||||||
|
even processed the command, so a non-existent attribute or a future DCL write
|
||||||
|
failure is reported to the external caller as success.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Route through the same corrected `InstanceActor` write handler as SiteRuntime-001 so
|
||||||
|
the static-vs-data distinction is honoured. The optimistic ack is acceptable for
|
||||||
|
fire-and-forget static writes per the doc, but the XML comment should make the
|
||||||
|
limitation explicit, and once data-attribute writes are supported they need a real
|
||||||
|
response path.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-003 — Redeployment relies on a fixed 500 ms reschedule and can collide on the child actor name
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:222` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleDeploy` stops an existing Instance Actor with `Context.Stop` and then
|
||||||
|
reschedules the same `DeployInstanceCommand` to itself after a hard-coded 500 ms,
|
||||||
|
hoping the child has fully terminated by then. `Context.Stop` is asynchronous; the
|
||||||
|
child is only removed from the parent's children collection after it actually stops
|
||||||
|
(including running `PostStop` on its descendants). If a deeply nested or slow
|
||||||
|
hierarchy takes longer than 500 ms, `CreateInstanceActor` calls `Context.ActorOf`
|
||||||
|
with a name that still belongs to the terminating child and throws
|
||||||
|
`InvalidActorNameException`. The `_instanceActors` dictionary check does not prevent
|
||||||
|
this — the dictionary entry is removed immediately, but the Akka child registry is
|
||||||
|
not. The 500 ms delay is also added unconditionally to the latency of every redeploy.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Watch the terminating child (`Context.Watch`) and recreate the Instance Actor only
|
||||||
|
after receiving the `Terminated` message, instead of guessing with a timer. Buffer
|
||||||
|
or stash the in-flight `DeployInstanceCommand` (and any further commands for that
|
||||||
|
instance) until termination completes.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-004 — `_totalDeployedCount` is incremented on redeployment of an existing instance
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:239` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
In `HandleDeploy`, the existing-actor branch (line 223) reschedules the command and
|
||||||
|
returns. When the rescheduled command runs, no actor exists, so the code falls
|
||||||
|
through to the "new instance" branch and executes `_totalDeployedCount++`
|
||||||
|
(line 239). A redeployment is an *update* of an already-deployed instance, not a new
|
||||||
|
one, so the deployed count is over-counted by one on every redeploy.
|
||||||
|
`StoreDeployedConfigAsync` uses UPSERT semantics, so the SQLite row count does not
|
||||||
|
grow, but the in-memory `_totalDeployedCount` (reported to the health collector via
|
||||||
|
`UpdateInstanceCounts`) drifts upward and the reported "disabled" count becomes
|
||||||
|
wrong.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Only increment `_totalDeployedCount` when the instance is genuinely new. Either
|
||||||
|
track whether this deploy replaced an existing config, or derive the deployed count
|
||||||
|
from storage / the union of running actors and disabled configs rather than
|
||||||
|
maintaining a hand-incremented counter.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-005 — Deployment reports `Success` to central before persistence completes
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:272` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleDeploy` replies to central with `DeploymentStatus.Success` immediately after
|
||||||
|
creating the Instance Actor, while the SQLite persistence (`StoreDeployedConfigAsync`
|
||||||
|
+ `ClearStaticOverridesAsync`) runs asynchronously inside a `Task.Run`. If persistence
|
||||||
|
fails, `HandleDeployPersistenceResult` only logs an error — central has already been
|
||||||
|
told the deployment succeeded. On a subsequent node restart or failover the instance
|
||||||
|
will not be re-created (it is not in SQLite), so the deployment is silently lost
|
||||||
|
despite central recording success. This contradicts the design's intent that the
|
||||||
|
site is the durable source of truth for deployed configs.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Persist the config before replying, or treat a persistence failure as a deployment
|
||||||
|
failure and send a corrective `DeploymentStatusResponse`/health signal to central.
|
||||||
|
At minimum, do not report `Success` until the config row is committed.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-006 — Site-local repositories read `SiteStorageService` private field via reflection
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:183`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs:181` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Both repositories' `CreateConnection()` use `Type.GetField("_connectionString",
|
||||||
|
BindingFlags.NonPublic | BindingFlags.Instance)` to extract the private connection
|
||||||
|
string out of `SiteStorageService`. This is brittle (any rename or refactor of the
|
||||||
|
field breaks it at runtime, not compile time), defeats encapsulation, and the
|
||||||
|
accompanying XML comment openly describes it as a "pragmatic" hack and is internally
|
||||||
|
contradictory (it states a connection string is "passed separately at DI
|
||||||
|
registration time" which is not what the code does). It also sits awkwardly against
|
||||||
|
the project's own script trust model, which forbids `System.Reflection` in scripts.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Expose the connection string properly: add an `ISiteStorageConnectionProvider`
|
||||||
|
(already referenced in `ServiceCollectionExtensions` XML docs but not used), or have
|
||||||
|
`SiteStorageService` expose a `CreateConnection()` factory, and inject that into the
|
||||||
|
repositories. Remove the reflection entirely.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-007 — Synthetic entity IDs use the non-deterministic `string.GetHashCode()`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:241`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs:254` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GenerateSyntheticId` computes `name.GetHashCode() & 0x7FFFFFFF`. On .NET Core,
|
||||||
|
`string.GetHashCode()` is randomized per process by default, so the "stable
|
||||||
|
deterministic synthetic ID" promised by the XML comment is not stable at all — it
|
||||||
|
changes every time the process restarts. Any caller that obtained an ID and later
|
||||||
|
calls `GetExternalSystemByIdAsync`/`GetNotificationListByIdAsync` after a restart
|
||||||
|
will fail to find the entity. It also risks collisions: distinct names can hash to
|
||||||
|
the same 31-bit value, and `GetExternalSystemByIdAsync` would then return the wrong
|
||||||
|
row.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Use a deterministic, well-distributed hash (e.g. a stable FNV-1a or the first
|
||||||
|
bytes of a SHA-256 of the name) if a synthetic integer ID is genuinely required, or
|
||||||
|
better, change the repository contract to key these site-local artifacts by name
|
||||||
|
rather than synthesising integer IDs.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-008 — Blocking `.GetAwaiter().GetResult()` on the actor thread during startup
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:479` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LoadSharedScriptsFromStorage` is called synchronously from
|
||||||
|
`HandleStartupConfigsLoaded` (the actor's message handler) and performs
|
||||||
|
`_storage.GetAllSharedScriptsAsync().GetAwaiter().GetResult()` followed by Roslyn
|
||||||
|
compilation of every shared script. This blocks the DeploymentManager singleton's
|
||||||
|
mailbox thread for the full duration of the SQLite read and all shared-script
|
||||||
|
compilation. On the default dispatcher this also ties up a thread-pool thread and
|
||||||
|
risks thread-pool starvation, and the singleton cannot process any other message
|
||||||
|
(deployments, lifecycle commands, debug routing) until it returns. The rest of the
|
||||||
|
class correctly uses `PipeTo`/`ContinueWith`.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Load shared scripts asynchronously and `PipeTo(Self)` an internal message, the same
|
||||||
|
pattern already used for `StartupConfigsLoaded`. Perform compilation either inside
|
||||||
|
the piped continuation handler (still on the actor thread but at least off the
|
||||||
|
synchronous startup path) or on a dedicated background task whose result is piped
|
||||||
|
back.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-009 — Script execution actors run scripts on the default thread pool, not a dedicated dispatcher
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs:72`, `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:289`, `src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs:57` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design (CLAUDE.md "Architecture & Runtime") states Script Execution Actors run
|
||||||
|
on a *dedicated blocking I/O dispatcher*. The code does not do this: `ScriptActor.SpawnExecution`
|
||||||
|
and `AlarmActor.SpawnAlarmExecution` create the execution actors with no
|
||||||
|
`.WithDispatcher(...)`, and the execution itself runs inside a bare `Task.Run`,
|
||||||
|
i.e. on the shared .NET thread pool. The `// NOTE: In production, configure a
|
||||||
|
dedicated ... dispatcher` comments acknowledge the gap but it ships unconfigured.
|
||||||
|
Scripts can perform synchronous blocking I/O (`Database.Connection`, synchronous
|
||||||
|
`ExternalSystem.Call`); running them on the shared pool can starve it and stall
|
||||||
|
unrelated Akka dispatchers and HTTP request handling under load.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Define the dedicated dispatcher in HOCON and chain `.WithDispatcher(...)` on the
|
||||||
|
execution actor `Props`. If the `Task.Run` model is kept, run script bodies on a
|
||||||
|
dedicated `TaskScheduler` / bounded scheduler rather than the global pool. Either
|
||||||
|
way, remove the "in production, configure…" comments by actually configuring it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-010 — `EnsureDclConnections` never updates a connection whose configuration changed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:413` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`EnsureDclConnections` tracks created connections in `_createdConnections` and skips
|
||||||
|
any name already present (`if (_createdConnections.Contains(name)) continue;`). The
|
||||||
|
skip is purely name-based: if a redeployment (or an artifact deployment) changes the
|
||||||
|
endpoint, credentials, backup endpoint, or `FailoverRetryCount` of an existing
|
||||||
|
connection, the new configuration is silently ignored and the DCL keeps using the
|
||||||
|
stale `CreateConnectionCommand`. There is no `UpdateConnectionCommand` path. The
|
||||||
|
design states that after artifact deployment the site is fully self-contained with
|
||||||
|
current configuration; this caching breaks that for connection changes.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Compare the incoming connection config against the last one sent and re-issue a
|
||||||
|
create/update command when it differs, or have the DCL treat `CreateConnectionCommand`
|
||||||
|
as an idempotent upsert and always forward it. Key the cache on a config hash, not just
|
||||||
|
the name.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-011 — Trust-model validation is a substring scan and is both over- and under-inclusive
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs:52` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ValidateTrustModel` enforces the script trust model by doing raw `string.Contains` /
|
||||||
|
`IndexOf` on the script source text for forbidden namespace strings. This is
|
||||||
|
unreliable in both directions:
|
||||||
|
|
||||||
|
- **Bypass (under-inclusive):** the check looks only for the literal namespace
|
||||||
|
strings. A script can reach forbidden APIs without ever writing `System.IO` etc. —
|
||||||
|
e.g. via fully-qualified type use through aliasing, `global::`-prefixed names, or
|
||||||
|
simply because the namespace is already imported transitively. The compilation
|
||||||
|
references include `typeof(object).Assembly` (the whole of `System.Private.CoreLib`,
|
||||||
|
which contains `System.IO.File`, `System.Threading.Thread`, `System.Reflection`,
|
||||||
|
etc.), so forbidden types are fully resolvable at compile time and the only barrier
|
||||||
|
is this textual scan.
|
||||||
|
- **False positives (over-inclusive):** any occurrence of the substring in a comment,
|
||||||
|
string literal, or an unrelated identifier (e.g. a variable named `ProcessThreading`)
|
||||||
|
triggers a violation; the `AllowedExceptions` logic only rescues exact prefixes.
|
||||||
|
- The dead `isAllowed` variable at line 64 is computed and never used.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Enforce the trust model with a Roslyn `SyntaxWalker`/semantic analysis (inspect
|
||||||
|
resolved symbols and their containing namespaces/assemblies), or restrict the
|
||||||
|
compilation's metadata references and `AssemblyLoadContext` so forbidden types are
|
||||||
|
genuinely unavailable, rather than relying on source-text matching. Remove the
|
||||||
|
unused `isAllowed` variable.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-012 — `AttributeAccessor`/`ScopeAccessors` block the script on a synchronous Ask
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Scripts/ScopeAccessors.cs:28` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`AttributeAccessor`'s indexer getter calls `_ctx.GetAttribute(...).GetAwaiter().GetResult()`,
|
||||||
|
synchronously blocking the script-execution thread on an actor Ask. Combined with
|
||||||
|
SiteRuntime-009 (scripts run on the shared thread pool) this means a script that
|
||||||
|
reads several attributes via `Attributes["X"]` holds a pool thread blocked for each
|
||||||
|
round-trip. The async variants (`GetAsync`/`SetAsync`) exist but the ergonomic
|
||||||
|
indexer encourages the blocking path. The XML comment notes "Reads block on the
|
||||||
|
actor Ask" but does not warn about the thread-pool impact.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Once a dedicated script dispatcher exists (SiteRuntime-009) the blocking is contained
|
||||||
|
to that pool, which is acceptable; until then, document the cost clearly and prefer
|
||||||
|
steering script authors to the async accessors. Consider making the indexer
|
||||||
|
internal-only and exposing only the async API.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-013 — `HandleUnsubscribeDebugView` does nothing despite documented behaviour
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:414` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`HandleUnsubscribeDebugView` is documented ("Debug view unsubscribe — removes
|
||||||
|
subscription") and the actor registers a handler for `UnsubscribeDebugViewRequest`,
|
||||||
|
but the body only logs a debug message — there is no subscription state in the
|
||||||
|
Instance Actor to remove. The design places the actual subscription lifecycle in
|
||||||
|
`SiteStreamManager` (`Subscribe`/`Unsubscribe`/`RemoveSubscriber`), so the Instance
|
||||||
|
Actor genuinely has nothing to do here. The handler and its XML comment are
|
||||||
|
therefore misleading: a reader expects it to tear down a subscription.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either remove the no-op handler and route `UnsubscribeDebugViewRequest` to wherever
|
||||||
|
the `SiteStreamManager` subscription is actually cancelled, or correct the XML
|
||||||
|
comment to state explicitly that subscription teardown is handled by
|
||||||
|
`SiteStreamManager` and this handler is a no-op acknowledgement.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-014 — Trigger-expression evaluation blocks the coordinator actor thread
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:219`, `src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs:389` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`EvaluateExpressionTrigger` (ScriptActor) and `EvaluateExpression` (AlarmActor) run a
|
||||||
|
compiled Roslyn script with `.RunAsync(...).GetAwaiter().GetResult()` directly inside
|
||||||
|
the actor's `AttributeValueChanged` message handler. This blocks the coordinator
|
||||||
|
actor's mailbox thread for up to the 2-second timeout on every monitored attribute
|
||||||
|
change. Coordinator actors are on the default dispatcher and process the hot path of
|
||||||
|
attribute-change fan-out; a slow expression delays all other messages to that actor
|
||||||
|
and consumes a thread-pool thread for the duration. The inline comments correctly
|
||||||
|
note CPU-bound expressions are not interruptible but do not address the
|
||||||
|
mailbox-blocking concern.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Trigger expressions are expected to be cheap, but to keep the actor responsive
|
||||||
|
consider evaluating them off the actor thread (pipe the boolean result back as an
|
||||||
|
internal message) or pre-compiling to a plain delegate that executes near-instantly
|
||||||
|
without the Roslyn scripting `RunAsync` machinery.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-015 — `LoggerFactory` created per Instance Actor and never disposed
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:746` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CreateInstanceActor` does `var loggerFactory = new LoggerFactory();` for every
|
||||||
|
Instance Actor it creates, uses it once to produce an `ILogger<InstanceActor>`, and
|
||||||
|
never disposes it. `LoggerFactory` is `IDisposable`. With up to 500 instances (and
|
||||||
|
churn from redeployments) this leaks a factory per instance, and the produced
|
||||||
|
loggers are detached from the application's configured logging providers, so
|
||||||
|
Instance Actor logs may not be routed/filtered consistently with the rest of the
|
||||||
|
host.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Inject the application's `ILoggerFactory` (or an `ILogger<InstanceActor>` factory
|
||||||
|
delegate) into `DeploymentManagerActor` via DI and reuse it, rather than newing one
|
||||||
|
up per child. Do not create a fresh `LoggerFactory` in a hot creation path.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### SiteRuntime-016 — Short-lived execution actors, replication actor, and repositories are untested
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.SiteRuntime.Tests/` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The test project covers the coordinator actors (`InstanceActor`, `ScriptActor`,
|
||||||
|
`AlarmActor`, `DeploymentManagerActor`), persistence, scripting and streaming, but a
|
||||||
|
search of the test sources finds no references to `ScriptExecutionActor`,
|
||||||
|
`AlarmExecutionActor`, `SiteReplicationActor`, `SiteExternalSystemRepository`, or
|
||||||
|
`SiteNotificationRepository`. These cover critical paths: script timeout/failure
|
||||||
|
handling and result reply, alarm on-trigger execution, peer config/S&F replication
|
||||||
|
(including the `SendToPeer` no-peer drop), and the reflection-based repository reads.
|
||||||
|
Several findings above (001/002 mis-routing, 007 ID instability, 011 trust bypass)
|
||||||
|
would likely have been caught by targeted tests.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add unit/integration tests for the execution actors (success, timeout, exception,
|
||||||
|
Ask-reply, PoisonPill self-stop), `SiteReplicationActor` (outbound forward, inbound
|
||||||
|
apply, peer tracking on cluster events), and the two repositories (round-trip read,
|
||||||
|
synthetic-ID lookup, missing-row behaviour).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,510 @@
|
|||||||
|
# Code Review — StoreAndForward
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.StoreAndForward` |
|
||||||
|
| Design doc | `docs/requirements/Component-StoreAndForward.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 12 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The Store-and-Forward module is small and readable, with a clean SQLite persistence
|
||||||
|
layer, a sensible service API, and reasonable test coverage of the storage and service
|
||||||
|
happy paths. However the review surfaced two issues that undermine the module's core
|
||||||
|
purpose. First, the active delivery path never invokes the `ReplicationService` —
|
||||||
|
`ReplicateEnqueue/Remove/Park` have no callers anywhere in the codebase, so buffered
|
||||||
|
messages are not replicated to the standby node and the design's failover-durability
|
||||||
|
guarantee (Component doc "Persistence", CLAUDE.md "Store-and-Forward") is not met.
|
||||||
|
Second, there is an off-by-one in retry accounting: the immediate-failure path stores a
|
||||||
|
buffered message with `RetryCount = 1`, so a message configured with `MaxRetries = N`
|
||||||
|
is actually attempted `N` times in total rather than one immediate attempt plus `N`
|
||||||
|
retries, and a per-source `MaxRetries` of 1 produces zero retry attempts. Additional
|
||||||
|
themes: SQLite connection-per-call with no transactional grouping of multi-statement
|
||||||
|
operations, no concurrency guard against a parked message being retried while the
|
||||||
|
sweep is mid-flight, an unused enum member (`InFlight`) that drifts from the documented
|
||||||
|
status set, and untested critical paths (retry-due timing, replication-from-active,
|
||||||
|
the actor bridge). None of the findings are blockers for compilation, but the
|
||||||
|
replication and retry-count issues are functional defects against the design.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☑ | Off-by-one in retry counting (003); parked-message retry timing (010). |
|
||||||
|
| 2 | Akka.NET conventions | ☑ | `ContinueWith` used instead of `PipeTo`-friendly continuations; default supervision; see 007. |
|
||||||
|
| 3 | Concurrency & thread safety | ☑ | Sweep guarded by `Interlocked`, but no guard against retry-vs-manage races (005); `OnActivity` event not thread-safe (009). |
|
||||||
|
| 4 | Error handling & resilience | ☑ | Replication never invoked from active path (001); no-handler messages buffered then stuck (002). |
|
||||||
|
| 5 | Security | ☑ | No issues found — parameterised SQL throughout; no secrets handled directly; payload JSON treated opaquely. |
|
||||||
|
| 6 | Performance & resource management | ☑ | New SQLite connection per call; multi-statement operations not wrapped in a transaction (006, 008). |
|
||||||
|
| 7 | Design-document adherence | ☑ | Replication gap (001); `InFlight` status undocumented/unused (011); "retrying" status from design doc not modelled. |
|
||||||
|
| 8 | Code organization & conventions | ☑ | `StoreAndForwardMessage` is an entity-like POCO living in the component, not Commons (012). |
|
||||||
|
| 9 | Testing coverage | ☑ | Retry-due timing, replication-from-active, and `ParkedMessageHandlerActor` are untested (013). |
|
||||||
|
| 10 | Documentation & comments | ☑ | XML doc on `RegisterDeliveryHandler` contract is inconsistent with code (004). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### StoreAndForward-001 — Replication to standby is never triggered by the active node
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Critical |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/ReplicationService.cs:40`, `:53`, `:66`; `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:155`, `:212`, `:222`, `:236` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ReplicationService` exposes `ReplicateEnqueue`, `ReplicateRemove` and `ReplicatePark`
|
||||||
|
to forward buffer operations to the standby node, but a codebase-wide search shows these
|
||||||
|
methods have no callers. `StoreAndForwardService` — which performs every add (`EnqueueAsync`
|
||||||
|
line 155 / 163), remove (`RemoveMessageAsync` call at line 212) and park
|
||||||
|
(`UpdateMessageAsync` calls at lines 222/236) — holds no reference to `ReplicationService`
|
||||||
|
and never invokes it. Only the receiving half is wired (`SetReplicationHandler` and
|
||||||
|
`ApplyReplicatedOperationAsync` are used by `SiteReplicationActor`). The Component design
|
||||||
|
doc ("Persistence") and CLAUDE.md ("Store-and-Forward") require the active node to
|
||||||
|
forward every buffer operation to the standby so that, on failover, the new active node
|
||||||
|
"has a near-complete copy of the buffer." As written, the standby's S&F SQLite database
|
||||||
|
stays empty and a failover loses the entire buffer — a data-loss defect against a core
|
||||||
|
requirement.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Inject `ReplicationService` into `StoreAndForwardService` and call `ReplicateEnqueue`
|
||||||
|
after a successful `_storage.EnqueueAsync`, `ReplicateRemove` after `RemoveMessageAsync`,
|
||||||
|
and `ReplicatePark` after a park-causing `UpdateMessageAsync`. Update
|
||||||
|
`ServiceCollectionExtensions.AddStoreAndForward` to pass the dependency. Add a test that
|
||||||
|
asserts the replication handler observes each operation type.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. `ReplicationService` is now injected into `StoreAndForwardService`
|
||||||
|
(wired in `AddStoreAndForward`), and every buffer operation is forwarded to the standby:
|
||||||
|
a new `BufferAsync` helper calls `ReplicateEnqueue` after each persist, `ReplicateRemove`
|
||||||
|
runs after a successful retry removes a message, and `ReplicatePark` runs on both park
|
||||||
|
paths. Replication stays fire-and-forget and is a no-op when `ReplicationEnabled` is
|
||||||
|
false or no handler is wired. Regression tests `StoreAndForwardReplicationTests` assert
|
||||||
|
the replication handler observes the Add, Remove and Park operations. Fixed by the
|
||||||
|
commit whose message references `StoreAndForward-001`.
|
||||||
|
|
||||||
|
### StoreAndForward-002 — Messages enqueued with no registered handler are buffered but never deliverable
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:162`, `:201` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`EnqueueAsync` falls through to "No handler registered — buffer for later" (line 162)
|
||||||
|
when no delivery handler is registered for the category. The retry sweep
|
||||||
|
(`RetryMessageAsync`, line 201) then logs "No delivery handler for category" and
|
||||||
|
`return`s without touching the message. No caller in the codebase ever calls
|
||||||
|
`RegisterDeliveryHandler` (the External System Gateway, Notification Service and
|
||||||
|
Database Gateway only call `EnqueueAsync`), so in the current wiring **every** buffered
|
||||||
|
message lands in this dead state: it is persisted, counts toward buffer depth, but can
|
||||||
|
never be retried, delivered or parked. It will sit Pending forever. Either the handler
|
||||||
|
registration is missing from Host/gateway startup, or the "buffer for later" path is a
|
||||||
|
silent trap. Either way the engine cannot deliver anything.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Decide the intended contract. If handlers are expected to be registered before
|
||||||
|
`EnqueueAsync` is reachable, make `EnqueueAsync` reject (or log an error) when no
|
||||||
|
handler exists rather than silently buffering an undeliverable message, and wire
|
||||||
|
`RegisterDeliveryHandler` calls in Host startup for all three categories. If late
|
||||||
|
registration is intended, the retry sweep should treat a still-missing handler as a
|
||||||
|
transient condition with bounded logging rather than a permanent no-op.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-003 — Off-by-one in retry accounting: immediate failure pre-counts as retry 1
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:153`, `:229`, `:233` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
On a transient immediate-delivery failure, `EnqueueAsync` buffers the message with
|
||||||
|
`message.RetryCount = 1` (line 153). The retry sweep then increments `RetryCount` before
|
||||||
|
the max check (`RetryCount++` at line 229; `RetryCount >= MaxRetries` at line 233).
|
||||||
|
Consequences: (1) a message configured with `MaxRetries = 1` is parked on the *first*
|
||||||
|
retry sweep without ever being retried, because after the immediate attempt `RetryCount`
|
||||||
|
is already 1 and the first sweep makes it 2 ≥ 1 — zero actual retries occur, contradicting
|
||||||
|
the design intent that the immediate attempt and the retry budget are distinct;
|
||||||
|
(2) the design doc's `Retry Count` field is "Number of attempts so far," but here it is
|
||||||
|
seeded to 1 before any *retry* has happened, making the parked-message `AttemptCount`
|
||||||
|
shown to operators off by one relative to configured `MaxRetries`. The
|
||||||
|
`EnqueueAsync_TransientFailure_BuffersForRetry` test even asserts `RetryCount == 1`,
|
||||||
|
locking in the ambiguity.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Choose one consistent meaning for `RetryCount` (recommended: total delivery attempts,
|
||||||
|
including the immediate one) and apply it uniformly. If `MaxRetries` is meant to bound
|
||||||
|
*retries* after the immediate attempt, buffer with `RetryCount = 0` and treat the
|
||||||
|
immediate failure as attempt 0; if it bounds *total attempts*, document that and adjust
|
||||||
|
the comparison. Update the affected test to match the chosen semantics.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-004 — `RegisterDeliveryHandler` XML doc contradicts the implemented contract
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:38`, `:60` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The XML comment on the handler delegate (lines 37–40) says "Returns true on success,
|
||||||
|
throws on transient failure. Permanent failures should return false (message will NOT
|
||||||
|
be buffered)." That last clause is wrong for the retry path: in `RetryMessageAsync`,
|
||||||
|
a handler returning `false` cannot mean "not buffered" — the message is already buffered, and
|
||||||
|
the code *parks* it immediately (lines 218–224). The comment describes only the
|
||||||
|
`EnqueueAsync` immediate path and misleads anyone implementing a handler about what
|
||||||
|
`false` means once a message is in the retry loop.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Reword the contract to cover both paths explicitly: `true` = delivered (remove from
|
||||||
|
buffer); `false` = permanent failure (not buffered on immediate attempt, parked on a
|
||||||
|
retry); exception = transient failure (buffer / increment retry).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-005 — Parked-message retry/discard can race with the in-progress retry sweep
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:184`, `:266`, `:280` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`RetryPendingMessagesAsync` loads a snapshot of due messages (line 179) and then
|
||||||
|
processes them one by one (line 184), `await`-ing delivery for each. Meanwhile
|
||||||
|
`RetryParkedMessageAsync` / `DiscardParkedMessageAsync` (operator actions arriving via
|
||||||
|
`ParkedMessageHandlerActor`) run on unrelated threads and mutate the same rows. Because
|
||||||
|
each operation opens its own SQLite connection and there is no row-level coordination,
|
||||||
|
an operator can `DiscardParkedMessageAsync` a message that the sweep is concurrently
|
||||||
|
delivering: the sweep's later `RemoveMessageAsync`/`UpdateMessageAsync` operates on a
|
||||||
|
now-deleted row (harmless) — but if an operator's `RetryParkedMessageAsync` call resets a row
|
||||||
|
to Pending while the sweep simultaneously parks the same in-flight message, the operator
|
||||||
|
intent is silently overwritten. The `Interlocked` guard only prevents *overlapping
|
||||||
|
sweeps*, not sweep-vs-management races.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Funnel all message-state mutations through a single serialization point — e.g. perform
|
||||||
|
all S&F state changes inside the `ParkedMessageHandlerActor` (or a dedicated S&F actor)
|
||||||
|
so the actor mailbox serialises them, or make status transitions conditional in SQL
|
||||||
|
(e.g. `UPDATE ... WHERE id = @id AND status = @expected`) and re-check the affected
|
||||||
|
row count.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-006 — `GetParkedMessagesAsync` count and page run without a transaction
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:166`, `:175` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`GetParkedMessagesAsync` issues a `COUNT(*)` and then a separate paged `SELECT` as two
|
||||||
|
commands on the same connection with no surrounding transaction. A concurrent
|
||||||
|
enqueue/park/discard between the two statements yields a `TotalCount` inconsistent with
|
||||||
|
the returned page (e.g. total reported as 51 while only 50 distinct parked rows now
|
||||||
|
exist, or a row visible in the page but excluded from the count). For a paginated UI
|
||||||
|
this produces flickering totals and occasional off-by-one page math.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Wrap both reads in a single transaction (`BeginTransaction`) so they see a consistent
|
||||||
|
snapshot, or accept the staleness and document it. A transaction is cheap here and
|
||||||
|
removes the inconsistency.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-007 — Async work in `ParkedMessageHandlerActor` uses `ContinueWith` without scheduler/affinity guarantees
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Akka.NET conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/ParkedMessageHandlerActor.cs:34`, `:68`, `:87` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The three handlers call a `Task`-returning service method and chain `.ContinueWith(...)
|
||||||
|
.PipeTo(sender)`. `Sender` is correctly captured into a local first, so the closure is
|
||||||
|
safe. However `ContinueWith` without an explicit `TaskScheduler` runs the continuation
|
||||||
|
on a thread-pool thread and the captured continuation builds the response objects there
|
||||||
|
— acceptable since it only touches locals, but it bypasses the idiomatic
|
||||||
|
`PipeTo`-with-success/failure-projection pattern and is fragile if someone later adds a
|
||||||
|
line touching actor state inside the continuation. There is also no `TaskContinuationOptions`,
|
||||||
|
so a faulted antecedent still runs the continuation (handled here via `IsCompletedSuccessfully`,
|
||||||
|
but only by convention).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Replace `ContinueWith(...).PipeTo(sender)` with `PipeTo(sender, success: result => ...,
|
||||||
|
failure: ex => ...)`, which is the documented Akka pattern, keeps response construction
|
||||||
|
off the actor thread safely, and makes the success/failure branches explicit.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-008 — A SQLite connection is opened and torn down on every storage call
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:28`, `:61`, `:93`, `:117`, `:144`, `:162`, `:199`, `:221`, `:237`, `:267`, `:285`, `:305`, `:319` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Every method in `StoreAndForwardStorage` constructs a fresh `SqliteConnection` and calls
|
||||||
|
`OpenAsync`. Microsoft.Data.Sqlite pools connections, so this is not a correctness bug,
|
||||||
|
but a retry sweep over a large buffer performs one open per `UpdateMessageAsync`/
|
||||||
|
`RemoveMessageAsync` call inside the loop (`RetryMessageAsync`), multiplying connection
|
||||||
|
churn under load. With no max buffer size (by design) the buffer can grow large, so the
|
||||||
|
per-message connection acquisition is a measurable overhead on the hot retry path.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Consider a batched retry API that opens one connection (and one transaction) per sweep,
|
||||||
|
or pass an open connection into the per-message update calls. At minimum, document that
|
||||||
|
the design relies on the Sqlite connection pool for acceptable performance.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-009 — `OnActivity` event invocation is not thread-safe against concurrent subscribe/unsubscribe
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Concurrency & thread safety |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:46`, `:309` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`OnActivity` is a public `event Action<...>` raised via `OnActivity?.Invoke(...)` in
|
||||||
|
`RaiseActivity` (line 309). `RaiseActivity` is called from both `EnqueueAsync` (caller
|
||||||
|
thread) and `RetryMessageAsync` (timer thread). The `?.Invoke` null-conditional captures
|
||||||
|
the delegate once so it will not NRE, but there is no synchronisation around the event
|
||||||
|
field itself; a subscriber added/removed concurrently with a raise has no defined
|
||||||
|
ordering. More importantly, subscriber callbacks run synchronously on the timer thread,
|
||||||
|
so a slow or throwing subscriber stalls or aborts the retry sweep (an exception in a
|
||||||
|
subscriber propagates out of `RaiseActivity` into `RetryMessageAsync`'s `try` and is
|
||||||
|
swallowed as a "transient failure," wrongly incrementing the message's retry count).
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Snapshot the delegate (already done) and additionally wrap subscriber invocation in a
|
||||||
|
`try/catch` so a faulting logging subscriber cannot be misclassified as a delivery
|
||||||
|
failure. Document that handlers must be fast and non-throwing, or dispatch activity
|
||||||
|
notifications asynchronously.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-010 — Retry of a parked message does not reset `LastAttemptAt`, so its retry timing is unspecified
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:203`, `:101` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`RetryParkedMessageAsync` sets `status = Pending, retry_count = 0, last_error = NULL`
|
||||||
|
but leaves `last_attempt_at` unchanged (lines 203–206). The retry-due query
|
||||||
|
(`GetMessagesForRetryAsync`, lines 101–105) selects Pending rows where
|
||||||
|
`last_attempt_at IS NULL OR ... elapsed >= retry_interval_ms`. A message parked after
|
||||||
|
exhausting retries has an old `last_attempt_at`; once re-queued, the elapsed time since
|
||||||
|
that stale timestamp is almost certainly already greater than the retry interval, so the
|
||||||
|
operator-retried message is attempted on the very next sweep regardless of the
|
||||||
|
configured interval. That is probably the desired behaviour (operator wants it tried
|
||||||
|
now), but it is unspecified and inconsistent — "try immediately" happens by accident rather
|
||||||
|
than by design, and with a very large `retry_interval_ms` the message could wait instead.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Explicitly decide and encode the intent: either set `last_attempt_at = NULL` on
|
||||||
|
re-queue so the message is unambiguously due now, or set it to "now" so it waits one
|
||||||
|
interval. Document the chosen behaviour in the method's XML comment.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-011 — `StoreAndForwardMessageStatus.InFlight` is unused and the doc's "retrying" status is unmodelled
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.Commons/Types/Enums/StoreAndForwardMessageStatus.cs:9`; `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:219`, `:235` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The enum defines `Pending, InFlight, Parked, Delivered`. The module only ever uses
|
||||||
|
`Pending` and `Parked` — `InFlight` and `Delivered` are never assigned (delivered
|
||||||
|
messages are deleted, not marked `Delivered`). Meanwhile the Component design doc
|
||||||
|
("Message Format" -> Status) specifies the set "Pending, retrying, or parked." So the
|
||||||
|
code's enum drifts from the doc in two directions: it carries dead members the doc does
|
||||||
|
not mention (`InFlight`, `Delivered`) and omits the doc's `retrying` state. A message
|
||||||
|
mid-retry is indistinguishable from one that has never been attempted.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Reconcile the enum with the design. Either drop the unused members and update the doc,
|
||||||
|
or implement the documented `retrying` state and use `InFlight` to mark a message the
|
||||||
|
sweep is actively delivering (which would also help with finding 005).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-012 — `StoreAndForwardMessage` is a persistence entity but lives in the component, not Commons
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardMessage.cs:9` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`StoreAndForwardMessage` is a persistence-ignorant POCO that maps directly to the
|
||||||
|
`sf_messages` table and is also carried across the network inside `ReplicationOperation`
|
||||||
|
(replicated to the standby node over Akka remoting). CLAUDE.md "Code Organization" states
|
||||||
|
that entity classes are persistence-ignorant POCOs in Commons and that message contracts
|
||||||
|
follow additive-only evolution. Because this type doubles as a replication wire contract
|
||||||
|
but lives in the component assembly, it is not co-located with the other Commons
|
||||||
|
entities and its evolution is not governed by the additive-only message-contract rule.
|
||||||
|
This is a borderline case (the type is site-local), but the cross-node use via
|
||||||
|
`ReplicationOperation` makes it a de-facto message contract.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either move `StoreAndForwardMessage` (and `ReplicationOperation`) into the Commons
|
||||||
|
`Entities`/`Messages` hierarchy so they are governed by the contract-evolution rules, or
|
||||||
|
introduce a separate DTO for replication and keep `StoreAndForwardMessage` purely as the
|
||||||
|
local persistence model. Document the decision.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-013 — Critical paths lack test coverage: retry-due timing, replication-from-active, and the actor bridge
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Testing coverage |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `tests/ScadaLink.StoreAndForward.Tests/` (whole directory); `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:101`; `src/ScadaLink.StoreAndForward/ParkedMessageHandlerActor.cs` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The existing tests cover storage CRUD and the service happy/failure paths well, but
|
||||||
|
three important behaviours are untested: (1) the retry-due time filter in
|
||||||
|
`GetMessagesForRetryAsync` — every service test sets `DefaultRetryInterval = TimeSpan.Zero`,
|
||||||
|
so the `julianday` elapsed-time comparison (the most error-prone SQL in the module) is
|
||||||
|
never exercised with a non-zero interval; a message that is *not yet due* should be
|
||||||
|
skipped, and that is never verified. (2) Replication from the active side — no test
|
||||||
|
asserts that an enqueue/remove/park causes a `Replicate*` call (this is exactly the gap
|
||||||
|
behind finding 001; a test would have caught it). (3) `ParkedMessageHandlerActor` has no
|
||||||
|
test at all — the Query/Retry/Discard request-to-response mapping and the
|
||||||
|
`ExtractMethodName` JSON parsing are unverified, including the malformed-JSON branch.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add tests for: a non-zero retry interval where a recently-attempted message is excluded
|
||||||
|
and an older one is included; active-side replication invocation per operation type
|
||||||
|
(once finding 001 is fixed); and `ParkedMessageHandlerActor` using `Akka.TestKit`,
|
||||||
|
including `ExtractMethodName` for `MethodName`, `Subject`, missing-property and
|
||||||
|
invalid-JSON payloads.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### StoreAndForward-014 — Storage does not create its SQLite database directory
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Resolved |
|
||||||
|
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:26` |
|
||||||
|
|
||||||
|
**Found 2026-05-16** while verifying the store-and-forward fixes — this defect was
|
||||||
|
not part of the original baseline review.
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`StoreAndForwardStorage.InitializeAsync` opened a `SqliteConnection` against the
|
||||||
|
configured `SqliteDbPath` (default `./data/store-and-forward.db`) without ensuring the
|
||||||
|
parent directory exists. SQLite creates the database *file* on demand but not its
|
||||||
|
*directory*, so when `data/` does not already exist the connection fails to open with
|
||||||
|
`SQLite Error 14: 'unable to open database file'`. Every site-host boot therefore failed
|
||||||
|
in any environment whose working directory has no `data/` folder — the cause of the six
|
||||||
|
failing `SiteActorPathTests` (the host's `RegisterSiteActors` aborts at
|
||||||
|
`StoreAndForwardService.StartAsync`). Production masked it because `data/` is created by
|
||||||
|
the Docker image / deployment.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Create the parent directory of a file-backed SQLite database before opening it.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
Resolved 2026-05-16. `InitializeAsync` now calls a new `EnsureDatabaseDirectoryExists`
|
||||||
|
helper that parses the connection string with `SqliteConnectionStringBuilder` and, for a
|
||||||
|
file-backed database, creates the parent directory if it is missing (in-memory databases
|
||||||
|
and bare filenames are skipped). Regression test
|
||||||
|
`InitializeAsync_FileInMissingDirectory_CreatesDirectory` fails against the pre-fix code;
|
||||||
|
all six `SiteActorPathTests` now pass. Fixed by the commit whose message references
|
||||||
|
`StoreAndForward-014`.
|
||||||
@@ -0,0 +1,487 @@
|
|||||||
|
# Code Review — TemplateEngine
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.TemplateEngine` |
|
||||||
|
| Design doc | `docs/requirements/Component-TemplateEngine.md` |
|
||||||
|
| Status | Reviewed |
|
||||||
|
| Last reviewed | 2026-05-16 |
|
||||||
|
| Reviewer | claude-agent |
|
||||||
|
| Commit reviewed | `9c60592` |
|
||||||
|
| Open findings | 14 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The Template Engine is a pure central-side modeling library: stateless services
|
||||||
|
over `ITemplateEngineRepository` plus four static helper classes (collision, cycle,
|
||||||
|
lock, resolver). It has no Akka actors and no direct concurrency, so the Akka and
|
||||||
|
thread-safety categories produce nothing of substance. The code is generally
|
||||||
|
well-structured and the cascade-based composition model (derived templates owned by
|
||||||
|
composition slots) is consistently applied. However the review surfaced several real
|
||||||
|
correctness gaps. The most serious are in **flattening**: composed alarms and scripts
|
||||||
|
nested below the first level are silently dropped, derived templates omit base
|
||||||
|
alarms entirely (breaking per-slot alarm override), and the alarm-on-trigger-script
|
||||||
|
resolution step is an empty placeholder so that whole validation rule is dead.
|
||||||
|
Validation has two security-relevant weaknesses — the forbidden-API scan is a naive
|
||||||
|
substring match and the brace-balance "compile" check misjudges verbatim /
|
||||||
|
interpolated / raw string literals. Several documented behaviours (collision check on
|
||||||
|
create, optimistic concurrency on instance state) are claimed but not implemented.
|
||||||
|
Themes: validation that is weaker than the design promises, and asymmetric handling
|
||||||
|
of attributes vs. alarms vs. scripts throughout the resolve/flatten/derive paths.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ✓ | Multiple real bugs: deep composed-member loss, derived alarms omitted, granularity bypass, no-op create-time collision block. |
|
||||||
|
| 2 | Akka.NET conventions | ✓ | No actors in this module (`AddTemplateEngineActors` is an empty placeholder). Nothing to assess. |
|
||||||
|
| 3 | Concurrency & thread safety | ✓ | Services are stateless, scoped per request; static helpers hold no mutable state. Design says template editing is last-write-wins; that is honoured. See TemplateEngine-010 re: a doc claim of optimistic concurrency that is not implemented. |
|
||||||
|
| 4 | Error handling & resilience | ✓ | `Result<T>` used consistently; repository nulls guarded. `FlatteningService` wraps in try/catch. No store-and-forward or failover surface in this module. |
|
||||||
|
| 5 | Security | ✓ | No auth checks in-module (delegated to callers per design). Script trust-model enforcement is weak — see TemplateEngine-006 and TemplateEngine-007. |
|
||||||
|
| 6 | Performance & resource management | ✓ | `GetAllTemplatesAsync` reloaded on most member edits; one genuine N+1 in `TemplateDeletionService` (TemplateEngine-009). No `IDisposable` leaks (`JsonDocument`/streams disposed). |
|
||||||
|
| 7 | Design-document adherence | ✓ | Drift found: recursive composition not fully implemented in flattening; `DataType` enum naming differs from doc; optimistic-concurrency claim. |
|
||||||
|
| 8 | Code organization & conventions | ✓ | POCO entities in Commons, repo interfaces in Commons, Options pattern N/A (no options here). Duplicate deletion logic (TemplateEngine-014). |
|
||||||
|
| 9 | Testing coverage | ✓ | Tests exist for every file, but the dead/placeholder paths (TemplateEngine-004, 005) and deep nesting (TemplateEngine-001) are not exercised. |
|
||||||
|
| 10 | Documentation & comments | ✓ | Mostly accurate; a misleading converter comment (TemplateEngine-011) and a stale enum/doc mismatch (TemplateEngine-012). |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
### TemplateEngine-001 — Deeply nested composed members are dropped during flattening
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:211`, `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:535`, `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:609` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc states composition supports "recursive nesting of feature modules"
|
||||||
|
and that nested paths extend as `[Outer].[Inner].[Member]`. `ResolveComposedAttributes`
|
||||||
|
only descends **one** level of nesting: it resolves the directly-composed module, then
|
||||||
|
its immediate child compositions, and stops. A module composed three or more levels
|
||||||
|
deep contributes no attributes to the flattened configuration. `ResolveComposedAlarms`
|
||||||
|
and `ResolveComposedScripts` are worse — they handle only the first (direct) level and
|
||||||
|
do not descend at all, so any alarm or script in a nested composed module is dropped
|
||||||
|
entirely. `CollisionDetector` and `TemplateResolver` recurse fully, so collision
|
||||||
|
detection and the authoring UI will show members that the deployed configuration
|
||||||
|
silently lacks.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Replace the hand-unrolled one/two-level loops with a single recursive walk
|
||||||
|
(carrying the accumulated path prefix) for attributes, alarms, and scripts, matching
|
||||||
|
the recursion already in `TemplateResolver.AddComposedMembers` and
|
||||||
|
`CollisionDetector.CollectComposedMembers`.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-002 — Derived templates omit all base alarms; composed alarms cannot be overridden per slot
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:799` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`BuildDerivedTemplate` copies the base template's `Attributes` and `Scripts` into the
|
||||||
|
new derived template as `IsInherited = true` placeholder rows so they can be overridden
|
||||||
|
per composition slot, but there is **no loop for `Alarms`**. The derived template
|
||||||
|
therefore has zero alarm rows. The `TemplateAlarm` entity also has no `IsInherited` or
|
||||||
|
`LockedInDerived` fields (unlike `TemplateAttribute` / `TemplateScript`), so even if a
|
||||||
|
copy loop were added there is no mechanism to mark a copied alarm as inherited or to
|
||||||
|
override one. The design's Override Granularity section explicitly requires composed
|
||||||
|
alarm fields (Priority, Trigger thresholds, Description, On-Trigger Script) to be
|
||||||
|
overridable. As written, a composed module's alarms cannot be tuned for the slot they
|
||||||
|
are used in.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Add an alarm copy loop to `BuildDerivedTemplate` and add `IsInherited` /
|
||||||
|
`LockedInDerived` fields to `TemplateAlarm`, mirroring `TemplateAttribute`. Update
|
||||||
|
`UpdateAlarmAsync` to honour them as `UpdateAttributeAsync` / `UpdateScriptAsync`
|
||||||
|
already do.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-003 — `UpdateAttributeAsync` lets a non-locked attribute change its fixed DataType / DataSourceReference
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:285` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`LockEnforcer.ValidateAttributeOverride` correctly rejects a change to `DataType` or
|
||||||
|
`DataSourceReference` (both "fixed by the defining level" per the design). But the
|
||||||
|
caller only honours that error when the attribute is already locked:
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
var granularityError = LockEnforcer.ValidateAttributeOverride(existing, proposed);
|
||||||
|
if (granularityError != null && existing.IsLocked)
|
||||||
|
return Result<TemplateAttribute>.Failure(granularityError);
|
||||||
|
```
|
||||||
|
|
||||||
|
Lines 293-294 then unconditionally apply `existing.DataType = proposed.DataType` and
|
||||||
|
`existing.DataSourceReference = proposed.DataSourceReference`. For the common case of an
|
||||||
|
unlocked attribute, the fixed-field guard is dead and both fields are silently mutable,
|
||||||
|
violating the override-granularity rule. (The lock-error branch of the same helper is
|
||||||
|
also redundant — a locked attribute already returns earlier inside the helper.)
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Remove the `&& existing.IsLocked` condition so the granularity error is always
|
||||||
|
returned, and stop assigning `DataType` / `DataSourceReference` from `proposed` in the
|
||||||
|
apply block.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-004 — Alarm on-trigger script references are never resolved (empty placeholder)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:695` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ResolveAlarmScriptReferences` is invoked as Step 7 of `Flatten` but its body is empty
|
||||||
|
— only a comment describing what it should do. Consequently every
|
||||||
|
`ResolvedAlarm.OnTriggerScriptCanonicalName` stays `null`. This has two downstream
|
||||||
|
effects: (1) `SemanticValidator`'s "on-trigger script must exist" check
|
||||||
|
(`SemanticValidator.cs:209`) can never fire, so the design-mandated validation of
|
||||||
|
alarm on-trigger script references is silently absent; (2) `RevisionHashService` and
|
||||||
|
`DiffService` both hash/compare `OnTriggerScriptCanonicalName`, so a change to which
|
||||||
|
script an alarm triggers never affects the revision hash and is invisible to the diff
|
||||||
|
— a real staleness-detection gap.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Implement the resolution: map each alarm's `OnTriggerScriptId` (set on `TemplateAlarm`)
|
||||||
|
to the canonical name of the corresponding resolved script, accounting for composition
|
||||||
|
prefixes. If the design intends scripts to be referenced by name within scope, document
|
||||||
|
and implement that consistently.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-005 — Collision validation is skipped when creating a child template
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | High |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:56` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`CreateTemplateAsync` contains a block guarded by `if (parentTemplateId.HasValue)` that
|
||||||
|
loads `GetAllTemplatesAsync` and then does nothing but hold a comment — it never runs a
|
||||||
|
collision check. A child template created with a parent inherits the parent's members;
|
||||||
|
if the child is later given members (via `AddAttributeAsync` etc.) those calls do run
|
||||||
|
`CollisionDetector`, but the create path itself performs no naming-collision validation
|
||||||
|
and `UpdateTemplateAsync` only validates collisions on a name change. The design states
|
||||||
|
naming collisions are design-time errors that must block a save. The dead block is also
|
||||||
|
confusing and allocates an unused full-table read.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either run a real collision check on the to-be-created template (including its
|
||||||
|
inherited members) or delete the dead block and its unused query. If create-time
|
||||||
|
collisions are genuinely impossible because a fresh template has no members, document
|
||||||
|
that explicitly instead of leaving a no-op.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-006 — Forbidden-API enforcement is a naive substring scan (bypassable and false-positive prone)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Security |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Validation/ScriptCompiler.cs:21`, `src/ScadaLink.TemplateEngine/Validation/ValidationService.cs:318` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ScriptCompiler.ForbiddenPatterns` is checked with `code.Contains(pattern)`. This is
|
||||||
|
both under- and over-inclusive against the script trust model:
|
||||||
|
- **Bypass:** `using System.IO;` followed by `File.ReadAllText(...)` contains no
|
||||||
|
`System.IO.` token; `using static System.IO.File;`, namespace aliases, and
|
||||||
|
`global::System.IO.File` all evade the literal patterns.
|
||||||
|
- **False positive:** a string literal, comment, or attribute name containing the text
|
||||||
|
`System.IO.` is flagged as a forbidden API even though it is inert.
|
||||||
|
|
||||||
|
The same patterns are reused for trigger-expression validation
|
||||||
|
(`CheckExpressionSyntax`), inheriting the same weakness. The file comment acknowledges
|
||||||
|
this is interim until Roslyn is wired in, but the trust model is security-relevant and
|
||||||
|
the gap should be tracked.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Defer real enforcement to the Roslyn-based compiler (semantic symbol analysis of
|
||||||
|
referenced types/namespaces) rather than text matching. Until then, document the
|
||||||
|
limitation prominently and treat the substring scan as advisory, not authoritative.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-007 — Brace-balance "compilation" misjudges verbatim / interpolated / raw strings
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Validation/ScriptCompiler.cs:54`, `src/ScadaLink.TemplateEngine/SharedScriptService.cs:124` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`ScriptCompiler.TryCompile` tracks string state with a single `inString` flag toggled
|
||||||
|
on `"` and an escaped-quote check of `code[i-1] != '\\'`. It does not understand
|
||||||
|
verbatim strings (`@"..."` where `""` is the escape and `\` is literal), interpolated
|
||||||
|
strings (`$"{...}"` whose braces are code, not text), raw string literals (`"""..."""`),
|
||||||
|
or char literals. A script with a verbatim string containing a brace, an interpolated
|
||||||
|
string, or a `'}'` char literal will be wrongly rejected as having mismatched braces —
|
||||||
|
blocking a valid script from deployment. `SharedScriptService.ValidateSyntax` is even
|
||||||
|
cruder: it counts braces/brackets/parens with no string or comment awareness at all, so
|
||||||
|
any string literal containing one of those characters produces a false syntax error.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Once the Roslyn compiler is available, parse with `CSharpSyntaxTree.ParseText` and
|
||||||
|
inspect diagnostics instead of hand-rolling a tokenizer. If an interim check must
|
||||||
|
remain, at minimum handle verbatim/interpolated/char literals or scope the check down
|
||||||
|
to something that cannot false-positive.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-008 — `SetAlarmOverrideAsync` accepts overrides for unknown / composed alarms with no validation
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Error handling & resilience |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Services/InstanceService.cs:178` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SetAlarmOverrideAsync` looks up the alarm by name among the template's **direct**
|
||||||
|
alarms only. When the lookup returns `null` — which is the case for every composed
|
||||||
|
(path-qualified) alarm as well as for a genuinely non-existent name — the method skips
|
||||||
|
the lock check and proceeds to persist the override. This means: (1) an override can be
|
||||||
|
created for an alarm that does not exist (a silent dead record), and (2) a composed
|
||||||
|
alarm that is `IsLocked` at the template level can be overridden, bypassing the lock
|
||||||
|
rule. `SetAttributeOverrideAsync` by contrast rejects unknown attribute names. The
|
||||||
|
inline comment acknowledges the gap but the behaviour is inconsistent and risky.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Resolve the full effective alarm set (via the resolver / flattening) so composed
|
||||||
|
alarms are found, reject overrides whose canonical name is not in that set, and apply
|
||||||
|
the lock check to composed alarms too.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-009 — N+1 query in `TemplateDeletionService.CanDeleteTemplateAsync`
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Performance & resource management |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Services/TemplateDeletionService.cs:75` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Check 3 ("other templates compose it directly") loads all templates and then issues a
|
||||||
|
separate `GetCompositionsByTemplateIdAsync` call **inside a loop over every template**
|
||||||
|
— one round-trip per template in the database. The composition information needed is
|
||||||
|
already reachable via `t.Compositions` on the templates returned by
|
||||||
|
`GetAllTemplatesAsync` (which `TemplateService.DeleteTemplateAsync` uses for the
|
||||||
|
equivalent check at line 162). The loop scales linearly with the template count on
|
||||||
|
every delete-precheck and every actual delete.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Use the `Compositions` navigation already loaded by `GetAllTemplatesAsync`, or add a
|
||||||
|
single repository call that returns all compositions, rather than querying per
|
||||||
|
template.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-010 — `InstanceService` documents optimistic concurrency that is not implemented
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Medium |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Services/InstanceService.cs:9` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The class summary states instances support "Enabled/disabled state with optimistic
|
||||||
|
concurrency". `EnableAsync`, `DisableAsync`, `AssignToAreaAsync` and the override/binding
|
||||||
|
mutators all perform a plain read-modify-write with no version token, `RowVersion`, or
|
||||||
|
concurrency check. Two concurrent enable/disable requests resolve as last-writer-wins with no
|
||||||
|
detection. Either the doc is stale (the design's optimistic-concurrency decision
|
||||||
|
applies to *deployment status records*, not instance state) or a concurrency token was
|
||||||
|
intended and is missing.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
If last-write-wins is acceptable for instance state, correct the XML doc. If optimistic
|
||||||
|
concurrency is required, add a concurrency token to `Instance` and surface a conflict
|
||||||
|
result.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-011 — `SortedPropertiesConverterFactory` is dead code with a misleading comment
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Documentation & comments |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Flattening/RevisionHashService.cs:136` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`SortedPropertiesConverterFactory.CanConvert` always returns `false` and
|
||||||
|
`CreateConverter` always returns `null`, so the factory registered in
|
||||||
|
`CanonicalJsonOptions` does nothing. The class comment claims it "ensures properties are
|
||||||
|
serialized in alphabetical order for deterministic output", and the options comment says
|
||||||
|
"Ensure consistent ordering" — both are false. Determinism actually relies entirely on
|
||||||
|
the `Hashable*` records being hand-declared with alphabetically-ordered properties (plus
|
||||||
|
camelCase). That works today but is fragile: a future contributor adding a property out
|
||||||
|
of alphabetical order silently changes every revision hash, and the dead converter gives
|
||||||
|
false confidence that ordering is enforced programmatically.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Either implement the converter to genuinely sort properties, or delete it and replace
|
||||||
|
the comments with an explicit note that determinism depends on the manual property
|
||||||
|
ordering of the `Hashable*` records (ideally enforced by a test).
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-012 — `DataType` enum naming diverges from the design doc
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Design-document adherence |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/Validation/SemanticValidator.cs:18` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
The design doc (Attribute section) lists data types as "Boolean, Integer, Float,
|
||||||
|
String". The actual `DataType` enum is `Boolean, Int32, Float, Double, DateTime,
|
||||||
|
Binary`. `SemanticValidator.NumericDataTypes` correctly hard-codes the real names
|
||||||
|
(`Int32`, `Float`, `Double`), so the code is internally consistent, but the design doc
|
||||||
|
is stale — it omits `Double`, `DateTime`, `Binary` and calls the integer type
|
||||||
|
"Integer". This makes the doc an unreliable reference for which trigger-operand types
|
||||||
|
are numeric.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Update `docs/requirements/Component-TemplateEngine.md` to list the actual enum members,
|
||||||
|
or rename the enum to match the doc if "Integer" is the intended canonical name.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-013 — `ToDictionary(t => t.Id)` throws on duplicate IDs; cycle detectors overload Id 0 as a sentinel
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Correctness & logic bugs |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/CycleDetector.cs:30`, `src/ScadaLink.TemplateEngine/CycleDetector.cs:38` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
Across the static helpers, `allTemplates.ToDictionary(t => t.Id)` is used freely; if the
|
||||||
|
caller ever passes a list containing two templates with the same `Id` (e.g. a
|
||||||
|
not-yet-saved template assigned `Id == 0`, or duplicated input) the call throws an
|
||||||
|
unhandled `ArgumentException` rather than returning a `Result` failure. Separately,
|
||||||
|
`CycleDetector` uses `0` as the "no parent" sentinel (`currentId != 0`,
|
||||||
|
`ParentTemplateId ?? 0`) and `DetectInheritanceCycle` / `DetectCrossGraphCycle` ignore a
|
||||||
|
proposed parent/composed id of `0`. EF identity keys start at 1 so this is currently
|
||||||
|
benign, but the overload is fragile — an in-memory or test template with `Id == 0`
|
||||||
|
would be treated as "no template" and cycle checks would be silently skipped.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Guard the dictionary builds (or use a grouping/`ToLookup`) and validate input, and use
|
||||||
|
`int?`/`-1` rather than `0` as the no-parent sentinel so a real id of 0 is never
|
||||||
|
special.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
|
||||||
|
### TemplateEngine-014 — Template-deletion constraint logic is duplicated and divergent
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Low |
|
||||||
|
| Category | Code organization & conventions |
|
||||||
|
| Status | Open |
|
||||||
|
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:109`, `src/ScadaLink.TemplateEngine/Services/TemplateDeletionService.cs:27` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
`TemplateService.DeleteTemplateAsync` and `TemplateDeletionService.CanDeleteTemplateAsync`
|
||||||
|
both implement the "can this template be deleted" rules (instances, child templates,
|
||||||
|
derived templates, composing templates). The two implementations have already drifted:
|
||||||
|
`TemplateService` reads composing templates from the in-memory `t.Compositions`
|
||||||
|
navigation while `TemplateDeletionService` issues per-template
|
||||||
|
`GetCompositionsByTemplateIdAsync` calls (see TemplateEngine-009), they format error
|
||||||
|
messages differently, and `TemplateService` returns on the first failing category while
|
||||||
|
`TemplateDeletionService` accumulates all of them. A future rule change must be made in
|
||||||
|
two places or behaviour will diverge further.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Make `TemplateService.DeleteTemplateAsync` delegate to `TemplateDeletionService` (or
|
||||||
|
vice versa) so the constraint logic lives in exactly one place.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
@@ -0,0 +1,67 @@
|
|||||||
|
# Code Review — <Module>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Template for a module review. Copy the structure below into
|
||||||
|
code-reviews/<Module>/findings.md and fill it in.
|
||||||
|
See ../REVIEW-PROCESS.md for the full process.
|
||||||
|
-->
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Module | `src/ScadaLink.<Module>` |
|
||||||
|
| Design doc | `docs/requirements/Component-<Name>.md` |
|
||||||
|
| Status | Not yet reviewed \| In progress \| Reviewed |
|
||||||
|
| Last reviewed | YYYY-MM-DD |
|
||||||
|
| Reviewer | <name> |
|
||||||
|
| Commit reviewed | `<short SHA>` |
|
||||||
|
| Open findings | 0 |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
One short paragraph: overall health of the module, themes across findings, and
|
||||||
|
anything notable that is not a finding.
|
||||||
|
|
||||||
|
## Checklist coverage
|
||||||
|
|
||||||
|
Confirm every category was examined. Record "No issues found" where applicable.
|
||||||
|
|
||||||
|
| # | Category | Examined | Notes |
|
||||||
|
|---|----------|----------|-------|
|
||||||
|
| 1 | Correctness & logic bugs | ☐ | |
|
||||||
|
| 2 | Akka.NET conventions | ☐ | |
|
||||||
|
| 3 | Concurrency & thread safety | ☐ | |
|
||||||
|
| 4 | Error handling & resilience | ☐ | |
|
||||||
|
| 5 | Security | ☐ | |
|
||||||
|
| 6 | Performance & resource management | ☐ | |
|
||||||
|
| 7 | Design-document adherence | ☐ | |
|
||||||
|
| 8 | Code organization & conventions | ☐ | |
|
||||||
|
| 9 | Testing coverage | ☐ | |
|
||||||
|
| 10 | Documentation & comments | ☐ | |
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
<!-- One entry per finding. Copy the block below. Never delete a finding; close it
|
||||||
|
by changing Status and completing Resolution. -->
|
||||||
|
|
||||||
|
### <Module>-001 — <Short title>
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|--|--|
|
||||||
|
| Severity | Critical \| High \| Medium \| Low |
|
||||||
|
| Category | <one of the 10 checklist categories> |
|
||||||
|
| Status | Open \| In Progress \| Resolved \| Won't Fix \| Deferred |
|
||||||
|
| Location | `src/ScadaLink.<Module>/<File>.cs:<line>` |
|
||||||
|
|
||||||
|
**Description**
|
||||||
|
|
||||||
|
What is wrong and why it matters.
|
||||||
|
|
||||||
|
**Recommendation**
|
||||||
|
|
||||||
|
Concrete suggested fix.
|
||||||
|
|
||||||
|
**Resolution**
|
||||||
|
|
||||||
|
_Unresolved._
|
||||||
|
<!-- When closed: fixing commit `<SHA>`, date YYYY-MM-DD, one-line description.
|
||||||
|
For Won't Fix / Deferred, justify the decision here. -->
|
||||||
Executable
+179
@@ -0,0 +1,179 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Regenerate code-reviews/README.md from the per-module findings.md files.
|
||||||
|
|
||||||
|
The findings files are the source of truth; README.md is a generated index.
|
||||||
|
Run this after resolving or re-triaging a finding so the aggregated tables stay
|
||||||
|
in sync (see REVIEW-PROCESS.md section 5).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 regen-readme.py # rewrite README.md
|
||||||
|
python3 regen-readme.py --check # exit 1 if README.md is stale (for CI)
|
||||||
|
|
||||||
|
Works from any directory — paths are resolved relative to this script.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Directory containing this script; every path the script touches is resolved against it.
BASE = os.path.dirname(os.path.abspath(__file__))
# Sort rank per severity label — a lower rank sorts earlier.
SEVERITY_ORDER = {
    label: rank
    for rank, label in enumerate(("Critical", "High", "Medium", "Low"))
}
# Status values that count a finding as still outstanding.
PENDING_STATUSES = {"Open", "In Progress"}
|
||||||
|
|
||||||
|
|
||||||
|
def discover_modules():
    """Return the sorted names of the module review folders under BASE.

    A directory entry is a module when it contains a ``findings.md`` file.
    Entries whose name starts with ``_`` or ``.`` are ignored, which keeps
    the ``_template`` folder out of the result.
    """
    return [
        entry
        for entry in sorted(os.listdir(BASE))
        if not entry.startswith(("_", "."))
        and os.path.isfile(os.path.join(BASE, entry, "findings.md"))
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_findings(module):
    """Parse one module's findings.md into (module, id, severity, title, status) tuples.

    The file is split on level-3 headings (``### ``). Each heading must begin
    with a finding id such as ``TemplateEngine-006``; the rest of the heading,
    minus a leading dash separator, is the title. Pipe characters in the title
    are backslash-escaped so the title can be placed in a Markdown table cell.
    Severity and Status are taken from the first matching table rows in the
    finding's block.

    Raises SystemExit when a heading cannot be parsed or when a finding has no
    Severity or Status row.
    """
    path = os.path.join(BASE, module, "findings.md")
    # The findings files contain non-ASCII punctuation (em dashes, arrows), so
    # decode as UTF-8 explicitly instead of relying on the locale default, and
    # close the handle deterministically.
    with open(path, encoding="utf-8") as handle:
        text = handle.read()

    findings = []
    for block in re.split(r"^### ", text, flags=re.M)[1:]:
        # A bare trailing "### " yields an empty block; report it as an
        # unparseable heading rather than failing with IndexError.
        block_lines = block.splitlines()
        head = block_lines[0].strip() if block_lines else ""
        m = re.match(r"([A-Za-z][A-Za-z0-9]*-\d+)\b(.*)", head)
        if not m:
            raise SystemExit(f"{module}/findings.md: unparseable finding heading: {head!r}")
        fid = m.group(1).strip()
        title = m.group(2).strip().lstrip("—–-").strip().replace("|", "\\|")
        sev = re.search(r"\|\s*Severity\s*\|\s*([A-Za-z]+)", block)
        # Apostrophes (straight or typographic) are accepted so the documented
        # "Won't Fix" status parses instead of being reported as missing.
        status = re.search(r"\|\s*Status\s*\|\s*([A-Za-z'’ ]+?)\s*\|", block)
        if not sev or not status:
            raise SystemExit(f"{module}/findings.md: {fid} is missing a Severity or Status field")
        findings.append((module, fid, sev.group(1), title, status.group(1).strip()))
    return findings
|
||||||
|
|
||||||
|
|
||||||
|
def finding_number(finding):
    """Return the numeric suffix of a finding's id (tuple element 1) as an int."""
    finding_id = finding[1]
    suffix = re.search(r"-(\d+)$", finding_id)
    return int(suffix.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def build_readme(modules, per_module):
    """Render the full text of the aggregated README.md.

    ``modules`` is the ordered list of module names and ``per_module`` maps
    each name to its ``(module, id, severity, title, status)`` tuples. Only
    findings whose status is in ``PENDING_STATUSES`` feed the severity totals,
    the per-module open counts and the pending tables; the per-module "Total"
    column counts every finding regardless of status.

    Returns the document as a single newline-joined string.
    """
    severities = ("Critical", "High", "Medium", "Low")

    def sort_key(item):
        # Severity rank first (unknown severities last), then module, then finding number.
        return (SEVERITY_ORDER.get(item[2], 9), item[0], finding_number(item))

    pending = []
    for module_findings in per_module.values():
        for item in module_findings:
            if item[4] in PENDING_STATUSES:
                pending.append(item)
    pending.sort(key=sort_key)

    # Pending findings grouped per severity, keeping the sorted order.
    pending_by_severity = {
        sev: [item for item in pending if item[2] == sev] for sev in severities
    }

    # Static preamble: introduction, process summary, layout and baseline note.
    out = [
        "# Code Reviews",
        "",
        "Comprehensive, per-module code reviews of the ScadaLink codebase. Each module (one",
        "buildable project under `src/`) has its own folder containing a `findings.md`. This",
        "README is the aggregated index — the single place to see all outstanding work.",
        "",
        "> Generated by `regen-readme.py` from the per-module `findings.md` files. Do not",
        "> edit by hand — edit the findings files and re-run the script.",
        "",
        "## How it works",
        "",
        "- Reviews are performed one module at a time against a fixed checklist.",
        "- Every finding is recorded in the module's `findings.md` with a severity and status.",
        "- Findings are **never deleted** — they are closed by changing their status, keeping",
        " a full audit trail.",
        "- This README aggregates every **pending** finding (`Open` / `In Progress`) across all",
        " modules.",
        "",
        "See **[REVIEW-PROCESS.md](REVIEW-PROCESS.md)** for the full procedure: the review",
        "checklist, severity definitions, finding format, and how to mark items resolved.",
        "",
        "## Layout",
        "",
        "```",
        "code-reviews/",
        "├── README.md # this file — process overview + pending findings",
        "├── REVIEW-PROCESS.md # how to perform a review and track findings",
        "├── regen-readme.py # regenerates this README from the findings files",
        "├── _template/findings.md # copy-this template for a module review",
        "└── <Module>/findings.md # one folder per src/ project",
        "```",
        "",
        "## Baseline review — 2026-05-16",
        "",
        "All 19 modules were reviewed at commit `9c60592` (241 findings: 6 Critical, 46 High,",
        "100 Medium, 89 Low). The tables below track what remains **open** as findings are",
        "resolved and re-triaged; findings discovered after the baseline are appended to their",
        "module file and counted in **Total**.",
        "",
        "| Severity | Open findings |",
        "|----------|---------------|",
    ]

    # Open-findings-by-severity summary table.
    out.extend(f"| {sev} | {len(pending_by_severity[sev])} |" for sev in severities)
    out.append(f"| **Total** | **{len(pending)}** |")

    # Per-module status table.
    out.extend([
        "",
        "## Module Status",
        "",
        "| Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total |",
        "|--------|---------------|--------|----------------|------|-------|",
    ])
    for module in modules:
        module_findings = per_module[module]
        crit, high, med, low = (
            sum(1 for item in module_findings
                if item[2] == sev and item[4] in PENDING_STATUSES)
            for sev in severities
        )
        out.append(
            f"| [{module}]({module}/findings.md) | 2026-05-16 | `9c60592` "
            f"| {crit}/{high}/{med}/{low} "
            f"| {crit + high + med + low} | {len(module_findings)} |"
        )

    # One section per severity listing the pending findings.
    out.extend([
        "",
        "## Pending Findings",
        "",
        "Every `Open` / `In Progress` finding across all modules, highest severity first.",
        "Resolved findings drop off this list but remain recorded in their module's",
        "`findings.md` (see [REVIEW-PROCESS.md](REVIEW-PROCESS.md) §4–§5). Full detail —",
        "description, location, recommendation — lives in the module's `findings.md`.",
        "",
    ])
    for sev in severities:
        rows = pending_by_severity[sev]
        out.extend([f"### {sev} ({len(rows)})", ""])
        if rows:
            out.extend(["| ID | Module | Title |", "|----|--------|-------|"])
            out.extend(
                f"| {fid} | [{module}]({module}/findings.md) | {title} |"
                for module, fid, _, title, _ in rows
            )
        else:
            out.append("_None open._")
        out.append("")

    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Discovers the module folders, parses their findings and renders the
    README text. With ``--check`` on the command line the existing README.md
    is compared against the rendered text: a mismatch (or a missing file)
    prints a hint and exits with status 1, a match prints a summary. Without
    ``--check`` README.md is overwritten and a summary is printed.
    """
    check = "--check" in sys.argv[1:]
    modules = discover_modules()
    per_module = {m: parse_findings(m) for m in modules}
    content = build_readme(modules, per_module)

    readme_path = os.path.join(BASE, "README.md")
    pending = sum(1 for fs in per_module.values()
                  for f in fs if f[4] in PENDING_STATUSES)
    total = sum(len(fs) for fs in per_module.values())

    if check:
        current = ""
        if os.path.exists(readme_path):
            # Read with the same explicit encoding the file is written with so
            # the comparison does not depend on the platform locale.
            with open(readme_path, encoding="utf-8") as handle:
                current = handle.read()
        if current != content:
            print("README.md is stale — run: python3 code-reviews/regen-readme.py")
            sys.exit(1)
        print(f"README.md is up to date ({pending} pending / {total} total).")
        return

    # The generated text contains box-drawing and other non-ASCII characters;
    # write UTF-8 explicitly (a locale default such as cp1252 cannot encode
    # them) and close the handle so the content is flushed deterministically.
    with open(readme_path, "w", encoding="utf-8") as handle:
        handle.write(content)
    print(f"README.md regenerated — {pending} pending, {total} total findings "
          f"across {len(modules)} modules.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,114 @@
|
|||||||
|
# Expression Trigger for Template Scripts and Alarms — Design
|
||||||
|
|
||||||
|
**Date:** 2026-05-16
|
||||||
|
**Status:** Approved (brainstorming) — implementation plan to follow
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Template scripts and template alarms can only be triggered by single-attribute
|
||||||
|
conditions. Scripts support `Interval`, `ValueChange`, `Conditional`
|
||||||
|
(`{attributeName, operator, threshold}` — one attribute, numeric compare),
|
||||||
|
and `Call`. Alarms support `ValueMatch`, `RangeViolation`, `RateOfChange`, and
|
||||||
|
`HiLo` — all single-attribute. There is no way to trigger on a relationship
|
||||||
|
between *multiple* attributes (e.g. "speed is high *and* mode is Run").
|
||||||
|
|
||||||
|
This design adds an **Expression trigger**: a user-supplied read-only boolean
|
||||||
|
C# expression, evaluated whenever an instance attribute updates, that fires the
|
||||||
|
script / activates the alarm when it returns true. It generalizes the existing
|
||||||
|
single-attribute `Conditional` trigger.
|
||||||
|
|
||||||
|
### Decisions taken during brainstorming
|
||||||
|
|
||||||
|
- The trigger is a **read-only boolean expression** — no `External`/`Database`/
|
||||||
|
`Notify`/`CallScript` side effects. It must be cheap and safe to run on every
|
||||||
|
attribute update.
|
||||||
|
- **Scripts fire edge-triggered** — once per `false→true` transition.
|
||||||
|
- **Alarms are level-based** — active while the expression is true, clear when
|
||||||
|
false (consistent with all existing alarm trigger types).
|
||||||
|
- **Evaluation approach B** — compile against a *restricted read-only globals
|
||||||
|
type*, so read-only is enforced, not merely conventional. Reuses the existing
|
||||||
|
Roslyn compilation pipeline.
|
||||||
|
|
||||||
|
## Design
|
||||||
|
|
||||||
|
### 1. Trigger model & storage
|
||||||
|
|
||||||
|
- **Scripts:** `TemplateScript.TriggerType` (`string?`) gains the value
|
||||||
|
`"Expression"`. `TriggerConfiguration` JSON is `{ "expression": "<C#>" }`.
|
||||||
|
- **Alarms:** `AlarmTriggerType` enum gains a member `Expression`.
|
||||||
|
`TriggerConfiguration` JSON is the same `{ "expression": "<C#>" }`.
|
||||||
|
- The expression is a bare C# boolean expression (no `return` keyword — Roslyn
|
||||||
|
scripting returns the trailing expression's value), e.g.
|
||||||
|
`Attributes["Speed"] > 1000 && (string)Attributes["Mode"] == "Run"`.
|
||||||
|
- Entity types unchanged: both `TriggerConfiguration` fields stay `string?`.
|
||||||
|
|
||||||
|
Adding the `AlarmTriggerType` member touches three switch sites:
|
||||||
|
`AlarmActor.ParseEvalConfig`, `AlarmActor.HandleAttributeValueChanged`,
|
||||||
|
`AlarmTriggerConfigCodec`.
|
||||||
|
|
||||||
|
### 2. Runtime evaluation
|
||||||
|
|
||||||
|
- **`TriggerExpressionGlobals`** (new, `ScadaLink.SiteRuntime`) — a read-only
|
||||||
|
globals type exposing only `Attributes["X"]`, `Children["C"].Attributes["X"]`,
|
||||||
|
and `Parent.Attributes["X"]`, backed by an in-memory snapshot dictionary. No
|
||||||
|
side-effecting APIs. A missing attribute reads as `null` (never throws).
|
||||||
|
- The expression is compiled once via the existing Roslyn pipeline (same
|
||||||
|
forbidden-API trust checks) against `TriggerExpressionGlobals`; the compiled
|
||||||
|
delegate is cached on the actor.
|
||||||
|
- **Attribute snapshot:** `ScriptActor` and `AlarmActor` already receive every
|
||||||
|
`AttributeValueChanged`. Each keeps a local `Dictionary<string,object?>`
|
||||||
|
snapshot — seeded from the instance's initial attribute set at startup, then
|
||||||
|
updated on each change. The expression evaluates against the snapshot — no
|
||||||
|
`Ask` back to the `InstanceActor`; cheap and re-entrancy-free.
|
||||||
|
- **On each `AttributeValueChanged`:** update snapshot → run cached expression
|
||||||
|
→ `bool`.
|
||||||
|
- **Script (edge):** track the previous result; on `false→true`, run the
|
||||||
|
script (spawn `ScriptExecutionActor`, as the other triggers do).
|
||||||
|
- **Alarm (level):** the `bool` feeds the existing binary Normal↔Active state
|
||||||
|
machine — raise on `→Active`, clear on `→Normal`.
|
||||||
|
- Cost per attribute update: one cached-delegate call + one bool compare.
|
||||||
|
|
||||||
|
### 3. Editors & analysis
|
||||||
|
|
||||||
|
- **`ScriptTriggerEditor`:** add `Expression` to `ScriptTriggerKind` and
|
||||||
|
`ScriptTriggerConfigCodec` (round-trips `{ expression }`).
|
||||||
|
- **`AlarmTriggerEditor`:** add an `Expression` case to its trigger `@switch`.
|
||||||
|
- Both render the same **expression panel**: a compact `MonacoEditor`
|
||||||
|
(~120 px) with C# syntax, `Attributes["..."]` completion driven by the
|
||||||
|
template's attribute metadata (self / children / parent), and live compile
|
||||||
|
diagnostics. A one-line hint summarizes what fires.
|
||||||
|
- **Analysis:** reuse the existing `Template` analysis kind — completion and
|
||||||
|
diagnostics work with no new analyzer code. Editor completion is slightly
|
||||||
|
permissive (also shows `Instance`/`External`), but the runtime's restricted
|
||||||
|
`TriggerExpressionGlobals` is what enforces read-only. A dedicated strict
|
||||||
|
analysis kind is a possible later refinement, out of scope here.
|
||||||
|
|
||||||
|
### 4. Error handling & validation
|
||||||
|
|
||||||
|
- **Pre-deployment:** extend `ValidationService` to compile-check expression
|
||||||
|
triggers (against `TriggerExpressionGlobals`); compile errors block
|
||||||
|
deployment and surface like other validation errors. Unknown
|
||||||
|
`Attributes["..."]` keys are flagged the same way the existing trigger-reference
|
||||||
|
validation does.
|
||||||
|
- **Runtime — expression throws:** caught; treated as `false` for that update;
|
||||||
|
a script-error event is written to the site event log. The actor never
|
||||||
|
crashes.
|
||||||
|
- **Non-bool result:** treated as `false` and logged.
|
||||||
|
- **Missing attribute:** reads as `null` (handled in `TriggerExpressionGlobals`).
|
||||||
|
- **Blank expression:** the trigger is inert; validation emits a warning.
|
||||||
|
|
||||||
|
### 5. Testing & verification
|
||||||
|
|
||||||
|
- **Unit:** codec round-trip for script and alarm `{ expression }`; expression
|
||||||
|
compile (valid + invalid).
|
||||||
|
- **Runtime:** deploy an instance with an expression-triggered script and
|
||||||
|
alarm; drive attribute updates (bound Test Run / CLI); confirm the script
|
||||||
|
fires only on `false→true` and the alarm raises/clears with the expression.
|
||||||
|
- **UI:** the expression panel in both editors; save → reopen round-trip.
|
||||||
|
|
||||||
|
## Implementation tasks
|
||||||
|
|
||||||
|
- #25 — Implement expression trigger model + codecs
|
||||||
|
- #26 — Implement runtime expression evaluation (blocked by #25)
|
||||||
|
- #27 — Add expression panel to the trigger editors (blocked by #25)
|
||||||
|
- #28 — Validate expression triggers pre-deployment (blocked by #25, #26)
|
||||||
@@ -0,0 +1,220 @@
|
|||||||
|
# Expression Trigger Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Add an "Expression" trigger to template scripts and alarms — a read-only boolean C# expression evaluated on attribute updates that fires the script (edge) or activates the alarm (level).
|
||||||
|
|
||||||
|
**Architecture:** A new restricted read-only globals type (`TriggerExpressionGlobals`) backed by an in-memory attribute snapshot; the expression is compiled once via the existing Roslyn pipeline and cached on `ScriptActor`/`AlarmActor`, which already receive every `AttributeValueChanged`. The CentralUI trigger editors gain an Expression panel. See the approved design: `docs/plans/2026-05-16-expression-trigger-design.md`.
|
||||||
|
|
||||||
|
**Tech Stack:** C#/.NET, Akka.NET (site runtime actors), Roslyn C# scripting, Blazor Server (CentralUI), Docker cluster.
|
||||||
|
|
||||||
|
**Verification note:** This repo has no CentralUI/Commons unit-test project; pure-logic correctness is verified by `dotnet build` + the editor round-trip, and runtime behavior by `bash docker/deploy.sh` + a browser/CLI walkthrough (the established pattern in this codebase). Steps below follow that.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Trigger model + codecs
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/ScadaLink.Commons/Types/Enums/AlarmTriggerType.cs`
|
||||||
|
- Modify: `src/ScadaLink.CentralUI/Components/Shared/ScriptTriggerConfigCodec.cs`
|
||||||
|
- Modify: `src/ScadaLink.CentralUI/Components/Shared/AlarmTriggerConfigCodec.cs`
|
||||||
|
|
||||||
|
**Step 1: Add the `Expression` alarm trigger type.**
|
||||||
|
In `AlarmTriggerType.cs`, add `Expression` as the last enum member (append — do not reorder; the enum is persisted by value):
|
||||||
|
```csharp
|
||||||
|
public enum AlarmTriggerType
|
||||||
|
{
|
||||||
|
ValueMatch,
|
||||||
|
RangeViolation,
|
||||||
|
RateOfChange,
|
||||||
|
HiLo,
|
||||||
|
Expression
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Extend `ScriptTriggerConfigCodec`.**
|
||||||
|
- Add `Expression` to `ScriptTriggerKind` (before `Unknown`).
|
||||||
|
- `ParseKind`: map `"expression"` → `ScriptTriggerKind.Expression`.
|
||||||
|
- `KindToString`: `Expression` → `"Expression"`.
|
||||||
|
- Add `string? Expression` to `ScriptTriggerModel`.
|
||||||
|
- `Parse`: for `Expression`, read `model.Expression = root.TryGetProperty("expression", out var e) ? e.GetString() : null;`
|
||||||
|
- `Serialize`: for `Expression`, write `w.WriteString("expression", model.Expression ?? "");`
|
||||||
|
|
||||||
|
**Step 3: Extend `AlarmTriggerConfigCodec`.**
|
||||||
|
- Add `string? Expression` to `AlarmTriggerModel`.
|
||||||
|
- `Parse`: `case AlarmTriggerType.Expression:` → `model.Expression = TryReadString(root, "expression");`
|
||||||
|
- `Serialize`: `case AlarmTriggerType.Expression:` → `w.WriteString("expression", model.Expression ?? "");`. Note: this codec always writes `attributeName` first, and that key is unused for Expression. Writing it empty is harmless, but the preferred option is to skip the `attributeName` write when `type == Expression`.
|
||||||
|
|
||||||
|
**Step 4: Build.**
|
||||||
|
Run: `dotnet build src/ScadaLink.CentralUI/ScadaLink.CentralUI.csproj -nologo`
|
||||||
|
Expected: `Build succeeded`.
|
||||||
|
|
||||||
|
**Step 5: Commit.**
|
||||||
|
```bash
|
||||||
|
git add src/ScadaLink.Commons/Types/Enums/AlarmTriggerType.cs src/ScadaLink.CentralUI/Components/Shared/ScriptTriggerConfigCodec.cs src/ScadaLink.CentralUI/Components/Shared/AlarmTriggerConfigCodec.cs
|
||||||
|
git commit -m "feat(triggers): add Expression to the script & alarm trigger codecs"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: Runtime expression evaluation
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `src/ScadaLink.SiteRuntime/Scripts/TriggerExpressionGlobals.cs`
|
||||||
|
- Modify: `src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs`
|
||||||
|
- Modify: `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs`
|
||||||
|
- Modify: `src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs`
|
||||||
|
|
||||||
|
**Step 1: Create `TriggerExpressionGlobals`.**
|
||||||
|
A read-only globals type backed by a snapshot dictionary. Exposes only attribute reads — no `Instance`/`Scripts`/`ExternalSystem`/`Database`/`Notify`. Mirror the shape of `ScopeAccessors` but read straight from the dict (no actor Ask). Missing key → `null`.
|
||||||
|
```csharp
|
||||||
|
namespace ScadaLink.SiteRuntime.Scripts;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Read-only globals a trigger expression is compiled against. Exposes only
|
||||||
|
/// attribute reads, backed by an in-memory snapshot — no I/O, no actor Ask.
|
||||||
|
/// </summary>
|
||||||
|
public sealed class TriggerExpressionGlobals
|
||||||
|
{
|
||||||
|
private readonly IReadOnlyDictionary<string, object?> _snapshot;
|
||||||
|
public TriggerExpressionGlobals(IReadOnlyDictionary<string, object?> snapshot) => _snapshot = snapshot;
|
||||||
|
|
||||||
|
public ReadOnlyAttributes Attributes => new(_snapshot, "");
|
||||||
|
public ReadOnlyChildren Children => new(_snapshot);
|
||||||
|
public ReadOnlyComposition? Parent { get; init; } // set by caller for derived/composed scopes; null at root
|
||||||
|
|
||||||
|
public sealed class ReadOnlyAttributes
|
||||||
|
{
|
||||||
|
private readonly IReadOnlyDictionary<string, object?> _s;
|
||||||
|
private readonly string _prefix;
|
||||||
|
public ReadOnlyAttributes(IReadOnlyDictionary<string, object?> s, string prefix) { _s = s; _prefix = prefix; }
|
||||||
|
public object? this[string key] =>
|
||||||
|
_s.TryGetValue(_prefix.Length == 0 ? key : _prefix + "." + key, out var v) ? v : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public sealed class ReadOnlyComposition
|
||||||
|
{
|
||||||
|
private readonly IReadOnlyDictionary<string, object?> _s;
|
||||||
|
private readonly string _path;
|
||||||
|
public ReadOnlyComposition(IReadOnlyDictionary<string, object?> s, string path) { _s = s; _path = path; }
|
||||||
|
public ReadOnlyAttributes Attributes => new(_s, _path);
|
||||||
|
}
|
||||||
|
|
||||||
|
public sealed class ReadOnlyChildren
|
||||||
|
{
|
||||||
|
private readonly IReadOnlyDictionary<string, object?> _s;
|
||||||
|
public ReadOnlyChildren(IReadOnlyDictionary<string, object?> s) => _s = s;
|
||||||
|
public ReadOnlyComposition this[string compositionName] => new(_s, compositionName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
Note: canonical attribute keys are dotted (`TempSensor.Reading`), so the prefix logic above matches `AttributeAccessor.Resolve`. Re-confirm this against `ScopeAccessors.cs` during implementation.
|
||||||
|
|
||||||
|
**Step 2: Add expression compilation to `ScriptCompilationService`.**
|
||||||
|
Add a method that compiles a bare C# boolean expression against `TriggerExpressionGlobals`, reusing the existing `ScriptOptions` (references/imports) and the forbidden-API trust check. The compiled script should be a `Script<object?>` (Roslyn scripting returns the trailing expression's value), surfaced to callers through the `ScriptCompilationResult` return value shown in the signature below.
|
||||||
|
```csharp
|
||||||
|
public ScriptCompilationResult CompileTriggerExpression(string name, string expression)
|
||||||
|
{
|
||||||
|
// same ScriptOptions as Compile(), globalsType: typeof(TriggerExpressionGlobals)
|
||||||
|
// run the same forbidden-API validation
|
||||||
|
}
|
||||||
|
```
|
||||||
|
Read the existing `Compile` (lines ~94-148) and factor the shared option-building + validation rather than duplicating.
|
||||||
|
|
||||||
|
**Step 3: ScriptActor — `ExpressionTriggerConfig` + edge evaluation.**
|
||||||
|
- Add a trigger config record `ExpressionTriggerConfig(string Expression)` alongside `IntervalTriggerConfig`/etc.
|
||||||
|
- `ParseTriggerConfig` (~line 262): add `"expression" => ParseExpressionTrigger(triggerConfigJson)` reading `{ "expression": "..." }`.
|
||||||
|
- On actor start (where the trigger is parsed/registered): if the config is `ExpressionTriggerConfig`, compile via `CompileTriggerExpression`, cache the `Script<object?>`, init `bool _lastExpressionResult = false`.
|
||||||
|
- Maintain `Dictionary<string,object?> _attributeSnapshot` — update it in the `AttributeValueChanged` handler (~lines 148-168) for **every** change, before trigger logic.
|
||||||
|
- In that handler, for `ExpressionTriggerConfig`: build `new TriggerExpressionGlobals(_attributeSnapshot)`, run the cached script (`RunAsync(globals)`), coerce `ReturnValue` to bool; if `result && !_lastExpressionResult` → run the script (same path `Conditional`/`ValueChange` use to spawn `ScriptExecutionActor`); set `_lastExpressionResult = result`.
|
||||||
|
- Wrap the evaluation in try/catch — on throw, treat as `false` and log a site-event-log script error; do not crash.
|
||||||
|
|
||||||
|
**Step 4: AlarmActor — `Expression` eval config + level evaluation.**
|
||||||
|
- `ParseEvalConfig` (~lines 413-484): add `case AlarmTriggerType.Expression:` building an `ExpressionEvalConfig` that holds the compiled `Script<object?>` (compile here via `CompileTriggerExpression`).
|
||||||
|
- `HandleAttributeValueChanged` (~lines 127-189): maintain the same `_attributeSnapshot`; for the `Expression` case (switch ~lines 141-147) evaluate the compiled expression against `TriggerExpressionGlobals` → bool; feed that bool into the existing **binary** Normal↔Active path (the same one `ValueMatch`/`RangeViolation` use — raise on `→Active`, clear on `→Normal`). Not HiLo.
|
||||||
|
- Same try/catch → `false` + log on throw.
|
||||||
|
|
||||||
|
**Step 5: Build.**
|
||||||
|
Run: `dotnet build src/ScadaLink.Host/ScadaLink.Host.csproj -nologo`
|
||||||
|
Expected: `Build succeeded`.
|
||||||
|
|
||||||
|
**Step 6: Commit.**
|
||||||
|
```bash
|
||||||
|
git add src/ScadaLink.SiteRuntime/
|
||||||
|
git commit -m "feat(triggers): runtime expression trigger evaluation for scripts and alarms"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: Trigger editor panels (CentralUI)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/ScadaLink.CentralUI/Components/Shared/ScriptTriggerEditor.razor`
|
||||||
|
- Modify: `src/ScadaLink.CentralUI/Components/Shared/AlarmTriggerEditor.razor`
|
||||||
|
- Reference: `src/ScadaLink.CentralUI/Components/Shared/MonacoEditor.razor`, `src/ScadaLink.CentralUI/Components/Pages/Design/TemplateEdit.razor` (how the script Code editor is fed `SelfAttributes`/`Children`/`Parent`)
|
||||||
|
|
||||||
|
**Step 1: `ScriptTriggerEditor` — Expression panel.**
|
||||||
|
- The codec already has the `Expression` kind (Task 1). Add `<option value="Expression">Expression — run when a boolean expression becomes true</option>` to the type `<select>`.
|
||||||
|
- Add a `case ScriptTriggerKind.Expression:` to the `@switch` rendering `RenderExpression()`.
|
||||||
|
- `RenderExpression()` hosts a compact `MonacoEditor` (`Height="120px"`, `Language="csharp"`, `ScriptKind=Template`) bound to `_model.Expression`; `ValueChanged` → update model + `Emit()`. Feed it the template attribute metadata for completion (see Step 3).
|
||||||
|
- Hint: "Runs once each time this expression becomes true."
|
||||||
|
|
||||||
|
**Step 2: `AlarmTriggerEditor` — Expression panel.**
|
||||||
|
- Add `case AlarmTriggerType.Expression: @RenderExpression(); break;` to the trigger `@switch` (~line 72).
|
||||||
|
- Same compact `MonacoEditor` bound to `_model.Expression`; `Emit()` on change.
|
||||||
|
- Hint: "Alarm is active while this expression is true."
|
||||||
|
|
||||||
|
**Step 3: Feed attribute metadata for completion.**
|
||||||
|
Both editors already receive `AvailableAttributes` (`IReadOnlyList<AlarmAttributeChoice>`). `MonacoEditor` wants `SelfAttributes` (`AttributeShape[]`) / `Children` / `Parent`. Add a small mapper from `AlarmAttributeChoice` → the Monaco metadata (Direct/Inherited → `SelfAttributes`; Composed → `Children` contexts). Keep it minimal — at least pass `SelfAttributes` so `Attributes["..."]` completion works.
|
||||||
|
|
||||||
|
**Step 4: Build.**
|
||||||
|
Run: `dotnet build src/ScadaLink.CentralUI/ScadaLink.CentralUI.csproj -nologo`
|
||||||
|
Expected: `Build succeeded`.
|
||||||
|
|
||||||
|
**Step 5: Commit.**
|
||||||
|
```bash
|
||||||
|
git add src/ScadaLink.CentralUI/Components/Shared/
|
||||||
|
git commit -m "feat(ui/triggers): expression trigger panel in the script & alarm editors"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: Pre-deployment validation
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/ScadaLink.TemplateEngine/.../ValidationService.cs` (the file with `ValidateScriptTriggerReferences` / `ExtractAttributeNameFromTriggerConfig`)
|
||||||
|
|
||||||
|
**Step 1: Compile-check expression triggers.**
|
||||||
|
In the validation pass, for any script/alarm whose trigger type is `Expression`, extract `expression` from `TriggerConfiguration` and compile-check it. Confirm the reference graph during execution: if the TemplateEngine project can reach the SiteRuntime compiler, reuse `CompileTriggerExpression`; if it cannot, do a Roslyn syntax/compile check using the same approach, or at minimum surface a clear "expression empty / invalid" check.
|
||||||
|
|
||||||
|
**Step 2: Flag unknown attribute references (best-effort).**
|
||||||
|
Expression text references `Attributes["X"]`; extend the existing attribute-reference validation to scan the expression for `Attributes["..."]` literals and flag keys absent from the flattened config — mirroring `ExtractAttributeNameFromTriggerConfig` for the structured triggers.
|
||||||
|
|
||||||
|
**Step 3: Build + commit.**
|
||||||
|
```bash
|
||||||
|
dotnet build src/ScadaLink.Host/ScadaLink.Host.csproj -nologo
|
||||||
|
git add src/ScadaLink.TemplateEngine/
|
||||||
|
git commit -m "feat(triggers): validate expression triggers pre-deployment"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: Build, deploy, verify
|
||||||
|
|
||||||
|
**Step 1:** `bash docker/deploy.sh` and wait for `http://localhost:9000/health/ready`.
|
||||||
|
|
||||||
|
**Step 2 — UI:** Log in (`multi-role`/`password`), open a template → Scripts → Add Script. Select trigger type **Expression**; confirm the Monaco expression box renders with attribute completion. Save `Attributes["TestDouble"] > 50` → reopen → confirm round-trip. Repeat on the alarm editor.
|
||||||
|
|
||||||
|
**Step 3 — runtime (script, edge):** Deploy an instance; set an attribute so the expression is false, then true → confirm the script runs once on the transition and does **not** re-run while it stays true; flip false then true again → runs again.
|
||||||
|
|
||||||
|
**Step 4 — runtime (alarm, level):** Expression-triggered alarm raises when the expression becomes true and clears when it becomes false (check the alarm state / Debug View).
|
||||||
|
|
||||||
|
**Step 5:** `git push`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes for the executor
|
||||||
|
- Append the `AlarmTriggerType.Expression` enum member **last** — the enum is persisted by integer value.
|
||||||
|
- The trigger expression is a *bare expression* (no `return`) — Roslyn scripting returns the trailing expression's value.
|
||||||
|
- Keep the evaluation try/catch tight; a throwing expression must never crash `ScriptActor`/`AlarmActor`.
|
||||||
|
- `_attributeSnapshot` must be updated for **every** `AttributeValueChanged`, not just attributes the expression names.
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"planPath": "docs/plans/2026-05-16-expression-trigger.md",
|
||||||
|
"tasks": [
|
||||||
|
{"id": 25, "subject": "Task 1: Trigger model + codecs", "status": "pending"},
|
||||||
|
{"id": 26, "subject": "Task 2: Runtime expression evaluation", "status": "pending", "blockedBy": [25]},
|
||||||
|
{"id": 27, "subject": "Task 3: Trigger editor panels", "status": "pending", "blockedBy": [25]},
|
||||||
|
{"id": 28, "subject": "Task 4: Pre-deployment validation", "status": "pending", "blockedBy": [25, 26]},
|
||||||
|
{"id": 29, "subject": "Task 5: Build, deploy, verify", "status": "pending", "blockedBy": [25, 26, 27, 28]}
|
||||||
|
],
|
||||||
|
"lastUpdated": "2026-05-16"
|
||||||
|
}
|
||||||
@@ -15,8 +15,8 @@ internal static class CommandHelpers
|
|||||||
Option<string> passwordOption,
|
Option<string> passwordOption,
|
||||||
object command)
|
object command)
|
||||||
{
|
{
|
||||||
var format = result.GetValue(formatOption) ?? "json";
|
|
||||||
var config = CliConfig.Load();
|
var config = CliConfig.Load();
|
||||||
|
var format = ResolveFormat(result, formatOption, config);
|
||||||
|
|
||||||
// Resolve management URL
|
// Resolve management URL
|
||||||
var url = result.GetValue(urlOption);
|
var url = result.GetValue(urlOption);
|
||||||
@@ -53,6 +53,27 @@ internal static class CommandHelpers
|
|||||||
return HandleResponse(response, format);
|
return HandleResponse(response, format);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Resolves the output format using the documented precedence chain:
|
||||||
|
/// an explicitly supplied <c>--format</c> option wins, otherwise the
|
||||||
|
/// config-file / environment-variable default (<see cref="CliConfig.DefaultFormat"/>)
|
||||||
|
/// is used, otherwise <c>json</c>. The <c>--format</c> option must not declare a
|
||||||
|
/// <c>DefaultValueFactory</c> — that would mask whether the flag was supplied.
|
||||||
|
/// </summary>
|
||||||
|
internal static string ResolveFormat(ParseResult result, Option<string> formatOption, CliConfig config)
{
    // A non-null option result means --format was actually typed on the command
    // line; only then is the parsed value consulted.
    var flagSupplied = result.GetResult(formatOption) != null;
    var fromCommandLine = flagSupplied ? result.GetValue(formatOption) : null;

    // 1) An explicit, non-blank --format always wins.
    if (!string.IsNullOrWhiteSpace(fromCommandLine))
    {
        return fromCommandLine;
    }

    // 2) Otherwise use the configured default when it is non-blank.
    if (!string.IsNullOrWhiteSpace(config.DefaultFormat))
    {
        return config.DefaultFormat;
    }

    // 3) Final fallback.
    return "json";
}
|
||||||
|
|
||||||
internal static int HandleResponse(ManagementResponse response, string format)
|
internal static int HandleResponse(ManagementResponse response, string format)
|
||||||
{
|
{
|
||||||
if (response.JsonData != null)
|
if (response.JsonData != null)
|
||||||
|
|||||||
@@ -42,8 +42,8 @@ public static class DebugCommands
|
|||||||
cmd.SetAction(async (ParseResult result) =>
|
cmd.SetAction(async (ParseResult result) =>
|
||||||
{
|
{
|
||||||
var instanceId = result.GetValue(idOption);
|
var instanceId = result.GetValue(idOption);
|
||||||
var format = result.GetValue(formatOption) ?? "json";
|
|
||||||
var config = CliConfig.Load();
|
var config = CliConfig.Load();
|
||||||
|
var format = CommandHelpers.ResolveFormat(result, formatOption, config);
|
||||||
|
|
||||||
var url = result.GetValue(urlOption);
|
var url = result.GetValue(urlOption);
|
||||||
if (string.IsNullOrWhiteSpace(url))
|
if (string.IsNullOrWhiteSpace(url))
|
||||||
|
|||||||
@@ -7,8 +7,9 @@ var rootCommand = new RootCommand("ScadaLink CLI — manage the ScadaLink SCADA
|
|||||||
var urlOption = new Option<string>("--url") { Description = "Management API URL", Recursive = true };
|
var urlOption = new Option<string>("--url") { Description = "Management API URL", Recursive = true };
|
||||||
var usernameOption = new Option<string>("--username") { Description = "LDAP username", Recursive = true };
|
var usernameOption = new Option<string>("--username") { Description = "LDAP username", Recursive = true };
|
||||||
var passwordOption = new Option<string>("--password") { Description = "LDAP password", Recursive = true };
|
var passwordOption = new Option<string>("--password") { Description = "LDAP password", Recursive = true };
|
||||||
|
// No DefaultValueFactory: format precedence (explicit --format -> config/env -> "json")
|
||||||
|
// is resolved by CommandHelpers.ResolveFormat, which needs to distinguish an absent flag.
|
||||||
var formatOption = new Option<string>("--format") { Description = "Output format (json or table)", Recursive = true };
|
var formatOption = new Option<string>("--format") { Description = "Output format (json or table)", Recursive = true };
|
||||||
formatOption.DefaultValueFactory = _ => "json";
|
|
||||||
|
|
||||||
rootCommand.Add(urlOption);
|
rootCommand.Add(urlOption);
|
||||||
rootCommand.Add(usernameOption);
|
rootCommand.Add(usernameOption);
|
||||||
|
|||||||
@@ -11,8 +11,8 @@
|
|||||||
<InternalsVisibleTo Include="ScadaLink.CLI.Tests" />
|
<InternalsVisibleTo Include="ScadaLink.CLI.Tests" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.AspNetCore.SignalR.Client" Version="9.0.3" />
|
<PackageReference Include="Microsoft.AspNetCore.SignalR.Client" />
|
||||||
<PackageReference Include="System.CommandLine" Version="2.0.5" />
|
<PackageReference Include="System.CommandLine" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
||||||
|
|||||||
@@ -7,23 +7,37 @@ namespace ScadaLink.CentralUI.Auth;
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Bridges ASP.NET Core cookie authentication with Blazor Server's auth state.
|
/// Bridges ASP.NET Core cookie authentication with Blazor Server's auth state.
|
||||||
/// The cookie middleware has already validated and decrypted the cookie by the time
|
/// <para>
|
||||||
/// the Blazor circuit is established, so we just read HttpContext.User.
|
/// The cookie middleware validates and decrypts the cookie during the initial
|
||||||
|
/// HTTP request that establishes the Blazor circuit. This provider is registered
|
||||||
|
/// <c>Scoped</c>, so it is constructed within that request's DI scope while
|
||||||
|
/// <see cref="IHttpContextAccessor.HttpContext"/> is still valid. We snapshot
|
||||||
|
/// the authenticated principal <b>once</b> in the constructor and serve that
|
||||||
|
/// snapshot for the lifetime of the circuit.
|
||||||
|
/// </para>
|
||||||
|
/// <para>
|
||||||
|
/// We must NOT read <see cref="IHttpContextAccessor"/> on every
|
||||||
|
/// <see cref="GetAuthenticationStateAsync"/> call (CentralUI-004): for the
|
||||||
|
/// lifetime of a long-lived SignalR circuit <c>HttpContext</c> is <c>null</c>
|
||||||
|
/// (or, worse, a stale/foreign context), so a later re-evaluation —
|
||||||
|
/// e.g. <c>&lt;AuthorizeView&gt;</c> re-rendering — would otherwise see an
|
||||||
|
/// unauthenticated principal and render the wrong UI.
|
||||||
|
/// </para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class CookieAuthenticationStateProvider : ServerAuthenticationStateProvider
|
public class CookieAuthenticationStateProvider : ServerAuthenticationStateProvider
|
||||||
{
|
{
|
||||||
private readonly IHttpContextAccessor _httpContextAccessor;
|
private readonly Task<AuthenticationState> _circuitAuthState;
|
||||||
|
|
||||||
public CookieAuthenticationStateProvider(IHttpContextAccessor httpContextAccessor)
|
public CookieAuthenticationStateProvider(IHttpContextAccessor httpContextAccessor)
|
||||||
{
|
{
|
||||||
_httpContextAccessor = httpContextAccessor;
|
// Snapshot the principal at circuit-construction time. HttpContext is
|
||||||
|
// valid here (initial HTTP request) and will not be afterwards.
|
||||||
|
var user = httpContextAccessor.HttpContext?.User
|
||||||
|
?? new ClaimsPrincipal(new ClaimsIdentity());
|
||||||
|
|
||||||
|
_circuitAuthState = Task.FromResult(new AuthenticationState(user));
|
||||||
}
|
}
|
||||||
|
|
||||||
public override Task<AuthenticationState> GetAuthenticationStateAsync()
|
public override Task<AuthenticationState> GetAuthenticationStateAsync()
|
||||||
{
|
=> _circuitAuthState;
|
||||||
var user = _httpContextAccessor.HttpContext?.User
|
|
||||||
?? new ClaimsPrincipal(new ClaimsIdentity());
|
|
||||||
|
|
||||||
return Task.FromResult(new AuthenticationState(user));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,93 @@
|
|||||||
|
using Microsoft.AspNetCore.Components.Authorization;
|
||||||
|
using ScadaLink.Commons.Entities.Sites;
|
||||||
|
using ScadaLink.Security;
|
||||||
|
|
||||||
|
namespace ScadaLink.CentralUI.Auth;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Resolves the set of sites the current user is permitted to operate on, from
|
||||||
|
/// the <c>SiteId</c> claims attached at login (CentralUI-002).
|
||||||
|
/// <para>
|
||||||
|
/// The design (Component-CentralUI, CLAUDE.md "Security &amp; Auth") makes the
|
||||||
|
/// Deployment role site-scoped: a Deployment user mapped through an LDAP group
|
||||||
|
/// with site-scope rules carries one <see cref="JwtTokenService.SiteIdClaimType"/>
|
||||||
|
/// claim per permitted site (the claim value is the integer <c>Site.Id</c>).
|
||||||
|
/// A Deployment user with no <c>SiteId</c> claim — and any Admin/Design user — is
|
||||||
|
/// system-wide.
|
||||||
|
/// </para>
|
||||||
|
/// <para>
|
||||||
|
/// Deployment and Monitoring pages must filter every site/instance list through
|
||||||
|
/// <see cref="FilterSitesAsync"/> and re-check <see cref="IsSiteAllowedAsync"/>
|
||||||
|
/// before any cross-site command, so a scoped user cannot view or act on sites
|
||||||
|
/// outside their grant.
|
||||||
|
/// </para>
|
||||||
|
/// </summary>
|
||||||
|
public sealed class SiteScopeService
{
    private readonly AuthenticationStateProvider _authStateProvider;

    // Resolved lazily on first use and then reused for the lifetime of this
    // service instance, so the claims are parsed at most once per instance.
    private (bool IsSystemWide, IReadOnlySet<int> Sites)? _cached;

    /// <summary>
    /// Creates the service over the provider that supplies the current user's
    /// authentication state (and therefore their <c>SiteId</c> claims).
    /// </summary>
    /// <param name="authStateProvider">Source of the current user's principal.</param>
    public SiteScopeService(AuthenticationStateProvider authStateProvider)
    {
        _authStateProvider = authStateProvider;
    }

    /// <summary>
    /// True when the user is not restricted to a site subset, i.e. the principal
    /// carries no <c>SiteId</c> claims at all. System-wide users see and act on
    /// every site.
    /// </summary>
    /// <returns><c>true</c> for an unrestricted user; otherwise <c>false</c>.</returns>
    public async Task<bool> IsSystemWideAsync()
        => (await ResolveAsync()).IsSystemWide;

    /// <summary>
    /// The set of <c>Site.Id</c> values the user may operate on. Empty for a
    /// system-wide user (callers should consult <see cref="IsSystemWideAsync"/>
    /// or use the filter/allowed helpers, which already account for that), and
    /// also empty for a scoped user none of whose <c>SiteId</c> claims could be
    /// parsed as an integer.
    /// </summary>
    /// <returns>The parsed, de-duplicated permitted site ids.</returns>
    public async Task<IReadOnlySet<int>> PermittedSiteIdsAsync()
        => (await ResolveAsync()).Sites;

    /// <summary>
    /// Returns the subset of <paramref name="sites"/> the user is permitted to
    /// see. A system-wide user gets every site back, in the original order.
    /// </summary>
    /// <param name="sites">Candidate sites to filter.</param>
    /// <returns>A new list containing only the permitted sites.</returns>
    public async Task<List<Site>> FilterSitesAsync(IEnumerable<Site> sites)
    {
        var (isSystemWide, allowed) = await ResolveAsync();
        if (isSystemWide)
            return sites.ToList();

        return sites.Where(s => allowed.Contains(s.Id)).ToList();
    }

    /// <summary>
    /// True when the user may operate on the site with the given <c>Site.Id</c>.
    /// Must be re-checked server-side before any mutating cross-site command.
    /// </summary>
    /// <param name="siteId">The <c>Site.Id</c> to check.</param>
    /// <returns><c>true</c> when the user is system-wide or the id is in their grant.</returns>
    public async Task<bool> IsSiteAllowedAsync(int siteId)
    {
        var (isSystemWide, allowed) = await ResolveAsync();
        return isSystemWide || allowed.Contains(siteId);
    }

    /// <summary>
    /// Reads the <c>SiteId</c> claims from the current principal and computes the
    /// user's scope, caching the result for subsequent calls.
    /// </summary>
    private async Task<(bool IsSystemWide, IReadOnlySet<int> Sites)> ResolveAsync()
    {
        if (_cached is { } cached)
            return cached;

        var state = await _authStateProvider.GetAuthenticationStateAsync();

        var ids = new HashSet<int>();
        var hasSiteClaims = false;
        foreach (var claim in state.User.FindAll(JwtTokenService.SiteIdClaimType))
        {
            hasSiteClaims = true;
            if (int.TryParse(claim.Value, out var id))
                ids.Add(id);
        }

        // No SiteId claims at all => system-wide (absence of scope rules means an
        // unrestricted deployer). The decision is keyed on the presence of claims,
        // not on how many parsed: a user whose SiteId claims are all malformed is
        // scoped to *nothing* (fail closed) rather than silently promoted to
        // system-wide.
        // NOTE(review): confirm SiteScopeAuthorizationHandler treats malformed
        // SiteId claims the same way so the two checks stay aligned.
        var result = (IsSystemWide: !hasSiteClaims, Sites: (IReadOnlySet<int>)ids);
        _cached = result;
        return result;
    }
}
|
||||||
@@ -11,6 +11,7 @@
|
|||||||
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
||||||
@inject ITemplateEngineRepository TemplateEngineRepository
|
@inject ITemplateEngineRepository TemplateEngineRepository
|
||||||
@inject ISiteRepository SiteRepository
|
@inject ISiteRepository SiteRepository
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@inject DebugStreamService DebugStreamService
|
@inject DebugStreamService DebugStreamService
|
||||||
@inject IJSRuntime JS
|
@inject IJSRuntime JS
|
||||||
@implements IDisposable
|
@implements IDisposable
|
||||||
@@ -296,7 +297,9 @@
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
|
// Site scoping (CentralUI-002): a scoped Deployment user may only
|
||||||
|
// debug sites they are permitted on.
|
||||||
|
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
@@ -358,6 +361,14 @@
|
|||||||
_siteInstances.Clear();
|
_siteInstances.Clear();
|
||||||
_selectedInstanceId = 0;
|
_selectedInstanceId = 0;
|
||||||
if (_selectedSiteId == 0) return;
|
if (_selectedSiteId == 0) return;
|
||||||
|
// Site scoping (CentralUI-002): re-check the claim server-side — a query
|
||||||
|
// string or stale localStorage value could name a site outside the grant.
|
||||||
|
if (!await SiteScope.IsSiteAllowedAsync(_selectedSiteId))
|
||||||
|
{
|
||||||
|
_selectedSiteId = 0;
|
||||||
|
_toast.ShowError("You are not permitted to debug instances on that site.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
_siteInstances = (await TemplateEngineRepository.GetInstancesBySiteIdAsync(_selectedSiteId))
|
_siteInstances = (await TemplateEngineRepository.GetInstancesBySiteIdAsync(_selectedSiteId))
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
||||||
@inject IDeploymentManagerRepository DeploymentManagerRepository
|
@inject IDeploymentManagerRepository DeploymentManagerRepository
|
||||||
@inject ITemplateEngineRepository TemplateEngineRepository
|
@inject ITemplateEngineRepository TemplateEngineRepository
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@implements IDisposable
|
@implements IDisposable
|
||||||
|
|
||||||
<div class="container-fluid mt-3">
|
<div class="container-fluid mt-3">
|
||||||
@@ -245,13 +246,23 @@
|
|||||||
_errorMessage = null;
|
_errorMessage = null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
_records = (await DeploymentManagerRepository.GetAllDeploymentRecordsAsync())
|
// Build instance lookups first — site scoping (CentralUI-002) filters
|
||||||
.OrderByDescending(r => r.DeployedAt)
|
// deployment records by the site of their instance.
|
||||||
.ToList();
|
|
||||||
|
|
||||||
// Build instance name lookup
|
|
||||||
var instances = await TemplateEngineRepository.GetAllInstancesAsync();
|
var instances = await TemplateEngineRepository.GetAllInstancesAsync();
|
||||||
_instanceNames = instances.ToDictionary(i => i.Id, i => i.UniqueName);
|
_instanceNames = instances.ToDictionary(i => i.Id, i => i.UniqueName);
|
||||||
|
var instanceSiteIds = instances.ToDictionary(i => i.Id, i => i.SiteId);
|
||||||
|
|
||||||
|
var systemWide = await SiteScope.IsSystemWideAsync();
|
||||||
|
var permittedSiteIds = systemWide
|
||||||
|
? null
|
||||||
|
: await SiteScope.PermittedSiteIdsAsync();
|
||||||
|
|
||||||
|
_records = (await DeploymentManagerRepository.GetAllDeploymentRecordsAsync())
|
||||||
|
.Where(r => permittedSiteIds == null
|
||||||
|
|| (instanceSiteIds.TryGetValue(r.InstanceId, out var sid)
|
||||||
|
&& permittedSiteIds.Contains(sid)))
|
||||||
|
.OrderByDescending(r => r.DeployedAt)
|
||||||
|
.ToList();
|
||||||
|
|
||||||
_totalPages = Math.Max(1, (int)Math.Ceiling(_records.Count / (double)PageSize));
|
_totalPages = Math.Max(1, (int)Math.Ceiling(_records.Count / (double)PageSize));
|
||||||
if (_currentPage > _totalPages) _currentPage = 1;
|
if (_currentPage > _totalPages) _currentPage = 1;
|
||||||
|
|||||||
@@ -11,6 +11,7 @@
|
|||||||
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
||||||
@inject ITemplateEngineRepository TemplateEngineRepository
|
@inject ITemplateEngineRepository TemplateEngineRepository
|
||||||
@inject ISiteRepository SiteRepository
|
@inject ISiteRepository SiteRepository
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@inject InstanceService InstanceService
|
@inject InstanceService InstanceService
|
||||||
@inject IFlatteningPipeline FlatteningPipeline
|
@inject IFlatteningPipeline FlatteningPipeline
|
||||||
@inject AuthenticationStateProvider AuthStateProvider
|
@inject AuthenticationStateProvider AuthStateProvider
|
||||||
@@ -377,6 +378,17 @@
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Site scoping (CentralUI-002): a scoped Deployment user must not be
|
||||||
|
// able to configure or deploy an instance on a site outside their
|
||||||
|
// grant by navigating straight to its URL.
|
||||||
|
if (!await SiteScope.IsSiteAllowedAsync(_instance.SiteId))
|
||||||
|
{
|
||||||
|
_instance = null;
|
||||||
|
_errorMessage = "You are not permitted to manage instances on this site.";
|
||||||
|
_loading = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Identity
|
// Identity
|
||||||
var template = await TemplateEngineRepository.GetTemplateByIdAsync(_instance.TemplateId);
|
var template = await TemplateEngineRepository.GetTemplateByIdAsync(_instance.TemplateId);
|
||||||
_templateName = template?.Name ?? $"#{_instance.TemplateId}";
|
_templateName = template?.Name ?? $"#{_instance.TemplateId}";
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
|
||||||
@inject ITemplateEngineRepository TemplateEngineRepository
|
@inject ITemplateEngineRepository TemplateEngineRepository
|
||||||
@inject ISiteRepository SiteRepository
|
@inject ISiteRepository SiteRepository
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@inject InstanceService InstanceService
|
@inject InstanceService InstanceService
|
||||||
@inject AuthenticationStateProvider AuthStateProvider
|
@inject AuthenticationStateProvider AuthStateProvider
|
||||||
@inject NavigationManager NavigationManager
|
@inject NavigationManager NavigationManager
|
||||||
@@ -93,7 +94,9 @@
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
_templates = (await TemplateEngineRepository.GetAllTemplatesAsync()).ToList();
|
_templates = (await TemplateEngineRepository.GetAllTemplatesAsync()).ToList();
|
||||||
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
|
// Site scoping (CentralUI-002): a scoped Deployment user may only
|
||||||
|
// create instances on sites they are permitted on.
|
||||||
|
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
|
||||||
|
|
||||||
_allAreas.Clear();
|
_allAreas.Clear();
|
||||||
foreach (var site in _sites)
|
foreach (var site in _sites)
|
||||||
@@ -124,6 +127,13 @@
|
|||||||
if (string.IsNullOrWhiteSpace(_createName)) { _formError = "Instance name is required."; return; }
|
if (string.IsNullOrWhiteSpace(_createName)) { _formError = "Instance name is required."; return; }
|
||||||
if (_createTemplateId == 0) { _formError = "Select a template."; return; }
|
if (_createTemplateId == 0) { _formError = "Select a template."; return; }
|
||||||
if (_createSiteId == 0) { _formError = "Select a site."; return; }
|
if (_createSiteId == 0) { _formError = "Select a site."; return; }
|
||||||
|
// Site scoping (CentralUI-002): re-check server-side before the mutating
|
||||||
|
// command, independent of what the site dropdown was populated with.
|
||||||
|
if (!await SiteScope.IsSiteAllowedAsync(_createSiteId))
|
||||||
|
{
|
||||||
|
_formError = "You are not permitted to create instances on the selected site.";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
@inject AreaService AreaService
|
@inject AreaService AreaService
|
||||||
@inject InstanceService InstanceService
|
@inject InstanceService InstanceService
|
||||||
@inject AuthenticationStateProvider AuthStateProvider
|
@inject AuthenticationStateProvider AuthStateProvider
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@inject NavigationManager NavigationManager
|
@inject NavigationManager NavigationManager
|
||||||
@inject IJSRuntime JSRuntime
|
@inject IJSRuntime JSRuntime
|
||||||
@inject IDialogService Dialog
|
@inject IDialogService Dialog
|
||||||
@@ -225,8 +226,13 @@
|
|||||||
_errorMessage = null;
|
_errorMessage = null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
_allInstances = (await TemplateEngineRepository.GetAllInstancesAsync()).ToList();
|
// Site scoping (CentralUI-002): a scoped Deployment user only sees the
|
||||||
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
|
// sites — and therefore the areas/instances — they are permitted on.
|
||||||
|
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
|
||||||
|
var permittedSiteIds = _sites.Select(s => s.Id).ToHashSet();
|
||||||
|
_allInstances = (await TemplateEngineRepository.GetAllInstancesAsync())
|
||||||
|
.Where(i => permittedSiteIds.Contains(i.SiteId))
|
||||||
|
.ToList();
|
||||||
_templates = (await TemplateEngineRepository.GetAllTemplatesAsync()).ToList();
|
_templates = (await TemplateEngineRepository.GetAllTemplatesAsync()).ToList();
|
||||||
|
|
||||||
_allAreas.Clear();
|
_allAreas.Clear();
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
@using ScadaLink.Commons.Messages.RemoteQuery
|
@using ScadaLink.Commons.Messages.RemoteQuery
|
||||||
@using ScadaLink.Communication
|
@using ScadaLink.Communication
|
||||||
@inject ISiteRepository SiteRepository
|
@inject ISiteRepository SiteRepository
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@inject CommunicationService CommunicationService
|
@inject CommunicationService CommunicationService
|
||||||
|
|
||||||
<div class="container-fluid mt-3">
|
<div class="container-fluid mt-3">
|
||||||
@@ -212,9 +213,16 @@
|
|||||||
|
|
||||||
protected override async Task OnInitializedAsync()
|
protected override async Task OnInitializedAsync()
|
||||||
{
|
{
|
||||||
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
|
// Site scoping (CentralUI-002): a scoped Deployment user may only query
|
||||||
|
// event logs for the sites they are permitted on.
|
||||||
|
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// _sites is already filtered, so membership IS the scope check.
|
||||||
|
// True only when a site is selected and its SiteIdentifier is present in _sites.
private bool SelectedSiteIsPermitted
{
    get
    {
        // No selection: there is nothing to permit.
        if (string.IsNullOrEmpty(_selectedSiteId))
        {
            return false;
        }

        return _sites.Any(site => site.SiteIdentifier == _selectedSiteId);
    }
}
|
||||||
|
|
||||||
private async Task Search()
|
private async Task Search()
|
||||||
{
|
{
|
||||||
_entries = new();
|
_entries = new();
|
||||||
@@ -237,6 +245,14 @@
|
|||||||
{
|
{
|
||||||
_searching = true;
|
_searching = true;
|
||||||
_errorMessage = null;
|
_errorMessage = null;
|
||||||
|
// Site scoping (CentralUI-002): re-check before querying — the dropdown is
|
||||||
|
// filtered, but the selection must not be trusted on its own.
|
||||||
|
if (!SelectedSiteIsPermitted)
|
||||||
|
{
|
||||||
|
_errorMessage = "You are not permitted to view event logs for that site.";
|
||||||
|
_searching = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var request = new EventLogQueryRequest(
|
var request = new EventLogQueryRequest(
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
@using ScadaLink.Commons.Types.Enums
|
@using ScadaLink.Commons.Types.Enums
|
||||||
@using ScadaLink.Communication
|
@using ScadaLink.Communication
|
||||||
@inject ISiteRepository SiteRepository
|
@inject ISiteRepository SiteRepository
|
||||||
|
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
|
||||||
@inject CommunicationService CommunicationService
|
@inject CommunicationService CommunicationService
|
||||||
@inject IJSRuntime JS
|
@inject IJSRuntime JS
|
||||||
@inject IDialogService Dialog
|
@inject IDialogService Dialog
|
||||||
@@ -360,9 +361,17 @@
|
|||||||
|
|
||||||
protected override async Task OnInitializedAsync()
|
protected override async Task OnInitializedAsync()
|
||||||
{
|
{
|
||||||
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
|
// Site scoping (CentralUI-002): a scoped Deployment user may only inspect
|
||||||
|
// and act on parked messages for the sites they are permitted on.
|
||||||
|
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True only when the currently selected SiteIdentifier is one this user is
|
||||||
|
// permitted on. _sites is already filtered, so membership IS the scope check.
|
||||||
|
private bool SelectedSiteIsPermitted
{
    get
    {
        // An empty selection can never be a permitted site.
        if (string.IsNullOrEmpty(_selectedSiteId))
        {
            return false;
        }

        // Membership test against the site list held by this component.
        return _sites.Any(site => site.SiteIdentifier == _selectedSiteId);
    }
}
|
||||||
|
|
||||||
private async Task OnSiteChanged(ChangeEventArgs e)
|
private async Task OnSiteChanged(ChangeEventArgs e)
|
||||||
{
|
{
|
||||||
_selectedSiteId = e.Value?.ToString() ?? string.Empty;
|
_selectedSiteId = e.Value?.ToString() ?? string.Empty;
|
||||||
@@ -393,6 +402,15 @@
|
|||||||
{
|
{
|
||||||
_searching = true;
|
_searching = true;
|
||||||
_errorMessage = null;
|
_errorMessage = null;
|
||||||
|
// Site scoping (CentralUI-002): re-check before querying — the dropdown is
|
||||||
|
// filtered, but the selection must not be trusted on its own.
|
||||||
|
if (!SelectedSiteIsPermitted)
|
||||||
|
{
|
||||||
|
_errorMessage = "You are not permitted to view parked messages for that site.";
|
||||||
|
_messages = null;
|
||||||
|
_searching = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var request = new ParkedMessageQueryRequest(
|
var request = new ParkedMessageQueryRequest(
|
||||||
@@ -557,6 +575,7 @@
|
|||||||
{
|
{
|
||||||
var ids = _selectedIds.ToList();
|
var ids = _selectedIds.ToList();
|
||||||
if (ids.Count == 0) return;
|
if (ids.Count == 0) return;
|
||||||
|
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return; }
|
||||||
|
|
||||||
var confirmed = await Dialog.ConfirmAsync(
|
var confirmed = await Dialog.ConfirmAsync(
|
||||||
"Retry parked messages",
|
"Retry parked messages",
|
||||||
@@ -587,6 +606,7 @@
|
|||||||
{
|
{
|
||||||
var ids = _selectedIds.ToList();
|
var ids = _selectedIds.ToList();
|
||||||
if (ids.Count == 0) return;
|
if (ids.Count == 0) return;
|
||||||
|
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return; }
|
||||||
|
|
||||||
var confirmed = await Dialog.ConfirmAsync(
|
var confirmed = await Dialog.ConfirmAsync(
|
||||||
"Discard parked messages",
|
"Discard parked messages",
|
||||||
@@ -618,6 +638,7 @@
|
|||||||
|
|
||||||
private async Task RetrySingle(ParkedMessageEntry msg)
|
private async Task RetrySingle(ParkedMessageEntry msg)
|
||||||
{
|
{
|
||||||
|
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return; }
|
||||||
_actionInProgress = true;
|
_actionInProgress = true;
|
||||||
_activeAction = "Retry";
|
_activeAction = "Retry";
|
||||||
try
|
try
|
||||||
@@ -638,6 +659,7 @@
|
|||||||
|
|
||||||
private async Task<bool> DiscardSingle(ParkedMessageEntry msg)
|
private async Task<bool> DiscardSingle(ParkedMessageEntry msg)
|
||||||
{
|
{
|
||||||
|
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return false; }
|
||||||
var confirmed = await Dialog.ConfirmAsync(
|
var confirmed = await Dialog.ConfirmAsync(
|
||||||
"Discard parked message",
|
"Discard parked message",
|
||||||
$"Permanently discard message {ShortId(msg.MessageId)}? This cannot be undone.",
|
$"Permanently discard message {ShortId(msg.MessageId)}? This cannot be undone.",
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ namespace ScadaLink.CentralUI.Components.Shared;
|
|||||||
/// RateOfChange { attributeName, thresholdPerSecond, windowSeconds, direction }
|
/// RateOfChange { attributeName, thresholdPerSecond, windowSeconds, direction }
|
||||||
/// HiLo { attributeName, loLo, lo, hi, hiHi,
|
/// HiLo { attributeName, loLo, lo, hi, hiHi,
|
||||||
/// loLoPriority, loPriority, hiPriority, hiHiPriority }
|
/// loLoPriority, loPriority, hiPriority, hiHiPriority }
|
||||||
|
/// Expression { expression }
|
||||||
///
|
///
|
||||||
/// All HiLo setpoints and per-setpoint priorities are optional — any subset
|
/// All HiLo setpoints and per-setpoint priorities are optional — any subset
|
||||||
/// is valid (e.g., only Hi/HiHi configured for over-temperature protection).
|
/// is valid (e.g., only Hi/HiHi configured for over-temperature protection).
|
||||||
@@ -93,6 +94,10 @@ internal static class AlarmTriggerConfigCodec
|
|||||||
model.HiMessage = TryReadString(root, "hiMessage");
|
model.HiMessage = TryReadString(root, "hiMessage");
|
||||||
model.HiHiMessage = TryReadString(root, "hiHiMessage");
|
model.HiHiMessage = TryReadString(root, "hiHiMessage");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case AlarmTriggerType.Expression:
|
||||||
|
model.Expression = TryReadString(root, "expression");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (JsonException)
|
catch (JsonException)
|
||||||
@@ -105,8 +110,10 @@ internal static class AlarmTriggerConfigCodec
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Serializes the model to the JSON shape AlarmActor.ParseEvalConfig
|
/// Serializes the model to the JSON shape AlarmActor.ParseEvalConfig
|
||||||
/// expects. Always writes <c>attributeName</c> (canonical key) and only
|
/// expects. Writes <c>attributeName</c> (canonical key) for the
|
||||||
/// the keys relevant to the current trigger type.
|
/// attribute-bound trigger types and only the keys relevant to the
|
||||||
|
/// current trigger type. <c>Expression</c> is not bound to a single
|
||||||
|
/// attribute, so <c>attributeName</c> is omitted for it.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal static string Serialize(AlarmTriggerModel model, AlarmTriggerType type)
|
internal static string Serialize(AlarmTriggerModel model, AlarmTriggerType type)
|
||||||
{
|
{
|
||||||
@@ -114,7 +121,8 @@ internal static class AlarmTriggerConfigCodec
|
|||||||
using (var w = new Utf8JsonWriter(stream))
|
using (var w = new Utf8JsonWriter(stream))
|
||||||
{
|
{
|
||||||
w.WriteStartObject();
|
w.WriteStartObject();
|
||||||
w.WriteString("attributeName", model.AttributeName ?? "");
|
if (type != AlarmTriggerType.Expression)
|
||||||
|
w.WriteString("attributeName", model.AttributeName ?? "");
|
||||||
|
|
||||||
switch (type)
|
switch (type)
|
||||||
{
|
{
|
||||||
@@ -155,6 +163,10 @@ internal static class AlarmTriggerConfigCodec
|
|||||||
if (!string.IsNullOrEmpty(model.HiMessage)) w.WriteString("hiMessage", model.HiMessage);
|
if (!string.IsNullOrEmpty(model.HiMessage)) w.WriteString("hiMessage", model.HiMessage);
|
||||||
if (!string.IsNullOrEmpty(model.HiHiMessage)) w.WriteString("hiHiMessage", model.HiHiMessage);
|
if (!string.IsNullOrEmpty(model.HiHiMessage)) w.WriteString("hiHiMessage", model.HiHiMessage);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case AlarmTriggerType.Expression:
|
||||||
|
w.WriteString("expression", model.Expression ?? "");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
w.WriteEndObject();
|
w.WriteEndObject();
|
||||||
@@ -241,4 +253,7 @@ internal sealed class AlarmTriggerModel
|
|||||||
public string? LoMessage { get; set; }
|
public string? LoMessage { get; set; }
|
||||||
public string? HiMessage { get; set; }
|
public string? HiMessage { get; set; }
|
||||||
public string? HiHiMessage { get; set; }
|
public string? HiHiMessage { get; set; }
|
||||||
|
|
||||||
|
// Expression — boolean C# expression evaluated on attribute updates.
|
||||||
|
public string? Expression { get; set; }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,10 @@
|
|||||||
<div class="border rounded bg-white p-3">
|
<div class="border rounded bg-white p-3">
|
||||||
|
|
||||||
@* ── Monitored attribute ───────────────────────────────────────────── *@
|
@* ── Monitored attribute ───────────────────────────────────────────── *@
|
||||||
|
@* Expression triggers reference attributes inside the C# expression itself,
|
||||||
|
so they do not use the single-attribute picker. *@
|
||||||
|
@if (TriggerType != AlarmTriggerType.Expression)
|
||||||
|
{
|
||||||
<div class="mb-3">
|
<div class="mb-3">
|
||||||
<label for="alarm-attr-select" class="form-label small text-uppercase text-muted fw-semibold mb-1">
|
<label for="alarm-attr-select" class="form-label small text-uppercase text-muted fw-semibold mb-1">
|
||||||
Monitored attribute
|
Monitored attribute
|
||||||
@@ -67,6 +71,7 @@
|
|||||||
</div>
|
</div>
|
||||||
}
|
}
|
||||||
</div>
|
</div>
|
||||||
|
}
|
||||||
|
|
||||||
@* ── Type-specific block ───────────────────────────────────────────── *@
|
@* ── Type-specific block ───────────────────────────────────────────── *@
|
||||||
@switch (TriggerType)
|
@switch (TriggerType)
|
||||||
@@ -83,6 +88,9 @@
|
|||||||
case AlarmTriggerType.HiLo:
|
case AlarmTriggerType.HiLo:
|
||||||
@RenderHiLo();
|
@RenderHiLo();
|
||||||
break;
|
break;
|
||||||
|
case AlarmTriggerType.Expression:
|
||||||
|
@RenderExpression();
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@* ── Hint ──────────────────────────────────────────────────────────── *@
|
@* ── Hint ──────────────────────────────────────────────────────────── *@
|
||||||
@@ -559,6 +567,30 @@
|
|||||||
await Emit();
|
await Emit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Expression ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
private RenderFragment RenderExpression() => __builder =>
|
||||||
|
{
|
||||||
|
<label class="form-label small text-uppercase text-muted fw-semibold mb-1">Trigger expression</label>
|
||||||
|
<MonacoEditor Height="120px"
|
||||||
|
Language="csharp"
|
||||||
|
ScriptKind="ScriptAnalysis.ScriptKind.Template"
|
||||||
|
ShowToolbar="false"
|
||||||
|
Value="@(_model.Expression ?? string.Empty)"
|
||||||
|
ValueChanged="OnExpressionChanged"
|
||||||
|
SelfAttributes="@TriggerAttributeMapper.SelfAttributes(AvailableAttributes)"
|
||||||
|
Children="@TriggerAttributeMapper.Children(AvailableAttributes)" />
|
||||||
|
<div class="form-text">
|
||||||
|
A boolean C# expression — e.g. <code>Attributes["Temperature"] > 80</code>.
|
||||||
|
</div>
|
||||||
|
};
|
||||||
|
|
||||||
|
private async Task OnExpressionChanged(string value)
|
||||||
|
{
|
||||||
|
_model.Expression = value;
|
||||||
|
await Emit();
|
||||||
|
}
|
||||||
|
|
||||||
// ── Hint text ──────────────────────────────────────────────────────────
|
// ── Hint text ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
private string BuildHint()
|
private string BuildHint()
|
||||||
@@ -582,6 +614,9 @@
|
|||||||
|
|
||||||
AlarmTriggerType.HiLo => BuildHiLoHint(attr),
|
AlarmTriggerType.HiLo => BuildHiLoHint(attr),
|
||||||
|
|
||||||
|
AlarmTriggerType.Expression =>
|
||||||
|
"Alarm is active while this expression is true.",
|
||||||
|
|
||||||
_ => string.Empty
|
_ => string.Empty
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ namespace ScadaLink.CentralUI.Components.Shared;
|
|||||||
/// trigger; <see cref="Unknown"/> is a stored trigger-type string the runtime
|
/// trigger; <see cref="Unknown"/> is a stored trigger-type string the runtime
|
||||||
/// does not recognize (preserved as-is by the editor).
|
/// does not recognize (preserved as-is by the editor).
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal enum ScriptTriggerKind { None, Interval, ValueChange, Conditional, Call, Unknown }
|
internal enum ScriptTriggerKind { None, Interval, ValueChange, Conditional, Call, Expression, Unknown }
|
||||||
|
|
||||||
/// <summary>A script's trigger as the editor emits it: a type string + config JSON.</summary>
|
/// <summary>A script's trigger as the editor emits it: a type string + config JSON.</summary>
|
||||||
public sealed record ScriptTriggerValue(string? TriggerType, string? Config);
|
public sealed record ScriptTriggerValue(string? TriggerType, string? Config);
|
||||||
@@ -29,6 +29,9 @@ internal sealed class ScriptTriggerModel
|
|||||||
|
|
||||||
/// <summary>Comparison threshold (Conditional).</summary>
|
/// <summary>Comparison threshold (Conditional).</summary>
|
||||||
public double? Threshold { get; set; }
|
public double? Threshold { get; set; }
|
||||||
|
|
||||||
|
/// <summary>Boolean C# expression (Expression).</summary>
|
||||||
|
public string? Expression { get; set; }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -40,6 +43,7 @@ internal sealed class ScriptTriggerModel
|
|||||||
/// ValueChange { attributeName }
|
/// ValueChange { attributeName }
|
||||||
/// Conditional { attributeName, operator, threshold }
|
/// Conditional { attributeName, operator, threshold }
|
||||||
/// Call { }
|
/// Call { }
|
||||||
|
/// Expression { expression }
|
||||||
///
|
///
|
||||||
/// Parsing also accepts the legacy aliases <c>attribute</c> and <c>value</c> so
|
/// Parsing also accepts the legacy aliases <c>attribute</c> and <c>value</c> so
|
||||||
/// older configs survive a round-trip through the editor.
|
/// older configs survive a round-trip through the editor.
|
||||||
@@ -59,6 +63,7 @@ internal static class ScriptTriggerConfigCodec
|
|||||||
"valuechange" => ScriptTriggerKind.ValueChange,
|
"valuechange" => ScriptTriggerKind.ValueChange,
|
||||||
"conditional" => ScriptTriggerKind.Conditional,
|
"conditional" => ScriptTriggerKind.Conditional,
|
||||||
"call" => ScriptTriggerKind.Call,
|
"call" => ScriptTriggerKind.Call,
|
||||||
|
"expression" => ScriptTriggerKind.Expression,
|
||||||
_ => ScriptTriggerKind.Unknown
|
_ => ScriptTriggerKind.Unknown
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -70,6 +75,7 @@ internal static class ScriptTriggerConfigCodec
|
|||||||
ScriptTriggerKind.ValueChange => "ValueChange",
|
ScriptTriggerKind.ValueChange => "ValueChange",
|
||||||
ScriptTriggerKind.Conditional => "Conditional",
|
ScriptTriggerKind.Conditional => "Conditional",
|
||||||
ScriptTriggerKind.Call => "Call",
|
ScriptTriggerKind.Call => "Call",
|
||||||
|
ScriptTriggerKind.Expression => "Expression",
|
||||||
_ => null
|
_ => null
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -104,6 +110,10 @@ internal static class ScriptTriggerConfigCodec
|
|||||||
model.Operator = NormalizeOperator(op);
|
model.Operator = NormalizeOperator(op);
|
||||||
model.Threshold = TryReadDouble(root, "threshold") ?? TryReadDouble(root, "value");
|
model.Threshold = TryReadDouble(root, "threshold") ?? TryReadDouble(root, "value");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ScriptTriggerKind.Expression:
    // Accept the value only when it is a JSON string. JsonElement.GetString()
    // throws InvalidOperationException for any other kind (number, object,
    // bool, ...), and the handler below only catches JsonException, so a
    // malformed stored config would otherwise escape as an unhandled error.
    model.Expression =
        (root.TryGetProperty("expression", out var e) && e.ValueKind == JsonValueKind.String)
            ? e.GetString()
            : null;
    break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (JsonException)
|
catch (JsonException)
|
||||||
@@ -144,6 +154,10 @@ internal static class ScriptTriggerConfigCodec
|
|||||||
w.WriteNumber("threshold", model.Threshold.Value);
|
w.WriteNumber("threshold", model.Threshold.Value);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ScriptTriggerKind.Expression:
|
||||||
|
w.WriteString("expression", model.Expression ?? "");
|
||||||
|
break;
|
||||||
|
|
||||||
// Call → empty object.
|
// Call → empty object.
|
||||||
}
|
}
|
||||||
w.WriteEndObject();
|
w.WriteEndObject();
|
||||||
|
|||||||
@@ -23,6 +23,7 @@
|
|||||||
<option value="Interval">Interval — run on a fixed timer</option>
|
<option value="Interval">Interval — run on a fixed timer</option>
|
||||||
<option value="ValueChange">Value change — run when an attribute changes</option>
|
<option value="ValueChange">Value change — run when an attribute changes</option>
|
||||||
<option value="Conditional">Conditional — run when a condition is met</option>
|
<option value="Conditional">Conditional — run when a condition is met</option>
|
||||||
|
<option value="Expression">Expression — run when a boolean expression becomes true</option>
|
||||||
<option value="Call">Call — run only when invoked by another script</option>
|
<option value="Call">Call — run only when invoked by another script</option>
|
||||||
@if (_kind == ScriptTriggerKind.Unknown)
|
@if (_kind == ScriptTriggerKind.Unknown)
|
||||||
{
|
{
|
||||||
@@ -45,6 +46,9 @@
|
|||||||
case ScriptTriggerKind.Conditional:
|
case ScriptTriggerKind.Conditional:
|
||||||
@RenderConditional();
|
@RenderConditional();
|
||||||
break;
|
break;
|
||||||
|
case ScriptTriggerKind.Expression:
|
||||||
|
@RenderExpression();
|
||||||
|
break;
|
||||||
case ScriptTriggerKind.Call:
|
case ScriptTriggerKind.Call:
|
||||||
<div class="small text-muted">
|
<div class="small text-muted">
|
||||||
No automatic trigger — this script runs only when another script
|
No automatic trigger — this script runs only when another script
|
||||||
@@ -62,7 +66,8 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
@* ── Hint ──────────────────────────────────────────────────────────── *@
|
@* ── Hint ──────────────────────────────────────────────────────────── *@
|
||||||
@if (_kind is ScriptTriggerKind.Interval or ScriptTriggerKind.ValueChange or ScriptTriggerKind.Conditional)
|
@if (_kind is ScriptTriggerKind.Interval or ScriptTriggerKind.ValueChange
|
||||||
|
or ScriptTriggerKind.Conditional or ScriptTriggerKind.Expression)
|
||||||
{
|
{
|
||||||
<div class="mt-3 pt-2 border-top small text-muted">@BuildHint()</div>
|
<div class="mt-3 pt-2 border-top small text-muted">@BuildHint()</div>
|
||||||
}
|
}
|
||||||
@@ -244,6 +249,30 @@
|
|||||||
await Emit();
|
await Emit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Expression ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
private RenderFragment RenderExpression() => __builder =>
|
||||||
|
{
|
||||||
|
<label class="form-label small text-uppercase text-muted fw-semibold mb-1">Trigger expression</label>
|
||||||
|
<MonacoEditor Height="120px"
|
||||||
|
Language="csharp"
|
||||||
|
ScriptKind="ScriptAnalysis.ScriptKind.Template"
|
||||||
|
ShowToolbar="false"
|
||||||
|
Value="@(_model.Expression ?? string.Empty)"
|
||||||
|
ValueChanged="OnExpressionChanged"
|
||||||
|
SelfAttributes="@TriggerAttributeMapper.SelfAttributes(AvailableAttributes)"
|
||||||
|
Children="@TriggerAttributeMapper.Children(AvailableAttributes)" />
|
||||||
|
<div class="form-text">
|
||||||
|
A boolean C# expression — e.g. <code>Attributes["Temperature"] > 80</code>.
|
||||||
|
</div>
|
||||||
|
};
|
||||||
|
|
||||||
|
private async Task OnExpressionChanged(string value)
|
||||||
|
{
|
||||||
|
_model.Expression = value;
|
||||||
|
await Emit();
|
||||||
|
}
|
||||||
|
|
||||||
// ── Attribute picker (ValueChange + Conditional) ───────────────────────
|
// ── Attribute picker (ValueChange + Conditional) ───────────────────────
|
||||||
|
|
||||||
private RenderFragment RenderAttributePicker(string label) => __builder =>
|
private RenderFragment RenderAttributePicker(string label) => __builder =>
|
||||||
@@ -315,6 +344,9 @@
|
|||||||
? $"Runs when {attr} changes, if {attr} {_model.Operator} {t.ToString("0.###", CultureInfo.InvariantCulture)}."
|
? $"Runs when {attr} changes, if {attr} {_model.Operator} {t.ToString("0.###", CultureInfo.InvariantCulture)}."
|
||||||
: $"Runs when {attr} changes and meets the configured condition — set a threshold above.",
|
: $"Runs when {attr} changes and meets the configured condition — set a threshold above.",
|
||||||
|
|
||||||
|
ScriptTriggerKind.Expression =>
|
||||||
|
"Runs once each time this expression becomes true.",
|
||||||
|
|
||||||
_ => string.Empty
|
_ => string.Empty
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
using ScadaLink.CentralUI.ScriptAnalysis;
|
||||||
|
|
||||||
|
namespace ScadaLink.CentralUI.Components.Shared;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Maps the trigger editors' flattened <see cref="AlarmAttributeChoice"/> list
|
||||||
|
/// into the metadata the <see cref="MonacoEditor"/> uses to drive C# completion
|
||||||
|
/// inside an expression trigger:
|
||||||
|
/// <list type="bullet">
|
||||||
|
/// <item>Direct + Inherited choices become <see cref="AttributeShape"/>s,
|
||||||
|
/// surfaced under <c>Attributes["..."]</c>.</item>
|
||||||
|
/// <item>Composed choices — whose canonical name is dotted, e.g.
|
||||||
|
/// <c>CoolingTank.Temp</c> — are grouped by their composition-instance prefix
|
||||||
|
/// into <see cref="CompositionContext"/>s, surfaced under
|
||||||
|
/// <c>Children["..."].Attributes["..."]</c>.</item>
|
||||||
|
/// </list>
|
||||||
|
/// </summary>
|
||||||
|
public static class TriggerAttributeMapper
{
    /// <summary>Direct and inherited attributes, exposed as <c>Attributes["..."]</c>.</summary>
    /// <param name="choices">Flattened attribute choices supplied by the trigger editor.</param>
    /// <returns>
    /// One <see cref="AttributeShape"/> (canonical name + data type) for every
    /// choice whose <c>Source</c> is <c>"Direct"</c> or <c>"Inherited"</c>.
    /// </returns>
    public static IReadOnlyList<AttributeShape> SelfAttributes(
        IReadOnlyList<AlarmAttributeChoice> choices) =>
        choices
            .Where(c => c.Source is "Direct" or "Inherited")
            .Select(c => new AttributeShape(c.CanonicalName, c.DataType))
            .ToList();

    /// <summary>
    /// Composed attributes grouped by composition-instance name, exposed as
    /// <c>Children["X"].Attributes["Y"]</c>. The canonical name is split at its
    /// first dot: the text before it is the child name, the text after it is
    /// the member name. Entries with no dot, an empty child name (leading dot)
    /// or an empty member name (trailing dot) are skipped, since there is no
    /// usable child scope or attribute name to attach them to.
    /// </summary>
    /// <param name="choices">Flattened attribute choices supplied by the trigger editor.</param>
    /// <returns>
    /// One <see cref="CompositionContext"/> per distinct child name (ordinal
    /// comparison), each carrying its member attributes and no scripts.
    /// </returns>
    public static IReadOnlyList<CompositionContext> Children(
        IReadOnlyList<AlarmAttributeChoice> choices) =>
        choices
            .Where(c => c.Source == "Composed")
            // Locate the separator once instead of re-scanning for each slice.
            .Select(c => (Choice: c, Dot: c.CanonicalName.IndexOf('.')))
            // Dot > 0 rejects "no dot" (-1) and a leading dot (0);
            // Dot < Length - 1 rejects a trailing dot (empty member name).
            .Where(x => x.Dot > 0 && x.Dot < x.Choice.CanonicalName.Length - 1)
            .Select(x => new
            {
                Child = x.Choice.CanonicalName[..x.Dot],
                Member = x.Choice.CanonicalName[(x.Dot + 1)..],
                x.Choice.DataType
            })
            .GroupBy(x => x.Child, StringComparer.Ordinal)
            .Select(g => new CompositionContext(
                g.Key,
                g.Select(x => new AttributeShape(x.Member, x.DataType)).ToList(),
                Array.Empty<ScriptShape>()))
            .ToList();
}
|
||||||
@@ -16,8 +16,8 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="4.13.0" />
|
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" />
|
||||||
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Workspaces" Version="4.13.0" />
|
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Workspaces" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -0,0 +1,106 @@
|
|||||||
|
using System.Text;
|
||||||
|
|
||||||
|
namespace ScadaLink.CentralUI.ScriptAnalysis;
|
||||||
|
|
||||||
|
/// <summary>
/// Per-call console capture for the Test Run sandbox.
/// <para>
/// Sandbox scripts use <c>System.Console.WriteLine</c> for ad-hoc output, and the
/// sandbox needs to capture that output per execution. <c>Console.Out</c> is
/// <b>process-global</b>, so redirecting it with <c>Console.SetOut</c> for the
/// duration of one run corrupts any other run executing concurrently: outputs
/// interleave, and whichever run finishes first restores <c>Console.Out</c> while
/// the others are still writing (CentralUI-003).
/// </para>
/// <para>
/// This writer is installed into <c>Console.Out</c>/<c>Console.Error</c>
/// <b>exactly once</b> (see <see cref="Install"/>) and never removed. Each run
/// pushes its own buffer onto an <see cref="AsyncLocal{T}"/> scope via
/// <see cref="BeginCapture"/>; writes made while that scope is current land in
/// that buffer only. Writes made with no active capture scope (genuine
/// host-process console output) fall through to the writer that was installed
/// before this one. No process-global mutation happens per run.
/// </para>
/// </summary>
internal sealed class SandboxConsoleCapture : TextWriter
{
    private static readonly object InstallLock = new();
    private static SandboxConsoleCapture? _outInstance;
    private static SandboxConsoleCapture? _errorInstance;

    // Writer that was current when this router was installed; receives un-scoped writes.
    private readonly TextWriter _fallback;

    // Capture buffer of the current logical call-tree, or null when nothing is capturing.
    private readonly AsyncLocal<StringWriter?> _current = new();

    private SandboxConsoleCapture(TextWriter fallback) => _fallback = fallback;

    /// <summary>Reports the encoding of the underlying (fallback) writer.</summary>
    public override Encoding Encoding => _fallback.Encoding;

    /// <summary>
    /// Installs the routing writers into <see cref="Console.Out"/> and
    /// <see cref="Console.Error"/> once for the process. Idempotent and
    /// thread-safe; subsequent calls return the already-installed instances.
    /// </summary>
    /// <returns>The routing writers for standard output and standard error.</returns>
    public static (SandboxConsoleCapture Out, SandboxConsoleCapture Error) Install()
    {
        // Lock-free fast path once both writers have been published.
        var installedOut = Volatile.Read(ref _outInstance);
        var installedError = Volatile.Read(ref _errorInstance);
        if (installedOut != null && installedError != null)
            return (installedOut, installedError);

        lock (InstallLock)
        {
            // Install into Console BEFORE publishing the field: a concurrent caller
            // on the fast path must never receive a writer that Console is not yet
            // routing through, or its first writes would bypass the capture.
            var outWriter = _outInstance;
            if (outWriter == null)
            {
                outWriter = new SandboxConsoleCapture(Console.Out);
                Console.SetOut(outWriter);
                Volatile.Write(ref _outInstance, outWriter);
            }

            var errorWriter = _errorInstance;
            if (errorWriter == null)
            {
                errorWriter = new SandboxConsoleCapture(Console.Error);
                Console.SetError(errorWriter);
                Volatile.Write(ref _errorInstance, errorWriter);
            }

            return (outWriter, errorWriter);
        }
    }

    /// <summary>
    /// Begins a capture scope on the current logical (async) call-tree. Console
    /// writes routed through this writer from this point until the returned scope
    /// is disposed go into <paramref name="buffer"/> instead of the fallback writer.
    /// Disposing the scope restores whatever buffer (or none) was current before,
    /// so scopes can be nested.
    /// </summary>
    /// <param name="buffer">Destination for the captured output.</param>
    /// <returns>A scope that restores the previous capture target when disposed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    public CaptureScope BeginCapture(StringWriter buffer)
    {
        // A null buffer would silently mean "no capture"; reject it explicitly.
        ArgumentNullException.ThrowIfNull(buffer);

        var previous = _current.Value;
        _current.Value = buffer;
        return new CaptureScope(this, previous);
    }

    public override void Write(char value) => Target.Write(value);

    public override void Write(string? value) => Target.Write(value);

    public override void Write(char[] buffer, int index, int count) =>
        Target.Write(buffer, index, count);

    public override void WriteLine() => Target.WriteLine();

    public override void WriteLine(string? value) => Target.WriteLine(value);

    /// <summary>
    /// Forwards the flush to the active target so that flushing the console still
    /// reaches the fallback writer when no capture scope is active.
    /// </summary>
    public override void Flush() => Target.Flush();

    // The scoped buffer when a capture is active on this call-tree, else the fallback.
    private TextWriter Target => _current.Value ?? _fallback;

    /// <summary>
    /// Handle returned by <see cref="BeginCapture"/>; disposing it restores the
    /// capture target that was current when the scope began.
    /// </summary>
    internal readonly struct CaptureScope : IDisposable
    {
        private readonly SandboxConsoleCapture _owner;
        private readonly StringWriter? _previous;

        internal CaptureScope(SandboxConsoleCapture owner, StringWriter? previous)
        {
            _owner = owner;
            _previous = previous;
        }

        /// <summary>Restores the previous capture target; a default-initialised scope is a no-op.</summary>
        public void Dispose()
        {
            // default(CaptureScope) has no owner; Dispose must not throw.
            if (_owner is not null)
                _owner._current.Value = _previous;
        }
    }
}
|
||||||
@@ -165,8 +165,10 @@ public class ScriptAnalysisService
|
|||||||
/// because a shared script has no template siblings in this context.
|
/// because a shared script has no template siblings in this context.
|
||||||
/// For the SandboxInboundScriptHost surface, every <c>Route</c> call throws
|
/// For the SandboxInboundScriptHost surface, every <c>Route</c> call throws
|
||||||
/// because cross-site routing needs a deployed site.
|
/// because cross-site routing needs a deployed site.
|
||||||
/// Console.Out / Console.Error are redirected per-call so writes from
|
/// Console.Out / Console.Error are captured per-call via an AsyncLocal
|
||||||
/// the script land in the result.
|
/// scope (see <see cref="SandboxConsoleCapture"/>) so writes from the script
|
||||||
|
/// land in the result without mutating process-global Console state — two
|
||||||
|
/// concurrent Test Runs do not interfere with each other.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public async Task<SandboxRunResult> RunInSandboxAsync(SandboxRunRequest request, CancellationToken ct)
|
public async Task<SandboxRunResult> RunInSandboxAsync(SandboxRunRequest request, CancellationToken ct)
|
||||||
{
|
{
|
||||||
@@ -220,6 +222,20 @@ public class ScriptAnalysisService
|
|||||||
SandboxErrorKind.CompileError, 0, markers);
|
SandboxErrorKind.CompileError, 0, markers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Trust-model gate (CentralUI-001): the documented forbidden-API set is
|
||||||
|
// enforced HERE, before execution — not merely surfaced as an editor hint.
|
||||||
|
// Without this, a Design-role user could run arbitrary file/process/
|
||||||
|
// reflection/network code in the central host process.
|
||||||
|
var trustViolations = EnforceTrustModel(script.GetCompilation());
|
||||||
|
if (trustViolations.Count > 0)
|
||||||
|
{
|
||||||
|
return new SandboxRunResult(false, null, null, "",
|
||||||
|
"Script blocked by the trust model — it references forbidden APIs "
|
||||||
|
+ "(System.IO, System.Diagnostics, System.Reflection, System.Net, threading). "
|
||||||
|
+ "See the highlighted diagnostics.",
|
||||||
|
SandboxErrorKind.CompileError, 0, trustViolations);
|
||||||
|
}
|
||||||
|
|
||||||
var parameters = ConvertJsonParameters(request.Parameters);
|
var parameters = ConvertJsonParameters(request.Parameters);
|
||||||
|
|
||||||
using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
|
using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
|
||||||
@@ -311,6 +327,13 @@ public class ScriptAnalysisService
|
|||||||
throw new ScriptSandboxException(
|
throw new ScriptSandboxException(
|
||||||
$"Scripts.CallShared(\"{name}\") compile failed: {string.Join("; ", nestedErrors.Select(d => d.GetMessage()))}");
|
$"Scripts.CallShared(\"{name}\") compile failed: {string.Join("; ", nestedErrors.Select(d => d.GetMessage()))}");
|
||||||
|
|
||||||
|
// Trust-model gate (CentralUI-001) — a nested shared script runs
|
||||||
|
// arbitrary code too, so it must clear the same forbidden-API gate.
|
||||||
|
if (EnforceTrustModel(built.GetCompilation()).Count > 0)
|
||||||
|
throw new ScriptSandboxException(
|
||||||
|
$"Scripts.CallShared(\"{name}\") is blocked by the script trust model — "
|
||||||
|
+ "the shared script references forbidden APIs.");
|
||||||
|
|
||||||
lock (compileCacheLock)
|
lock (compileCacheLock)
|
||||||
{
|
{
|
||||||
if (!compileCache.TryGetValue(name, out compiled))
|
if (!compileCache.TryGetValue(name, out compiled))
|
||||||
@@ -356,16 +379,19 @@ public class ScriptAnalysisService
|
|||||||
Instance = instanceContext,
|
Instance = instanceContext,
|
||||||
};
|
};
|
||||||
|
|
||||||
var originalOut = Console.Out;
|
// Console capture is routed per-call via an AsyncLocal scope (CentralUI-003).
|
||||||
var originalError = Console.Error;
|
// Console.Out is process-global, so it must NOT be redirected per run — two
|
||||||
|
// concurrent Test Runs would interleave output and the first to finish would
|
||||||
|
// restore Console.Out while the other is still writing. SandboxConsoleCapture
|
||||||
|
// installs routing writers once and scopes capture to this call-tree only.
|
||||||
|
var (captureOut, captureError) = SandboxConsoleCapture.Install();
|
||||||
var captured = new StringWriter();
|
var captured = new StringWriter();
|
||||||
|
using var outScope = captureOut.BeginCapture(captured);
|
||||||
|
using var errorScope = captureError.BeginCapture(captured);
|
||||||
|
|
||||||
var stopwatch = Stopwatch.StartNew();
|
var stopwatch = Stopwatch.StartNew();
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
Console.SetOut(captured);
|
|
||||||
Console.SetError(captured);
|
|
||||||
|
|
||||||
// Run on a thread-pool thread with no SynchronizationContext: a
|
// Run on a thread-pool thread with no SynchronizationContext: a
|
||||||
// bound script's Instance.SetAttribute / Attributes[...] block
|
// bound script's Instance.SetAttribute / Attributes[...] block
|
||||||
// synchronously on cross-site I/O (the API surface is sync by
|
// synchronously on cross-site I/O (the API surface is sync by
|
||||||
@@ -416,11 +442,9 @@ public class ScriptAnalysisService
|
|||||||
$"{inner.GetType().Name}: {inner.Message}",
|
$"{inner.GetType().Name}: {inner.Message}",
|
||||||
SandboxErrorKind.RuntimeError, stopwatch.ElapsedMilliseconds, null);
|
SandboxErrorKind.RuntimeError, stopwatch.ElapsedMilliseconds, null);
|
||||||
}
|
}
|
||||||
finally
|
// outScope / errorScope are disposed by their `using` declarations when the
|
||||||
{
|
// method returns, restoring the previous capture scope on this call-tree
|
||||||
Console.SetOut(originalOut);
|
// without touching process-global Console state.
|
||||||
Console.SetError(originalError);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Dictionary<string, object?> ConvertJsonParameters(
|
private static Dictionary<string, object?> ConvertJsonParameters(
|
||||||
@@ -1086,15 +1110,25 @@ public class ScriptAnalysisService
|
|||||||
return new(AttributeContextKind.None, null);
|
return new(AttributeContextKind.None, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Finds every reference to a forbidden API — the documented script trust model,
|
||||||
|
/// see <see cref="ForbiddenNamespacePrefixes"/>. Identifiers are resolved against
|
||||||
|
/// the semantic model, so a forbidden type or member is caught however it is
|
||||||
|
/// written: bare (<c>File</c>), fully qualified
|
||||||
|
/// (<c>System.IO.File.WriteAllText</c>), or via an alias — while a user identifier
|
||||||
|
/// that merely shares a name with a forbidden type (<c>var File = …</c>) does not
|
||||||
|
/// false-positive. Used both for editor diagnostics and as the pre-execution
|
||||||
|
/// trust-model gate (see <see cref="EnforceTrustModel"/>).
|
||||||
|
/// </summary>
|
||||||
private static IEnumerable<DiagnosticMarker> FindForbiddenApiUsages(SyntaxTree tree, SemanticModel model)
|
private static IEnumerable<DiagnosticMarker> FindForbiddenApiUsages(SyntaxTree tree, SemanticModel model)
|
||||||
{
|
{
|
||||||
var root = tree.GetRoot();
|
var root = tree.GetRoot();
|
||||||
|
|
||||||
// Banned using directives — pure namespace string match is fine here.
|
// Banned using directives.
|
||||||
foreach (var u in root.DescendantNodes().OfType<UsingDirectiveSyntax>())
|
foreach (var u in root.DescendantNodes().OfType<UsingDirectiveSyntax>())
|
||||||
{
|
{
|
||||||
var name = u.Name?.ToString() ?? "";
|
var name = u.Name?.ToString() ?? "";
|
||||||
if (ForbiddenNamespacePrefixes.Any(p => name == p || name.StartsWith(p + ".")))
|
if (IsForbiddenName(name))
|
||||||
{
|
{
|
||||||
var span = u.GetLocation().GetLineSpan().Span;
|
var span = u.GetLocation().GetLineSpan().Span;
|
||||||
yield return new DiagnosticMarker(
|
yield return new DiagnosticMarker(
|
||||||
@@ -1108,20 +1142,14 @@ public class ScriptAnalysisService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Banned type usages — resolved via the semantic model so a user
|
// Banned type / member references, resolved via the semantic model. Every
|
||||||
// identifier named "File" or "Thread" does NOT trigger the diagnostic
|
// identifier is checked — including the right-hand side of a member access —
|
||||||
// unless it actually resolves to a forbidden type.
|
// so a fully-qualified forbidden call (System.IO.File.WriteAllText) cannot
|
||||||
|
// slip past by avoiding a `using` directive or a bare type name.
|
||||||
foreach (var ident in root.DescendantNodes().OfType<IdentifierNameSyntax>())
|
foreach (var ident in root.DescendantNodes().OfType<IdentifierNameSyntax>())
|
||||||
{
|
{
|
||||||
// Skip the identifier on the right side of a member access — only
|
var forbidden = ForbiddenNameFor(model.GetSymbolInfo(ident).Symbol);
|
||||||
// the leftmost (the type or qualifier) is what we want to check.
|
if (forbidden == null) continue;
|
||||||
if (ident.Parent is MemberAccessExpressionSyntax m && m.Name == ident) continue;
|
|
||||||
|
|
||||||
var symbol = model.GetSymbolInfo(ident).Symbol;
|
|
||||||
if (symbol is not INamedTypeSymbol type) continue;
|
|
||||||
|
|
||||||
var ns = type.ContainingNamespace?.ToDisplayString() ?? "";
|
|
||||||
if (!ForbiddenNamespacePrefixes.Any(p => ns == p || ns.StartsWith(p + "."))) continue;
|
|
||||||
|
|
||||||
var span = ident.GetLocation().GetLineSpan().Span;
|
var span = ident.GetLocation().GetLineSpan().Span;
|
||||||
yield return new DiagnosticMarker(
|
yield return new DiagnosticMarker(
|
||||||
@@ -1130,11 +1158,75 @@ public class ScriptAnalysisService
|
|||||||
StartColumn: span.Start.Character + 1,
|
StartColumn: span.Start.Character + 1,
|
||||||
EndLineNumber: span.End.Line + 1,
|
EndLineNumber: span.End.Line + 1,
|
||||||
EndColumn: span.End.Character + 1,
|
EndColumn: span.End.Character + 1,
|
||||||
Message: $"Type '{type.Name}' from forbidden namespace '{ns}' is not allowed in scripts.",
|
Message: $"'{ident.Identifier.ValueText}' resolves to forbidden API '{forbidden}', " +
|
||||||
|
"which is not allowed in scripts (script trust model).",
|
||||||
Code: "SCADA002");
|
Code: "SCADA002");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns the first qualified name implicated by <paramref name="symbol"/> that
/// the trust model forbids, or null when the symbol is null or none of its names
/// is forbidden. Candidate names come from <see cref="QualifiedNamesOf"/> (the
/// symbol's namespace and, for a type or member, the type's full name), so an entry
/// like <c>System.Threading.Thread</c> bans that exact type while the rest of
/// <c>System.Threading</c> (e.g. <c>CancellationToken</c>) stays allowed.
/// </summary>
private static string? ForbiddenNameFor(ISymbol? symbol) =>
    symbol == null
        ? null
        : QualifiedNamesOf(symbol).FirstOrDefault(candidate => IsForbiddenName(candidate));
|
||||||
|
|
||||||
|
/// <summary>
/// Enumerates the fully-qualified names a symbol reference implicates for
/// trust-model checking: a non-global namespace yields its own display name; a
/// type yields its containing namespace (when not global) followed by its full
/// type name; any other symbol yields the same two names for its containing type,
/// or nothing when it has no containing type.
/// </summary>
private static IEnumerable<string> QualifiedNamesOf(ISymbol symbol)
{
    if (symbol is INamespaceSymbol { IsGlobalNamespace: false } namespaceSymbol)
    {
        yield return namespaceSymbol.ToDisplayString();
        yield break;
    }

    // A type implicates itself; everything else implicates the type that declares it.
    ITypeSymbol? owner = symbol is ITypeSymbol self ? self : symbol.ContainingType;
    if (owner is null)
        yield break;

    if (owner.ContainingNamespace is { IsGlobalNamespace: false } owningNamespace)
        yield return owningNamespace.ToDisplayString();

    yield return FullTypeName(owner);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the dotted name of a type for trust-model matching: the containing
/// namespace (omitted when global), then every containing type from outermost to
/// innermost, then the type's own name. Including the containing-type chain means
/// a type nested inside a banned type (<c>Ns.Banned.Inner</c>) still matches the
/// <c>Ns.Banned</c> prefix, and an unrelated nested type is not mistaken for a
/// top-level type that happens to share its simple name.
/// </summary>
/// <param name="type">The type symbol to name.</param>
/// <returns>The namespace- and container-qualified name, without generic arguments.</returns>
private static string FullTypeName(ITypeSymbol type)
{
    var name = type.Name;

    // Prefix each enclosing type, walking outwards from the innermost container.
    for (var outer = type.ContainingType; outer is not null; outer = outer.ContainingType)
        name = outer.Name + "." + name;

    return type.ContainingNamespace is { IsGlobalNamespace: false } ns
        ? ns.ToDisplayString() + "." + name
        : name;
}
|
||||||
|
|
||||||
|
/// <summary>
/// True when <paramref name="qualifiedName"/> equals an entry of
/// <c>ForbiddenNamespacePrefixes</c> or starts with an entry followed by a dot
/// (ordinal comparison), i.e. it names that entry or something nested beneath it.
/// </summary>
private static bool IsForbiddenName(string qualifiedName)
{
    foreach (var prefix in ForbiddenNamespacePrefixes)
    {
        if (qualifiedName == prefix)
            return true;

        if (qualifiedName.StartsWith(prefix + ".", StringComparison.Ordinal))
            return true;
    }

    return false;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Pre-execution trust-model gate (CentralUI-001). Returns the forbidden-API
/// markers (SCADA001/SCADA002) for a compiled script; an empty list means the
/// script is clear to run. Every syntax tree in the compilation is inspected, not
/// only the first, so code in any additional tree of the same compilation cannot
/// bypass the gate. This is a static semantic gate, not a process sandbox:
/// reflection-based indirection is still out of its reach, and full isolation
/// would require running scripts in a separate constrained process.
/// </summary>
/// <param name="compilation">The compilation of the script about to run.</param>
/// <returns>
/// All forbidden-API markers found, in tree order. NOTE(review): markers from a
/// tree other than the primary script carry positions relative to that tree —
/// confirm how callers want those displayed.
/// </returns>
private static IReadOnlyList<DiagnosticMarker> EnforceTrustModel(Compilation compilation)
{
    var markers = new List<DiagnosticMarker>();

    foreach (var tree in compilation.SyntaxTrees)
    {
        var model = compilation.GetSemanticModel(tree);
        markers.AddRange(FindForbiddenApiUsages(tree, model));
    }

    return markers;
}
|
||||||
|
|
||||||
private static CompletionItem ToCompletionItem(ISymbol symbol)
|
private static CompletionItem ToCompletionItem(ISymbol symbol)
|
||||||
{
|
{
|
||||||
var kind = symbol.Kind switch
|
var kind = symbol.Kind switch
|
||||||
|
|||||||
@@ -18,6 +18,10 @@ public static class ServiceCollectionExtensions
|
|||||||
services.AddScoped<AuthenticationStateProvider, CookieAuthenticationStateProvider>();
|
services.AddScoped<AuthenticationStateProvider, CookieAuthenticationStateProvider>();
|
||||||
services.AddCascadingAuthenticationState();
|
services.AddCascadingAuthenticationState();
|
||||||
|
|
||||||
|
// Resolves the current user's permitted site set from their SiteId claims
|
||||||
|
// so Deployment/Monitoring pages can enforce site scoping (CentralUI-002).
|
||||||
|
services.AddScoped<SiteScopeService>();
|
||||||
|
|
||||||
// Centralised dialog service: pages inject IDialogService and a single
|
// Centralised dialog service: pages inject IDialogService and a single
|
||||||
// <DialogHost /> in MainLayout renders the active dialog. See
|
// <DialogHost /> in MainLayout renders the active dialog. See
|
||||||
// Components/Shared/IDialogService.cs.
|
// Components/Shared/IDialogService.cs.
|
||||||
|
|||||||
@@ -8,8 +8,8 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -11,5 +11,11 @@ public enum AlarmTriggerType
|
|||||||
/// may carry its own priority; transitions between levels emit a fresh
|
/// may carry its own priority; transitions between levels emit a fresh
|
||||||
/// AlarmStateChanged with the corresponding <see cref="AlarmLevel"/>.
|
/// AlarmStateChanged with the corresponding <see cref="AlarmLevel"/>.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
HiLo
|
HiLo,
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Read-only boolean C# expression evaluated on attribute updates. The
|
||||||
|
/// trigger fires when the expression evaluates to <c>true</c>.
|
||||||
|
/// </summary>
|
||||||
|
Expression
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -183,6 +183,15 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unsubscribe the failed stream on the *previous* endpoint before reconnecting.
|
||||||
|
// This cancels the local subscription CTS and -- where the channel is still
|
||||||
|
// alive -- propagates gRPC cancellation to the site so its SiteStreamGrpcServer
|
||||||
|
// stops the StreamRelayActor for this correlation ID, rather than leaving a
|
||||||
|
// zombie relay actor until TCP RST / keepalive eventually detects the loss.
|
||||||
|
var previousEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
|
||||||
|
var previousClient = _grpcFactory.GetOrCreate(_siteIdentifier, previousEndpoint);
|
||||||
|
previousClient.Unsubscribe(_correlationId);
|
||||||
|
|
||||||
// Flip to the other node
|
// Flip to the other node
|
||||||
_useNodeA = !_useNodeA;
|
_useNodeA = !_useNodeA;
|
||||||
|
|
||||||
|
|||||||
@@ -127,20 +127,36 @@ public class DebugStreamService
|
|||||||
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||||
timeoutCts.CancelAfter(TimeSpan.FromSeconds(30));
|
timeoutCts.CancelAfter(TimeSpan.FromSeconds(30));
|
||||||
|
|
||||||
|
DebugViewSnapshot snapshot;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var snapshot = await snapshotTcs.Task.WaitAsync(timeoutCts.Token);
|
snapshot = await snapshotTcs.Task.WaitAsync(timeoutCts.Token);
|
||||||
|
|
||||||
_logger.LogInformation("Debug stream {SessionId} started for {Instance} on site {Site}",
|
|
||||||
sessionId, instanceUniqueName, siteIdentifier);
|
|
||||||
|
|
||||||
return new DebugStreamSession(sessionId, snapshot);
|
|
||||||
}
|
}
|
||||||
catch (OperationCanceledException)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
StopStream(sessionId);
|
// Any failure before the snapshot arrives — the 30s timeout, or the stream
|
||||||
throw new TimeoutException($"Timed out waiting for debug snapshot from {instanceUniqueName} on site {siteIdentifier}.");
|
// terminating early (site disconnect / gRPC failure, surfaced by
|
||||||
|
// onTerminatedWrapper as an InvalidOperationException) — must deterministically
|
||||||
|
// tear down the bridge actor and its site-side subscription. Use the local
|
||||||
|
// actor reference: a racing onTerminatedWrapper may already have removed the
|
||||||
|
// session, which would make StopStream a no-op. StopDebugStream is idempotent
|
||||||
|
// (the actor may already be stopping itself).
|
||||||
|
_sessions.TryRemove(sessionId, out _);
|
||||||
|
bridgeActor.Tell(new StopDebugStream());
|
||||||
|
|
||||||
|
if (ex is OperationCanceledException)
|
||||||
|
throw new TimeoutException(
|
||||||
|
$"Timed out waiting for debug snapshot from {instanceUniqueName} on site {siteIdentifier}.");
|
||||||
|
|
||||||
|
throw new InvalidOperationException(
|
||||||
|
$"Debug stream for {instanceUniqueName} on site {siteIdentifier} terminated before a snapshot was received.",
|
||||||
|
ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_logger.LogInformation("Debug stream {SessionId} started for {Instance} on site {Site}",
|
||||||
|
sessionId, instanceUniqueName, siteIdentifier);
|
||||||
|
|
||||||
|
return new DebugStreamSession(sessionId, snapshot);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|||||||
@@ -57,6 +57,32 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
|||||||
_subscriptions[correlationId] = cts;
|
_subscriptions[correlationId] = cts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Registers a subscription's CancellationTokenSource for a correlation ID.
/// If a different CTS is already registered for that ID (a reconnect race where
/// two <see cref="SubscribeAsync"/> calls briefly share an ID), it is swapped out
/// atomically and then cancelled and disposed so it cannot leak. Registering the
/// CTS that is already stored is a no-op. Internal for testability.
/// </summary>
/// <param name="correlationId">Correlation ID that keys the subscription.</param>
/// <param name="cts">The CTS that should own the slot from now on.</param>
internal void RegisterSubscription(string correlationId, CancellationTokenSource cts)
{
    // A read followed by an indexer write is not atomic: two racing registrations
    // could each miss the other's CTS and one would be overwritten without ever
    // being cancelled. Use compare-and-swap style TryAdd/TryUpdate and retry on a
    // lost race so every displaced CTS is observed by exactly one caller.
    while (true)
    {
        if (!_subscriptions.TryGetValue(correlationId, out var prior))
        {
            if (_subscriptions.TryAdd(correlationId, cts))
                return;

            continue; // someone else added an entry first; re-read it
        }

        if (ReferenceEquals(prior, cts))
            return;

        if (_subscriptions.TryUpdate(correlationId, cts, prior))
        {
            try
            {
                prior.Cancel();
            }
            finally
            {
                // Dispose even if a cancellation callback throws.
                prior.Dispose();
            }

            return;
        }
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Drops the subscription entry for <paramref name="correlationId"/>, but only
/// when the CTS currently stored under that ID is exactly <paramref name="cts"/>.
/// If a racing replacement stream already owns the slot, nothing is removed.
/// Internal for testability.
/// </summary>
internal void RemoveSubscription(string correlationId, CancellationTokenSource cts)
{
    // Key + value removal: succeeds only if both still match the stored entry.
    var ownEntry = new KeyValuePair<string, CancellationTokenSource>(correlationId, cts);
    _ = _subscriptions.TryRemove(ownEntry);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Opens a server-streaming subscription for a specific instance.
|
/// Opens a server-streaming subscription for a specific instance.
|
||||||
/// This is a long-running async method; the caller launches it as a background task.
|
/// This is a long-running async method; the caller launches it as a background task.
|
||||||
@@ -74,7 +100,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
|||||||
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
|
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
|
||||||
|
|
||||||
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||||
_subscriptions[correlationId] = cts;
|
RegisterSubscription(correlationId, cts);
|
||||||
|
|
||||||
var request = new InstanceStreamRequest
|
var request = new InstanceStreamRequest
|
||||||
{
|
{
|
||||||
@@ -103,7 +129,8 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
|||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
{
|
{
|
||||||
_subscriptions.TryRemove(correlationId, out _);
|
// Remove only our own entry -- a racing reconnect may already own the slot.
|
||||||
|
RemoveSubscription(correlationId, cts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,13 +16,13 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka" Version="1.5.62" />
|
<PackageReference Include="Akka" />
|
||||||
<PackageReference Include="Akka.Remote" Version="1.5.62" />
|
<PackageReference Include="Akka.Remote" />
|
||||||
<PackageReference Include="Akka.Cluster" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster" />
|
||||||
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster.Tools" />
|
||||||
<PackageReference Include="Google.Protobuf" Version="3.29.3" />
|
<PackageReference Include="Google.Protobuf" />
|
||||||
<PackageReference Include="Grpc.Net.Client" Version="2.71.0" />
|
<PackageReference Include="Grpc.Net.Client" />
|
||||||
<PackageReference Include="Grpc.Tools" Version="2.71.0" PrivateAssets="All" />
|
<PackageReference Include="Grpc.Tools" PrivateAssets="All" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -27,17 +27,15 @@ public class TemplateEngineRepository : ITemplateEngineRepository
|
|||||||
.FirstOrDefaultAsync(t => t.Id == id, cancellationToken);
|
.FirstOrDefaultAsync(t => t.Id == id, cancellationToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Loads a template together with its child members — Attributes, Alarms,
|
||||||
|
/// Scripts and Compositions — eager-loaded so callers get the full template
|
||||||
|
/// aggregate in a single round-trip. "Children" here refers to the template's
|
||||||
|
/// member collections, not derived/sub templates.
|
||||||
|
/// </summary>
|
||||||
public async Task<Template?> GetTemplateWithChildrenAsync(int id, CancellationToken cancellationToken = default)
|
public async Task<Template?> GetTemplateWithChildrenAsync(int id, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
var template = await GetTemplateByIdAsync(id, cancellationToken);
|
return await GetTemplateByIdAsync(id, cancellationToken);
|
||||||
if (template == null) return null;
|
|
||||||
|
|
||||||
// Load all templates that have this template as parent
|
|
||||||
var children = await _context.Templates
|
|
||||||
.Where(t => t.ParentTemplateId == id)
|
|
||||||
.ToListAsync(cancellationToken);
|
|
||||||
|
|
||||||
return template;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task<IReadOnlyList<Template>> GetAllTemplatesAsync(CancellationToken cancellationToken = default)
|
public async Task<IReadOnlyList<Template>> GetAllTemplatesAsync(CancellationToken cancellationToken = default)
|
||||||
|
|||||||
@@ -8,16 +8,16 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore" Version="10.0.7" />
|
<PackageReference Include="Microsoft.EntityFrameworkCore" />
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="10.0.7" />
|
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.7">
|
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
|
||||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||||
<PrivateAssets>all</PrivateAssets>
|
<PrivateAssets>all</PrivateAssets>
|
||||||
</PackageReference>
|
</PackageReference>
|
||||||
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Configuration.Json" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
<PackageReference Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" Version="10.0.7" />
|
<PackageReference Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -171,6 +171,11 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
case UnsubscribeTagsRequest:
|
case UnsubscribeTagsRequest:
|
||||||
Stash.Stash();
|
Stash.Stash();
|
||||||
break;
|
break;
|
||||||
|
case SubscribeCompleted sc:
|
||||||
|
// A subscribe started while Connected can complete after a transition;
|
||||||
|
// apply it so its state survives into the next ReSubscribeAll.
|
||||||
|
HandleSubscribeCompleted(sc);
|
||||||
|
break;
|
||||||
case GetHealthReport:
|
case GetHealthReport:
|
||||||
ReplyWithHealthReport();
|
ReplyWithHealthReport();
|
||||||
break;
|
break;
|
||||||
@@ -207,6 +212,15 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
case SubscribeTagsRequest req:
|
case SubscribeTagsRequest req:
|
||||||
HandleSubscribe(req);
|
HandleSubscribe(req);
|
||||||
break;
|
break;
|
||||||
|
case SubscribeCompleted sc:
|
||||||
|
// In Connected state, a connection-level subscribe failure must drive
|
||||||
|
// the reconnection state machine (DataConnectionLayer-004).
|
||||||
|
if (HandleSubscribeCompleted(sc))
|
||||||
|
{
|
||||||
|
_log.Warning("[{0}] Connection-level subscribe failure — entering Reconnecting", _connectionName);
|
||||||
|
BecomeReconnecting();
|
||||||
|
}
|
||||||
|
break;
|
||||||
case UnsubscribeTagsRequest req:
|
case UnsubscribeTagsRequest req:
|
||||||
HandleUnsubscribe(req);
|
HandleUnsubscribe(req);
|
||||||
break;
|
break;
|
||||||
@@ -338,6 +352,11 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
case TagResolutionFailed:
|
case TagResolutionFailed:
|
||||||
// Ignore — stale results from previous connection; ReSubscribeAll runs after reconnect
|
// Ignore — stale results from previous connection; ReSubscribeAll runs after reconnect
|
||||||
break;
|
break;
|
||||||
|
case SubscribeCompleted sc:
|
||||||
|
// A subscribe started while Connected can complete after a transition;
|
||||||
|
// apply it so its state survives into the next ReSubscribeAll.
|
||||||
|
HandleSubscribeCompleted(sc);
|
||||||
|
break;
|
||||||
case GetHealthReport:
|
case GetHealthReport:
|
||||||
ReplyWithHealthReport();
|
ReplyWithHealthReport();
|
||||||
break;
|
break;
|
||||||
@@ -466,18 +485,27 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
if (!_subscriptionsByInstance.ContainsKey(request.InstanceUniqueName))
|
if (!_subscriptionsByInstance.ContainsKey(request.InstanceUniqueName))
|
||||||
_subscriptionsByInstance[request.InstanceUniqueName] = new HashSet<string>();
|
_subscriptionsByInstance[request.InstanceUniqueName] = new HashSet<string>();
|
||||||
|
|
||||||
var instanceTags = _subscriptionsByInstance[request.InstanceUniqueName];
|
|
||||||
var self = Self;
|
var self = Self;
|
||||||
var sender = Sender;
|
var sender = Sender;
|
||||||
|
|
||||||
|
// Snapshot the already-subscribed tag set on the actor thread. The background
|
||||||
|
// task below must NOT read or mutate actor state — it performs only adapter
|
||||||
|
// I/O and reports results back via a SubscribeCompleted message, which is
|
||||||
|
// applied to actor state on the actor thread (see HandleSubscribeCompleted).
|
||||||
|
var alreadySubscribed = new HashSet<string>(_subscriptionIds.Keys);
|
||||||
|
|
||||||
Task.Run(async () =>
|
Task.Run(async () =>
|
||||||
{
|
{
|
||||||
|
var results = new List<SubscribeTagResult>(request.TagPaths.Count);
|
||||||
|
var tagsToSeed = new List<string>();
|
||||||
|
|
||||||
foreach (var tagPath in request.TagPaths)
|
foreach (var tagPath in request.TagPaths)
|
||||||
{
|
{
|
||||||
if (_subscriptionIds.ContainsKey(tagPath))
|
if (alreadySubscribed.Contains(tagPath))
|
||||||
{
|
{
|
||||||
// Already subscribed — just track for this instance
|
// Already subscribed by another instance — just track for this one.
|
||||||
instanceTags.Add(tagPath);
|
results.Add(new SubscribeTagResult(tagPath, AlreadySubscribed: true, Success: true, null, null));
|
||||||
|
tagsToSeed.Add(tagPath);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -487,27 +515,27 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
{
|
{
|
||||||
self.Tell(new TagValueReceived(path, value));
|
self.Tell(new TagValueReceived(path, value));
|
||||||
});
|
});
|
||||||
_subscriptionIds[tagPath] = subId;
|
results.Add(new SubscribeTagResult(tagPath, AlreadySubscribed: false, Success: true, subId, null));
|
||||||
instanceTags.Add(tagPath);
|
tagsToSeed.Add(tagPath);
|
||||||
_totalSubscribed++;
|
|
||||||
_resolvedTags++;
|
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
// WP-12: Tag path resolution failure — mark as unresolved, retry later
|
// DataConnectionLayer-004: distinguish a connection-level fault
|
||||||
_unresolvedTags.Add(tagPath);
|
// (adapter not connected / transport down) from a genuine
|
||||||
instanceTags.Add(tagPath);
|
// node-not-found. Connection-level faults must drive the
|
||||||
_totalSubscribed++;
|
// reconnection state machine, not be retried as unresolved tags.
|
||||||
|
var connectionLevel = IsConnectionLevelFailure(ex);
|
||||||
self.Tell(new TagResolutionFailed(tagPath, ex.Message));
|
results.Add(new SubscribeTagResult(
|
||||||
|
tagPath, AlreadySubscribed: false, Success: false, null, ex.Message,
|
||||||
|
ConnectionLevelFailure: connectionLevel));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initial read — seed current values for all resolved tags so the Instance Actor
|
// Initial read — seed current values for resolved tags so the Instance Actor
|
||||||
// doesn't stay Uncertain until the next OPC UA data change notification
|
// doesn't stay Uncertain until the next OPC UA data change notification.
|
||||||
foreach (var tagPath in instanceTags)
|
// Tell is thread-safe, so seeded values are delivered directly as messages.
|
||||||
|
foreach (var tagPath in tagsToSeed)
|
||||||
{
|
{
|
||||||
if (_unresolvedTags.Contains(tagPath)) continue;
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var readResult = await _adapter.ReadAsync(tagPath);
|
var readResult = await _adapter.ReadAsync(tagPath);
|
||||||
@@ -522,11 +550,77 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new SubscribeTagsResponse(
|
return new SubscribeCompleted(request, sender, results);
|
||||||
request.CorrelationId, request.InstanceUniqueName, true, null, DateTimeOffset.UtcNow);
|
}).PipeTo(self);
|
||||||
}).PipeTo(sender);
|
}
|
||||||
|
|
||||||
// Start tag resolution retry timer if we have unresolved tags
|
/// <summary>
|
||||||
|
/// Applies the result of an asynchronous subscribe on the actor thread. ALL mutation
|
||||||
|
/// of subscription state and counters happens here — never on the background task —
|
||||||
|
/// so the actor model's single-threaded state guarantee holds.
|
||||||
|
/// Returns <c>true</c> if any tag failed at connection level (DataConnectionLayer-004),
|
||||||
|
/// signalling the caller (only the Connected state) to enter Reconnecting.
|
||||||
|
/// </summary>
|
||||||
|
private bool HandleSubscribeCompleted(SubscribeCompleted msg)
|
||||||
|
{
|
||||||
|
var instanceName = msg.Request.InstanceUniqueName;
|
||||||
|
if (!_subscriptionsByInstance.TryGetValue(instanceName, out var instanceTags))
|
||||||
|
{
|
||||||
|
// The instance was unsubscribed while the subscribe I/O was in flight.
|
||||||
|
instanceTags = new HashSet<string>();
|
||||||
|
_subscriptionsByInstance[instanceName] = instanceTags;
|
||||||
|
}
|
||||||
|
|
||||||
|
// DataConnectionLayer-004: if any tag failed because the adapter is not
|
||||||
|
// connected (a connection-level fault), the subscribe needs the reconnection
|
||||||
|
// state machine, not the tag-resolution retry. Drive a disconnect and let the
|
||||||
|
// request be re-stashed/retried after reconnect via ReSubscribeAll.
|
||||||
|
var connectionLevelFailure = msg.Results.Any(r => !r.Success && r.ConnectionLevelFailure);
|
||||||
|
|
||||||
|
foreach (var result in msg.Results)
|
||||||
|
{
|
||||||
|
instanceTags.Add(result.TagPath);
|
||||||
|
|
||||||
|
// Re-check against current state: another subscribe may have resolved the
|
||||||
|
// same tag while this request's I/O was in flight.
|
||||||
|
if (result.AlreadySubscribed || _subscriptionIds.ContainsKey(result.TagPath))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (result.Success)
|
||||||
|
{
|
||||||
|
_subscriptionIds[result.TagPath] = result.SubscriptionId!;
|
||||||
|
_totalSubscribed++;
|
||||||
|
_resolvedTags++;
|
||||||
|
}
|
||||||
|
else if (result.ConnectionLevelFailure)
|
||||||
|
{
|
||||||
|
// Connection-level fault — do not count as an unresolved tag.
|
||||||
|
// ReSubscribeAll after reconnect derives the tag from
|
||||||
|
// _subscriptionsByInstance (already updated above).
|
||||||
|
_log.Warning("[{0}] Subscribe for {1} failed at connection level: {2}",
|
||||||
|
_connectionName, result.TagPath, result.Error);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// WP-12: genuine tag resolution failure — mark unresolved so the
|
||||||
|
// periodic retry timer picks it up.
|
||||||
|
_unresolvedTags.Add(result.TagPath);
|
||||||
|
_totalSubscribed++;
|
||||||
|
_log.Debug("[{0}] Tag resolution failed for {1}: {2}",
|
||||||
|
_connectionName, result.TagPath, result.Error);
|
||||||
|
|
||||||
|
// DataConnectionLayer-004 / design doc Tag Path Resolution step 2:
|
||||||
|
// mark the attribute quality `bad` so the Instance Actor sees a
|
||||||
|
// signal rather than staying Uncertain indefinitely.
|
||||||
|
if (_subscribers.TryGetValue(instanceName, out var subscriber))
|
||||||
|
{
|
||||||
|
subscriber.Tell(new TagValueUpdate(
|
||||||
|
_connectionName, result.TagPath, null, QualityCode.Bad, DateTimeOffset.UtcNow));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start the tag-resolution retry timer if any tags are unresolved.
|
||||||
if (_unresolvedTags.Count > 0)
|
if (_unresolvedTags.Count > 0)
|
||||||
{
|
{
|
||||||
Timers.StartPeriodicTimer(
|
Timers.StartPeriodicTimer(
|
||||||
@@ -535,6 +629,30 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
_options.TagResolutionRetryInterval,
|
_options.TagResolutionRetryInterval,
|
||||||
_options.TagResolutionRetryInterval);
|
_options.TagResolutionRetryInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
msg.ReplyTo.Tell(new SubscribeTagsResponse(
|
||||||
|
msg.Request.CorrelationId, instanceName, true, null, DateTimeOffset.UtcNow));
|
||||||
|
|
||||||
|
// The caller (Connected state only) decides whether to enter Reconnecting.
|
||||||
|
// In Connecting/Reconnecting the connection is not established anyway, so the
|
||||||
|
// existing reconnect cycle handles recovery without a re-trigger here.
|
||||||
|
return connectionLevelFailure;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// DataConnectionLayer-004: classifies a subscribe exception as a connection-level
|
||||||
|
/// fault (adapter not connected / transport down) versus a genuine tag-resolution
|
||||||
|
/// failure (the node does not exist on the device). Connection-level faults must
|
||||||
|
/// drive the reconnection state machine; resolution failures are retried on the
|
||||||
|
/// tag-resolution timer.
|
||||||
|
/// </summary>
|
||||||
|
private static bool IsConnectionLevelFailure(Exception ex)
|
||||||
|
{
|
||||||
|
var baseEx = ex is AggregateException agg ? agg.GetBaseException() : ex;
|
||||||
|
return baseEx is InvalidOperationException
|
||||||
|
or System.Net.Sockets.SocketException
|
||||||
|
or TimeoutException
|
||||||
|
or System.IO.IOException;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void HandleUnsubscribe(UnsubscribeTagsRequest request)
|
private void HandleUnsubscribe(UnsubscribeTagsRequest request)
|
||||||
@@ -575,15 +693,29 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
_log.Debug("[{0}] Writing to tag {1}", _connectionName, request.TagPath);
|
_log.Debug("[{0}] Writing to tag {1}", _connectionName, request.TagPath);
|
||||||
var sender = Sender;
|
var sender = Sender;
|
||||||
|
|
||||||
|
// DataConnectionLayer-005: bound the write with WriteTimeout. A hung device
|
||||||
|
// write (TCP black-hole) would otherwise never complete, so PipeTo never
|
||||||
|
// fires and the calling script gets no DCL-level error. The CancellationToken
|
||||||
|
// is passed to the adapter; on timeout we translate cancellation into a
|
||||||
|
// failed WriteTagResponse so the failure is returned synchronously (WP-11).
|
||||||
|
var cts = new CancellationTokenSource(_options.WriteTimeout);
|
||||||
|
|
||||||
// WP-11: Write through DCL to device, failure returned synchronously
|
// WP-11: Write through DCL to device, failure returned synchronously
|
||||||
_adapter.WriteAsync(request.TagPath, request.Value).ContinueWith(t =>
|
_adapter.WriteAsync(request.TagPath, request.Value, cts.Token).ContinueWith(t =>
|
||||||
{
|
{
|
||||||
|
cts.Dispose();
|
||||||
if (t.IsCompletedSuccessfully)
|
if (t.IsCompletedSuccessfully)
|
||||||
{
|
{
|
||||||
var result = t.Result;
|
var result = t.Result;
|
||||||
return new WriteTagResponse(
|
return new WriteTagResponse(
|
||||||
request.CorrelationId, result.Success, result.ErrorMessage, DateTimeOffset.UtcNow);
|
request.CorrelationId, result.Success, result.ErrorMessage, DateTimeOffset.UtcNow);
|
||||||
}
|
}
|
||||||
|
if (t.IsCanceled || t.Exception?.GetBaseException() is OperationCanceledException)
|
||||||
|
{
|
||||||
|
return new WriteTagResponse(
|
||||||
|
request.CorrelationId, false,
|
||||||
|
$"Write timeout after {_options.WriteTimeout.TotalSeconds:F0}s", DateTimeOffset.UtcNow);
|
||||||
|
}
|
||||||
return new WriteTagResponse(
|
return new WriteTagResponse(
|
||||||
request.CorrelationId, false, t.Exception?.GetBaseException().Message, DateTimeOffset.UtcNow);
|
request.CorrelationId, false, t.Exception?.GetBaseException().Message, DateTimeOffset.UtcNow);
|
||||||
}).PipeTo(sender);
|
}).PipeTo(sender);
|
||||||
@@ -764,5 +896,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
|||||||
internal record TagResolutionFailed(string TagPath, string Error);
|
internal record TagResolutionFailed(string TagPath, string Error);
|
||||||
internal record TagResolutionSucceeded(string TagPath, string SubscriptionId);
|
internal record TagResolutionSucceeded(string TagPath, string SubscriptionId);
|
||||||
internal record RetryTagResolution;
|
internal record RetryTagResolution;
|
||||||
|
internal record SubscribeTagResult(
|
||||||
|
string TagPath, bool AlreadySubscribed, bool Success, string? SubscriptionId, string? Error,
|
||||||
|
bool ConnectionLevelFailure = false);
|
||||||
|
internal record SubscribeCompleted(
|
||||||
|
SubscribeTagsRequest Request, IActorRef ReplyTo, IReadOnlyList<SubscribeTagResult> Results);
|
||||||
public record GetHealthReport;
|
public record GetHealthReport;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -125,8 +125,20 @@ public class DataConnectionManagerActor : ReceiveActor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// OneForOneStrategy with Restart for connection actors — a failed connection
|
/// OneForOneStrategy with Resume for connection actors.
|
||||||
/// should restart and attempt reconnection.
|
///
|
||||||
|
/// DataConnectionLayer-002: a DataConnectionActor is a long-lived, stateful
|
||||||
|
/// coordinator — its in-memory subscription registry (_subscriptionsByInstance,
|
||||||
|
/// _subscriptionIds, _subscribers) is the only record of which Instance Actors
|
||||||
|
/// subscribed to which tags, and there is no durable store to rebuild it from.
|
||||||
|
/// Restart would create a fresh instance and silently discard that registry,
|
||||||
|
/// breaking the design doc's "transparent re-subscribe" guarantee (WP-10):
|
||||||
|
/// subscribers would never be re-subscribed and would sit at stale quality with
|
||||||
|
/// no error. Resume keeps the actor instance and its state intact, so a transient
|
||||||
|
/// exception in a message handler does not lose subscription state. The actor's
|
||||||
|
/// own Become/Stash reconnect state machine already recovers connection-level
|
||||||
|
/// faults, so it does not need a restart to re-establish the connection.
|
||||||
|
/// This matches the ScadaLink convention of Resume for coordinator actors.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
protected override SupervisorStrategy SupervisorStrategy()
|
protected override SupervisorStrategy SupervisorStrategy()
|
||||||
{
|
{
|
||||||
@@ -135,8 +147,8 @@ public class DataConnectionManagerActor : ReceiveActor
|
|||||||
withinTimeRange: TimeSpan.FromMinutes(1),
|
withinTimeRange: TimeSpan.FromMinutes(1),
|
||||||
decider: Decider.From(ex =>
|
decider: Decider.From(ex =>
|
||||||
{
|
{
|
||||||
_log.Warning(ex, "DataConnectionActor threw exception, restarting");
|
_log.Warning(ex, "DataConnectionActor threw exception, resuming (subscription state preserved)");
|
||||||
return Directive.Restart;
|
return Directive.Resume;
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
using System.Collections.Concurrent;
|
||||||
using System.Security.Cryptography.X509Certificates;
|
using System.Security.Cryptography.X509Certificates;
|
||||||
using Opc.Ua;
|
using Opc.Ua;
|
||||||
using Opc.Ua.Client;
|
using Opc.Ua.Client;
|
||||||
@@ -13,8 +14,14 @@ public class RealOpcUaClient : IOpcUaClient
|
|||||||
{
|
{
|
||||||
private ISession? _session;
|
private ISession? _session;
|
||||||
private Subscription? _subscription;
|
private Subscription? _subscription;
|
||||||
private readonly Dictionary<string, MonitoredItem> _monitoredItems = new();
|
|
||||||
private readonly Dictionary<string, Action<string, object?, DateTime, uint>> _callbacks = new();
|
// DataConnectionLayer-003: these maps are read from the OPC Foundation SDK's
|
||||||
|
// internal publish threads (the MonitoredItem.Notification handler reads
|
||||||
|
// _callbacks) concurrently with subscribe/disconnect mutations that run on
|
||||||
|
// thread-pool threads. Plain Dictionary access during a concurrent resize or
|
||||||
|
// Clear() is undefined behaviour, so they must be ConcurrentDictionary.
|
||||||
|
private readonly ConcurrentDictionary<string, MonitoredItem> _monitoredItems = new();
|
||||||
|
private readonly ConcurrentDictionary<string, Action<string, object?, DateTime, uint>> _callbacks = new();
|
||||||
private volatile bool _connectionLostFired;
|
private volatile bool _connectionLostFired;
|
||||||
private OpcUaConnectionOptions _options = new();
|
private OpcUaConnectionOptions _options = new();
|
||||||
private readonly OpcUaGlobalOptions _globalOptions;
|
private readonly OpcUaGlobalOptions _globalOptions;
|
||||||
@@ -180,8 +187,8 @@ public class RealOpcUaClient : IOpcUaClient
|
|||||||
{
|
{
|
||||||
_subscription.RemoveItem(item);
|
_subscription.RemoveItem(item);
|
||||||
await _subscription.ApplyChangesAsync(cancellationToken);
|
await _subscription.ApplyChangesAsync(cancellationToken);
|
||||||
_monitoredItems.Remove(subscriptionHandle);
|
_monitoredItems.TryRemove(subscriptionHandle, out _);
|
||||||
_callbacks.Remove(subscriptionHandle);
|
_callbacks.TryRemove(subscriptionHandle, out _);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,14 +8,14 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka" Version="1.5.62" />
|
<PackageReference Include="Akka" />
|
||||||
<PackageReference Include="Akka.Cluster" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster" />
|
||||||
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster.Tools" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
|
||||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Client" Version="1.5.378.106" />
|
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Client" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -183,19 +183,53 @@ public class DeploymentService
|
|||||||
: Result<DeploymentRecord>.Failure(
|
: Result<DeploymentRecord>.Failure(
|
||||||
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
|
$"Deployment failed: {response.ErrorMessage ?? "Unknown error"}");
|
||||||
}
|
}
|
||||||
catch (Exception ex) when (ex is TimeoutException or OperationCanceledException)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
|
// DeploymentManager-001: any exception out of the try (timeout,
|
||||||
|
// cancellation, transport, serialization, DB) must leave the
|
||||||
|
// deployment record as Failed -- the design requires an interrupted
|
||||||
|
// deployment to be treated as failed, never stuck in InProgress.
|
||||||
|
//
|
||||||
|
// DeploymentManager-002: the failure-status write must NOT use the
|
||||||
|
// operation's cancellation token. If the operation was cancelled or
|
||||||
|
// timed out, that token is already cancelled and the cleanup writes
|
||||||
|
// would themselves throw before the Failed status is persisted.
|
||||||
|
// Use CancellationToken.None so the failure is durably recorded.
|
||||||
|
var isTimeout = ex is TimeoutException or OperationCanceledException;
|
||||||
|
|
||||||
record.Status = DeploymentStatus.Failed;
|
record.Status = DeploymentStatus.Failed;
|
||||||
record.ErrorMessage = $"Communication failure: {ex.Message}";
|
record.ErrorMessage = isTimeout
|
||||||
|
? $"Communication failure: {ex.Message}"
|
||||||
|
: $"Deployment error: {ex.Message}";
|
||||||
record.CompletedAt = DateTimeOffset.UtcNow;
|
record.CompletedAt = DateTimeOffset.UtcNow;
|
||||||
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
|
|
||||||
await _repository.SaveChangesAsync(cancellationToken);
|
|
||||||
|
|
||||||
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
|
try
|
||||||
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
|
{
|
||||||
cancellationToken);
|
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
|
||||||
|
await _repository.SaveChangesAsync(CancellationToken.None);
|
||||||
|
|
||||||
return Result<DeploymentRecord>.Failure($"Deployment timed out: {ex.Message}");
|
await _auditService.LogAsync(user, "DeployFailed", "Instance", instanceId.ToString(),
|
||||||
|
instance.UniqueName, new { DeploymentId = deploymentId, Error = ex.Message },
|
||||||
|
CancellationToken.None);
|
||||||
|
}
|
||||||
|
catch (Exception cleanupEx)
|
||||||
|
{
|
||||||
|
// The deployment already failed; a failed cleanup write must not
|
||||||
|
// mask the original error. Log loudly so an operator can reconcile.
|
||||||
|
_logger.LogError(cleanupEx,
|
||||||
|
"Failed to persist Failed status for deployment {DeploymentId} of instance {Instance} " +
|
||||||
|
"after deployment error: {Error}",
|
||||||
|
deploymentId, instance.UniqueName, ex.Message);
|
||||||
|
}
|
||||||
|
|
||||||
|
_logger.LogError(ex,
|
||||||
|
"Deployment {DeploymentId} for instance {Instance} failed",
|
||||||
|
deploymentId, instance.UniqueName);
|
||||||
|
|
||||||
|
return Result<DeploymentRecord>.Failure(
|
||||||
|
isTimeout
|
||||||
|
? $"Deployment timed out: {ex.Message}"
|
||||||
|
: $"Deployment failed: {ex.Message}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,9 +8,9 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -87,6 +87,64 @@ public class DatabaseGateway : IDatabaseGateway
|
|||||||
definition.RetryDelay > TimeSpan.Zero ? definition.RetryDelay : null);
|
definition.RetryDelay > TimeSpan.Zero ? definition.RetryDelay : null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// WP-9/10: Delivers a buffered CachedDbWrite during a store-and-forward retry
|
||||||
|
/// sweep — executes the SQL against the named connection. Returns true on
|
||||||
|
/// success, false if the connection no longer exists (the message is parked);
|
||||||
|
/// throws on any execution error so the engine retries.
|
||||||
|
/// </summary>
|
||||||
|
public async Task<bool> DeliverBufferedAsync(
|
||||||
|
StoreAndForwardMessage message, CancellationToken cancellationToken = default)
|
||||||
|
{
|
||||||
|
var payload = JsonSerializer.Deserialize<CachedWritePayload>(message.PayloadJson);
|
||||||
|
if (payload == null || string.IsNullOrEmpty(payload.ConnectionName) || string.IsNullOrEmpty(payload.Sql))
|
||||||
|
{
|
||||||
|
_logger.LogError("Buffered CachedDbWrite message {Id} has an unreadable payload; parking.", message.Id);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var definition = await ResolveConnectionAsync(payload.ConnectionName, cancellationToken);
|
||||||
|
if (definition == null)
|
||||||
|
{
|
||||||
|
_logger.LogError(
|
||||||
|
"Buffered DB write to '{Connection}' cannot be delivered — the connection no longer exists; parking.",
|
||||||
|
payload.ConnectionName);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
await using var connection = new SqlConnection(definition.ConnectionString);
|
||||||
|
await connection.OpenAsync(cancellationToken);
|
||||||
|
using var command = connection.CreateCommand();
|
||||||
|
command.CommandText = payload.Sql;
|
||||||
|
if (payload.Parameters != null)
|
||||||
|
{
|
||||||
|
foreach (var (key, value) in payload.Parameters)
|
||||||
|
{
|
||||||
|
var parameter = command.CreateParameter();
|
||||||
|
parameter.ParameterName = key.StartsWith('@') ? key : "@" + key;
|
||||||
|
parameter.Value = JsonElementToParameterValue(value);
|
||||||
|
command.Parameters.Add(parameter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
await command.ExecuteNonQueryAsync(cancellationToken);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static object JsonElementToParameterValue(JsonElement element) => element.ValueKind switch
|
||||||
|
{
|
||||||
|
JsonValueKind.String => (object?)element.GetString() ?? DBNull.Value,
|
||||||
|
JsonValueKind.Number => element.TryGetInt64(out var l) ? l : element.GetDouble(),
|
||||||
|
JsonValueKind.True => true,
|
||||||
|
JsonValueKind.False => false,
|
||||||
|
JsonValueKind.Null or JsonValueKind.Undefined => DBNull.Value,
|
||||||
|
_ => element.GetRawText()
|
||||||
|
};
|
||||||
|
|
||||||
|
private sealed record CachedWritePayload(
|
||||||
|
string ConnectionName,
|
||||||
|
string Sql,
|
||||||
|
Dictionary<string, JsonElement>? Parameters);
|
||||||
|
|
||||||
private async Task<DatabaseConnectionDefinition?> ResolveConnectionAsync(
|
private async Task<DatabaseConnectionDefinition?> ResolveConnectionAsync(
|
||||||
string connectionName,
|
string connectionName,
|
||||||
CancellationToken cancellationToken)
|
CancellationToken cancellationToken)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ using System.Net.Http.Headers;
|
|||||||
using System.Text;
|
using System.Text;
|
||||||
using System.Text.Json;
|
using System.Text.Json;
|
||||||
using Microsoft.Extensions.Logging;
|
using Microsoft.Extensions.Logging;
|
||||||
|
using Microsoft.Extensions.Options;
|
||||||
using ScadaLink.Commons.Entities.ExternalSystems;
|
using ScadaLink.Commons.Entities.ExternalSystems;
|
||||||
using ScadaLink.Commons.Interfaces.Repositories;
|
using ScadaLink.Commons.Interfaces.Repositories;
|
||||||
using ScadaLink.Commons.Interfaces.Services;
|
using ScadaLink.Commons.Interfaces.Services;
|
||||||
@@ -22,17 +23,20 @@ public class ExternalSystemClient : IExternalSystemClient
|
|||||||
private readonly IExternalSystemRepository _repository;
|
private readonly IExternalSystemRepository _repository;
|
||||||
private readonly StoreAndForwardService? _storeAndForward;
|
private readonly StoreAndForwardService? _storeAndForward;
|
||||||
private readonly ILogger<ExternalSystemClient> _logger;
|
private readonly ILogger<ExternalSystemClient> _logger;
|
||||||
|
private readonly ExternalSystemGatewayOptions _options;
|
||||||
|
|
||||||
public ExternalSystemClient(
|
public ExternalSystemClient(
|
||||||
IHttpClientFactory httpClientFactory,
|
IHttpClientFactory httpClientFactory,
|
||||||
IExternalSystemRepository repository,
|
IExternalSystemRepository repository,
|
||||||
ILogger<ExternalSystemClient> logger,
|
ILogger<ExternalSystemClient> logger,
|
||||||
StoreAndForwardService? storeAndForward = null)
|
StoreAndForwardService? storeAndForward = null,
|
||||||
|
IOptions<ExternalSystemGatewayOptions>? options = null)
|
||||||
{
|
{
|
||||||
_httpClientFactory = httpClientFactory;
|
_httpClientFactory = httpClientFactory;
|
||||||
_repository = repository;
|
_repository = repository;
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
_storeAndForward = storeAndForward;
|
_storeAndForward = storeAndForward;
|
||||||
|
_options = options?.Value ?? new ExternalSystemGatewayOptions();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -106,18 +110,67 @@ public class ExternalSystemClient : IExternalSystemClient
|
|||||||
Parameters = parameters
|
Parameters = parameters
|
||||||
});
|
});
|
||||||
|
|
||||||
var sfResult = await _storeAndForward.EnqueueAsync(
|
// attemptImmediateDelivery: false — this method already made the HTTP
|
||||||
|
// attempt above; letting EnqueueAsync re-invoke the handler would
|
||||||
|
// dispatch the same request a second time.
|
||||||
|
await _storeAndForward.EnqueueAsync(
|
||||||
StoreAndForwardCategory.ExternalSystem,
|
StoreAndForwardCategory.ExternalSystem,
|
||||||
systemName,
|
systemName,
|
||||||
payload,
|
payload,
|
||||||
originInstanceName,
|
originInstanceName,
|
||||||
system.MaxRetries > 0 ? system.MaxRetries : null,
|
system.MaxRetries > 0 ? system.MaxRetries : null,
|
||||||
system.RetryDelay > TimeSpan.Zero ? system.RetryDelay : null);
|
system.RetryDelay > TimeSpan.Zero ? system.RetryDelay : null,
|
||||||
|
attemptImmediateDelivery: false);
|
||||||
|
|
||||||
return new ExternalCallResult(true, null, null, WasBuffered: true);
|
return new ExternalCallResult(true, null, null, WasBuffered: true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// WP-7/10: Delivers a buffered ExternalSystem call during a store-and-forward
/// retry sweep.
/// </summary>
/// <param name="message">
/// The buffered message. Its <c>PayloadJson</c> is deserialized into a
/// <see cref="CachedCallPayload"/> (system name, method name, optional parameters).
/// </param>
/// <param name="cancellationToken">
/// Flowed to the system/method lookup and to the HTTP invocation.
/// </param>
/// <returns>
/// <c>true</c> on success; <c>false</c> on a permanent failure (the message is
/// parked): the payload is malformed or incomplete, the system or method no
/// longer resolves, or the call raised <see cref="PermanentExternalSystemException"/>.
/// </returns>
/// <exception cref="TransientExternalSystemException">
/// Propagated unchanged on a transient failure so the engine retries.
/// </exception>
public async Task<bool> DeliverBufferedAsync(
    StoreAndForwardMessage message, CancellationToken cancellationToken = default)
{
    CachedCallPayload? payload;
    try
    {
        payload = JsonSerializer.Deserialize<CachedCallPayload>(message.PayloadJson);
    }
    catch (JsonException ex)
    {
        // Malformed JSON will never deserialize on a later attempt, so it is a
        // permanent failure — handled the same way as a null/incomplete payload
        // rather than escaping as an unclassified exception.
        _logger.LogError(ex, "Buffered ExternalSystem message {Id} has an unreadable payload; parking.", message.Id);
        return false;
    }

    if (payload == null || string.IsNullOrEmpty(payload.SystemName) || string.IsNullOrEmpty(payload.MethodName))
    {
        _logger.LogError("Buffered ExternalSystem message {Id} has an unreadable payload; parking.", message.Id);
        return false;
    }

    // Re-resolve at delivery time: the definition may have changed or been
    // removed since the call was buffered.
    var (system, method) = await ResolveSystemAndMethodAsync(
        payload.SystemName, payload.MethodName, cancellationToken);
    if (system == null || method == null)
    {
        _logger.LogError(
            "Buffered call to '{System}'/'{Method}' cannot be delivered — the system or method no longer exists; parking.",
            payload.SystemName, payload.MethodName);
        return false;
    }

    // The payload carries raw JsonElement values; box them to the object-valued
    // dictionary InvokeHttpAsync is called with.
    var parameters = payload.Parameters?.ToDictionary(kv => kv.Key, kv => (object?)kv.Value);
    try
    {
        await InvokeHttpAsync(system, method, parameters, cancellationToken);
        return true;
    }
    catch (PermanentExternalSystemException ex)
    {
        _logger.LogError(ex, "Buffered call to '{System}' failed permanently; parking.", payload.SystemName);
        return false;
    }
    // TransientExternalSystemException propagates — the S&F engine retries.
}

/// <summary>
/// Shape the buffered message's JSON payload is deserialized into by
/// <see cref="DeliverBufferedAsync"/>.
/// </summary>
private sealed record CachedCallPayload(
    string SystemName,
    string MethodName,
    Dictionary<string, JsonElement>? Parameters);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// WP-6: Executes the HTTP request against the external system.
|
/// WP-6: Executes the HTTP request against the external system.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -149,22 +202,59 @@ public class ExternalSystemClient : IExternalSystemClient
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Enforce the per-call timeout. ExternalSystemDefinition has no per-system
|
||||||
|
// Timeout field yet, so the configured DefaultHttpTimeout is the effective
|
||||||
|
// round-trip limit (the design's "timeout applies to the HTTP request
|
||||||
|
// round-trip" guarantee). A linked CTS lets us distinguish a timeout from a
|
||||||
|
// caller-initiated cancellation: only the timeout is reclassified as transient.
|
||||||
|
using var timeoutCts = new CancellationTokenSource(_options.DefaultHttpTimeout);
|
||||||
|
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
|
||||||
|
cancellationToken, timeoutCts.Token);
|
||||||
|
|
||||||
HttpResponseMessage response;
|
HttpResponseMessage response;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
response = await client.SendAsync(request, cancellationToken);
|
response = await client.SendAsync(request, linkedCts.Token);
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
// The caller asked to abandon the work — do not reclassify as transient.
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
// Our own timeout elapsed — a transient failure per the design.
|
||||||
|
throw ErrorClassifier.AsTransient(
|
||||||
|
$"Timeout calling {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
|
||||||
}
|
}
|
||||||
catch (Exception ex) when (ErrorClassifier.IsTransient(ex))
|
catch (Exception ex) when (ErrorClassifier.IsTransient(ex))
|
||||||
{
|
{
|
||||||
throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex);
|
throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (response.IsSuccessStatusCode)
|
// The timeout also covers reading the response body (the design's
|
||||||
|
// "round-trip" guarantee), so the linked token is used for the read too.
|
||||||
|
string body;
|
||||||
|
try
|
||||||
{
|
{
|
||||||
return await response.Content.ReadAsStringAsync(cancellationToken);
|
body = await response.Content.ReadAsStringAsync(linkedCts.Token);
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
throw ErrorClassifier.AsTransient(
|
||||||
|
$"Timeout reading response from {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
var errorBody = await response.Content.ReadAsStringAsync(cancellationToken);
|
if (response.IsSuccessStatusCode)
|
||||||
|
{
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
|
||||||
|
var errorBody = body;
|
||||||
|
|
||||||
if (ErrorClassifier.IsTransient(response.StatusCode))
|
if (ErrorClassifier.IsTransient(response.StatusCode))
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -8,11 +8,11 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.Data.SqlClient" Version="6.0.2" />
|
<PackageReference Include="Microsoft.Data.SqlClient" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Http" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -33,16 +33,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
|||||||
/// Only replaces stored state if incoming sequence number is greater than last received.
|
/// Only replaces stored state if incoming sequence number is greater than last received.
|
||||||
/// Auto-marks previously offline sites as online.
|
/// Auto-marks previously offline sites as online.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// <see cref="SiteHealthState"/> is immutable: each transition produces a brand-new
|
||||||
|
/// instance, and the dictionary entry is replaced atomically. The mutation is
|
||||||
|
/// performed in a compare-and-swap retry loop rather than via the
|
||||||
|
/// <c>AddOrUpdate</c> update delegate so the sequence-number guard and the field
|
||||||
|
/// writes are evaluated as a single atomic step against the value actually
|
||||||
|
/// installed — the <c>AddOrUpdate</c> delegate may be invoked more than once
|
||||||
|
/// under contention and could otherwise act on a value that is then discarded.
|
||||||
|
/// </remarks>
|
||||||
public void ProcessReport(SiteHealthReport report)
|
public void ProcessReport(SiteHealthReport report)
|
||||||
{
|
{
|
||||||
var now = _timeProvider.GetUtcNow();
|
var now = _timeProvider.GetUtcNow();
|
||||||
|
|
||||||
_siteStates.AddOrUpdate(
|
while (true)
|
||||||
report.SiteId,
|
{
|
||||||
_ =>
|
if (!_siteStates.TryGetValue(report.SiteId, out var existing))
|
||||||
{
|
{
|
||||||
_logger.LogInformation("Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
|
var registered = new SiteHealthState
|
||||||
return new SiteHealthState
|
|
||||||
{
|
{
|
||||||
SiteId = report.SiteId,
|
SiteId = report.SiteId,
|
||||||
LatestReport = report,
|
LatestReport = report,
|
||||||
@@ -51,50 +59,84 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
|||||||
LastSequenceNumber = report.SequenceNumber,
|
LastSequenceNumber = report.SequenceNumber,
|
||||||
IsOnline = true
|
IsOnline = true
|
||||||
};
|
};
|
||||||
},
|
|
||||||
(_, existing) =>
|
if (_siteStates.TryAdd(report.SiteId, registered))
|
||||||
|
{
|
||||||
|
_logger.LogInformation(
|
||||||
|
"Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lost the race — another thread registered first; retry as an update.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (report.SequenceNumber <= existing.LastSequenceNumber)
|
||||||
{
|
{
|
||||||
if (report.SequenceNumber <= existing.LastSequenceNumber)
|
_logger.LogDebug(
|
||||||
|
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
|
||||||
|
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
var updated = existing with
|
||||||
|
{
|
||||||
|
LatestReport = report,
|
||||||
|
LastReportReceivedAt = now,
|
||||||
|
LastHeartbeatAt = now,
|
||||||
|
LastSequenceNumber = report.SequenceNumber,
|
||||||
|
IsOnline = true
|
||||||
|
};
|
||||||
|
|
||||||
|
if (_siteStates.TryUpdate(report.SiteId, updated, existing))
|
||||||
|
{
|
||||||
|
if (!existing.IsOnline)
|
||||||
{
|
{
|
||||||
_logger.LogDebug(
|
_logger.LogInformation(
|
||||||
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
|
"Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
|
||||||
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
|
|
||||||
return existing;
|
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
var wasOffline = !existing.IsOnline;
|
// CAS lost — the entry changed under us; retry with the fresh value.
|
||||||
existing.LatestReport = report;
|
}
|
||||||
existing.LastReportReceivedAt = now;
|
|
||||||
existing.LastHeartbeatAt = now;
|
|
||||||
existing.LastSequenceNumber = report.SequenceNumber;
|
|
||||||
existing.IsOnline = true;
|
|
||||||
|
|
||||||
if (wasOffline)
|
|
||||||
{
|
|
||||||
_logger.LogInformation("Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
|
|
||||||
}
|
|
||||||
|
|
||||||
return existing;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Bumps the last-seen timestamp for a site already known via a prior
|
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||||
/// SiteHealthReport. Heartbeats from sites we have not yet received a
|
/// SiteHealthReport. Heartbeats from sites we have not yet received a
|
||||||
/// full report from are ignored — registration only happens on report.
|
/// full report from are ignored — registration only happens on report.
|
||||||
|
/// The update is a compare-and-swap of the immutable state, retried in a loop until the swap succeeds, nothing needs changing, or the site is not present. A heartbeat that is not newer than the stored timestamp still flips an offline site back to online; it just does not move the timestamp.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
|
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
|
||||||
{
|
{
|
||||||
if (!_siteStates.TryGetValue(siteId, out var state))
|
while (true)
|
||||||
return;
|
|
||||||
|
|
||||||
if (receivedAt > state.LastHeartbeatAt)
|
|
||||||
state.LastHeartbeatAt = receivedAt;
|
|
||||||
|
|
||||||
if (!state.IsOnline)
|
|
||||||
{
|
{
|
||||||
state.IsOnline = true;
|
if (!_siteStates.TryGetValue(siteId, out var existing))
|
||||||
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
|
return;
|
||||||
|
|
||||||
|
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
|
||||||
|
? receivedAt
|
||||||
|
: existing.LastHeartbeatAt;
|
||||||
|
|
||||||
|
// Nothing to change (timestamp would not advance and the site is already online) — avoid a needless swap.
|
||||||
|
if (newHeartbeat == existing.LastHeartbeatAt && existing.IsOnline)
|
||||||
|
return;
|
||||||
|
|
||||||
|
var updated = existing with
|
||||||
|
{
|
||||||
|
LastHeartbeatAt = newHeartbeat,
|
||||||
|
IsOnline = true
|
||||||
|
};
|
||||||
|
|
||||||
|
if (_siteStates.TryUpdate(siteId, updated, existing))
|
||||||
|
{
|
||||||
|
if (!existing.IsOnline)
|
||||||
|
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// CAS lost — the stored entry no longer matches the value read at the top of the loop; go round again and retry with the fresh value.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -143,13 +185,20 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
|||||||
var state = kvp.Value;
|
var state = kvp.Value;
|
||||||
if (!state.IsOnline) continue;
|
if (!state.IsOnline) continue;
|
||||||
|
|
||||||
// Use LastHeartbeatAt — heartbeats arrive every ~5s from any
|
// Use LastHeartbeatAt — heartbeats arrive frequently from any
|
||||||
// healthy site node, so OfflineTimeout only fires when no node
|
// healthy site node (cadence owned by Cluster Infrastructure /
|
||||||
// can reach central, not during single-node failovers.
|
// SiteCommunicationActor), so OfflineTimeout only fires when no
|
||||||
|
// node can reach central, not during single-node failovers.
|
||||||
var elapsed = now - state.LastHeartbeatAt;
|
var elapsed = now - state.LastHeartbeatAt;
|
||||||
if (elapsed > _options.OfflineTimeout)
|
if (elapsed <= _options.OfflineTimeout)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Atomically swap to an offline copy. If the CAS loses to a
|
||||||
|
// concurrent report/heartbeat the site was just heard from, so
|
||||||
|
// leaving it online is the correct outcome — no retry needed.
|
||||||
|
var offline = state with { IsOnline = false };
|
||||||
|
if (_siteStates.TryUpdate(kvp.Key, offline, state))
|
||||||
{
|
{
|
||||||
state.IsOnline = false;
|
|
||||||
_logger.LogWarning(
|
_logger.LogWarning(
|
||||||
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
|
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
|
||||||
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
|
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
|
||||||
|
|||||||
@@ -84,6 +84,20 @@ public class HealthReportSender : BackgroundService
|
|||||||
_collector.SetParkedMessageCount(parkedCount);
|
_collector.SetParkedMessageCount(parkedCount);
|
||||||
}
|
}
|
||||||
catch { /* Non-fatal — parked count will be 0 */ }
|
catch { /* Non-fatal — parked count will be 0 */ }
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
// Per-category pending-message buffer depths (the documented
|
||||||
|
// "store-and-forward buffer depth" triage metric). Keyed by
|
||||||
|
// StoreAndForwardCategory name so the central dashboard can
|
||||||
|
// render external/notification/DB-write depths separately.
|
||||||
|
var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync();
|
||||||
|
var depths = depthsByCategory.ToDictionary(
|
||||||
|
kvp => kvp.Key.ToString(),
|
||||||
|
kvp => kvp.Value);
|
||||||
|
_collector.SetStoreAndForwardDepths(depths);
|
||||||
|
}
|
||||||
|
catch { /* Non-fatal — buffer depths will be empty */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
var seq = Interlocked.Increment(ref _sequenceNumber);
|
var seq = Interlocked.Increment(ref _sequenceNumber);
|
||||||
|
|||||||
@@ -8,10 +8,10 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -4,26 +4,37 @@ namespace ScadaLink.HealthMonitoring;
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// In-memory state for a single site's health, stored by the central aggregator.
|
/// In-memory state for a single site's health, stored by the central aggregator.
|
||||||
|
/// Immutable: every state transition produces a new instance which the aggregator
|
||||||
|
/// installs into its <c>ConcurrentDictionary</c> via an atomic compare-and-swap.
|
||||||
|
/// This makes handing the reference straight to UI callers safe — a consumer can
|
||||||
|
/// never observe a torn or half-applied update.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class SiteHealthState
|
public sealed record SiteHealthState
|
||||||
{
|
{
|
||||||
public required string SiteId { get; init; }
|
public required string SiteId { get; init; }
|
||||||
public SiteHealthReport LatestReport { get; set; } = null!;
|
|
||||||
|
/// <summary>
|
||||||
|
/// The latest full <see cref="SiteHealthReport"/> received for the site, or
|
||||||
|
/// <c>null</c> if no report has been recorded for it.
|
||||||
|
/// NOTE(review): the aggregator's MarkHeartbeat ignores sites without a prior report and registration stores the report, so a heartbeat-only (null) state is not produced there — confirm whether any other producer leaves this null.
|
||||||
|
/// </summary>
|
||||||
|
public SiteHealthReport? LatestReport { get; init; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Time the latest full <see cref="SiteHealthReport"/> was processed.
|
/// Time the latest full <see cref="SiteHealthReport"/> was processed.
|
||||||
/// Used by the UI to surface report staleness during failover.
|
/// Used by the UI to surface report staleness during failover.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public DateTimeOffset LastReportReceivedAt { get; set; }
|
public DateTimeOffset LastReportReceivedAt { get; init; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Time the most recent signal of any kind (full report OR ~5s heartbeat)
|
/// Time the most recent signal of any kind (full report OR heartbeat) was
|
||||||
/// was received. Drives offline detection — heartbeats from the standby
|
/// received. Drives offline detection — heartbeats from the standby keep the
|
||||||
/// keep the site marked online even when the active node is unable to
|
/// site marked online even when the active node is unable to produce a report
|
||||||
/// produce a report (mid-failover, brief stalls).
|
/// (mid-failover, brief stalls). See the heartbeat scheduler owned by the
|
||||||
|
/// Cluster Infrastructure / SiteCommunicationActor for the actual cadence.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public DateTimeOffset LastHeartbeatAt { get; set; }
|
public DateTimeOffset LastHeartbeatAt { get; init; }
|
||||||
|
|
||||||
public long LastSequenceNumber { get; set; }
|
/// <summary>Sequence number of the most recent report accepted for the site; the aggregator rejects a report whose sequence number is not greater than this.</summary>
public long LastSequenceNumber { get; init; }
|
||||||
public bool IsOnline { get; set; }
|
/// <summary>Whether the site is currently considered online: set to <c>true</c> by an accepted report or a heartbeat, and to <c>false</c> by the aggregator's offline check once no signal has arrived within the configured OfflineTimeout.</summary>
public bool IsOnline { get; init; }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -344,6 +344,42 @@ akka {{
|
|||||||
// any actor or HTTP handler touches the service.
|
// any actor or HTTP handler touches the service.
|
||||||
storeAndForwardService.StartAsync().GetAwaiter().GetResult();
|
storeAndForwardService.StartAsync().GetAwaiter().GetResult();
|
||||||
|
|
||||||
|
// Register the store-and-forward delivery handlers so buffered
|
||||||
|
// ExternalSystem calls, cached DB writes and notifications are actually
|
||||||
|
// delivered by the retry sweep. Without this, every buffered message is
|
||||||
|
// persisted but never delivered. Each handler resolves its scoped consumer
|
||||||
|
// service in a fresh DI scope — the sweep runs on a timer thread, outside
|
||||||
|
// any request scope.
|
||||||
|
storeAndForwardService.RegisterDeliveryHandler(
|
||||||
|
ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem,
|
||||||
|
async msg =>
|
||||||
|
{
|
||||||
|
using var scope = _serviceProvider.CreateScope();
|
||||||
|
return await scope.ServiceProvider
|
||||||
|
.GetRequiredService<ScadaLink.ExternalSystemGateway.ExternalSystemClient>()
|
||||||
|
.DeliverBufferedAsync(msg);
|
||||||
|
});
|
||||||
|
storeAndForwardService.RegisterDeliveryHandler(
|
||||||
|
ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite,
|
||||||
|
async msg =>
|
||||||
|
{
|
||||||
|
using var scope = _serviceProvider.CreateScope();
|
||||||
|
return await scope.ServiceProvider
|
||||||
|
.GetRequiredService<ScadaLink.ExternalSystemGateway.DatabaseGateway>()
|
||||||
|
.DeliverBufferedAsync(msg);
|
||||||
|
});
|
||||||
|
storeAndForwardService.RegisterDeliveryHandler(
|
||||||
|
ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.Notification,
|
||||||
|
async msg =>
|
||||||
|
{
|
||||||
|
using var scope = _serviceProvider.CreateScope();
|
||||||
|
return await scope.ServiceProvider
|
||||||
|
.GetRequiredService<ScadaLink.NotificationService.NotificationDeliveryService>()
|
||||||
|
.DeliverBufferedAsync(msg);
|
||||||
|
});
|
||||||
|
_logger.LogInformation(
|
||||||
|
"Store-and-forward delivery handlers registered (ExternalSystem, CachedDbWrite, Notification)");
|
||||||
|
|
||||||
var parkedMessageHandler = _actorSystem.ActorOf(
|
var parkedMessageHandler = _actorSystem.ActorOf(
|
||||||
Props.Create(() => new ParkedMessageHandlerActor(
|
Props.Create(() => new ParkedMessageHandlerActor(
|
||||||
storeAndForwardService, _nodeOptions.SiteId!)),
|
storeAndForwardService, _nodeOptions.SiteId!)),
|
||||||
|
|||||||
@@ -131,9 +131,14 @@ try
|
|||||||
app.UseAuthorization();
|
app.UseAuthorization();
|
||||||
app.UseAntiforgery();
|
app.UseAntiforgery();
|
||||||
|
|
||||||
// WP-12: Map readiness endpoint — returns 503 until all checks pass, 200 when ready
|
// WP-12: Map readiness endpoint — returns 503 until ready, 200 when ready.
|
||||||
|
// REQ-HOST-4a defines readiness as cluster membership + DB connectivity,
|
||||||
|
// explicitly NOT cluster leadership. The leader-only "active-node" check is
|
||||||
|
// excluded here so a fully operational standby central node reports ready;
|
||||||
|
// leadership is reported separately on /health/active.
|
||||||
app.MapHealthChecks("/health/ready", new HealthCheckOptions
|
app.MapHealthChecks("/health/ready", new HealthCheckOptions
|
||||||
{
|
{
|
||||||
|
Predicate = check => check.Name != "active-node",
|
||||||
ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
|
ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -8,23 +8,23 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka.Cluster.Hosting" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster.Hosting" />
|
||||||
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster.Tools" />
|
||||||
<PackageReference Include="Akka.Hosting" Version="1.5.62" />
|
<PackageReference Include="Akka.Hosting" />
|
||||||
<PackageReference Include="Akka.Remote.Hosting" Version="1.5.62" />
|
<PackageReference Include="Akka.Remote.Hosting" />
|
||||||
<PackageReference Include="AspNetCore.HealthChecks.UI.Client" Version="9.0.0" />
|
<PackageReference Include="AspNetCore.HealthChecks.UI.Client" />
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.7">
|
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
|
||||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||||
<PrivateAssets>all</PrivateAssets>
|
<PrivateAssets>all</PrivateAssets>
|
||||||
</PackageReference>
|
</PackageReference>
|
||||||
<PackageReference Include="Grpc.AspNetCore" Version="2.71.0" />
|
<PackageReference Include="Grpc.AspNetCore" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Hosting.WindowsServices" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Hosting.WindowsServices" />
|
||||||
<PackageReference Include="Serilog.AspNetCore" Version="10.0.0" />
|
<PackageReference Include="Serilog.AspNetCore" />
|
||||||
<PackageReference Include="Serilog.Sinks.Console" Version="6.1.1" />
|
<PackageReference Include="Serilog.Sinks.Console" />
|
||||||
<PackageReference Include="Serilog.Sinks.File" Version="7.0.0" />
|
<PackageReference Include="Serilog.Sinks.File" />
|
||||||
<!-- Transitive override: Akka.Hosting 1.5.62 pins OpenTelemetry.Api 1.9.0 which is flagged
|
<!-- Transitive override: Akka.Hosting 1.5.62 pins OpenTelemetry.Api 1.9.0 which is flagged
|
||||||
(GHSA-g94r-2vxg-569j, GHSA-8785-wc3w-h8q6). Bumping directly clears both advisories. -->
|
(GHSA-g94r-2vxg-569j, GHSA-8785-wc3w-h8q6). Bumping directly clears both advisories. -->
|
||||||
<PackageReference Include="OpenTelemetry.Api" Version="1.15.3" />
|
<PackageReference Include="OpenTelemetry.Api" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -21,7 +21,7 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="5.0.0" />
|
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
@@ -9,8 +9,8 @@
|
|||||||
<FrameworkReference Include="Microsoft.AspNetCore.App" />
|
<FrameworkReference Include="Microsoft.AspNetCore.App" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka" Version="1.5.62" />
|
<PackageReference Include="Akka" />
|
||||||
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster.Tools" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
||||||
|
|||||||
@@ -93,18 +93,75 @@ public class NotificationDeliveryService : INotificationDeliveryService
|
|||||||
Message = message
|
Message = message
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// attemptImmediateDelivery: false — DeliverAsync was already attempted
|
||||||
|
// above; letting EnqueueAsync re-invoke the handler would send twice.
|
||||||
await _storeAndForward.EnqueueAsync(
|
await _storeAndForward.EnqueueAsync(
|
||||||
StoreAndForwardCategory.Notification,
|
StoreAndForwardCategory.Notification,
|
||||||
listName,
|
listName,
|
||||||
payload,
|
payload,
|
||||||
originInstanceName,
|
originInstanceName,
|
||||||
smtpConfig.MaxRetries > 0 ? smtpConfig.MaxRetries : null,
|
smtpConfig.MaxRetries > 0 ? smtpConfig.MaxRetries : null,
|
||||||
smtpConfig.RetryDelay > TimeSpan.Zero ? smtpConfig.RetryDelay : null);
|
smtpConfig.RetryDelay > TimeSpan.Zero ? smtpConfig.RetryDelay : null,
|
||||||
|
attemptImmediateDelivery: false);
|
||||||
|
|
||||||
return new NotificationResult(true, null, WasBuffered: true);
|
return new NotificationResult(true, null, WasBuffered: true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// WP-11/12: Delivers a buffered notification during a store-and-forward retry
/// sweep — re-resolves the list, recipients and SMTP config and re-attempts
/// delivery.
/// </summary>
/// <param name="message">
/// The buffered message. Its <c>PayloadJson</c> is deserialized into a
/// <see cref="BufferedNotification"/> (list name, subject, message body).
/// </param>
/// <param name="cancellationToken">
/// Flowed to the repository lookups and to the delivery attempt.
/// </param>
/// <returns>
/// <c>true</c> on success; <c>false</c> on a permanent failure (the message is
/// parked): the payload is malformed or has no list name, the list no longer
/// exists, it has no recipients, no SMTP configuration is available, or delivery
/// raised <see cref="SmtpPermanentException"/>.
/// </returns>
/// <remarks>
/// Any other exception from the delivery attempt propagates so the engine retries.
/// </remarks>
public async Task<bool> DeliverBufferedAsync(
    StoreAndForwardMessage message, CancellationToken cancellationToken = default)
{
    BufferedNotification? payload;
    try
    {
        payload = JsonSerializer.Deserialize<BufferedNotification>(message.PayloadJson);
    }
    catch (JsonException ex)
    {
        // Malformed JSON will never deserialize on a later attempt, so it is a
        // permanent failure — handled the same way as a null/incomplete payload
        // rather than escaping as an unclassified exception.
        _logger.LogError(ex, "Buffered notification message {Id} has an unreadable payload; parking.", message.Id);
        return false;
    }

    if (payload == null || string.IsNullOrEmpty(payload.ListName))
    {
        _logger.LogError("Buffered notification message {Id} has an unreadable payload; parking.", message.Id);
        return false;
    }

    // Everything below is re-resolved at delivery time: the list, its recipients
    // and the SMTP configuration may have changed since the message was buffered.
    var list = await _repository.GetListByNameAsync(payload.ListName, cancellationToken);
    if (list == null)
    {
        _logger.LogError(
            "Buffered notification to list '{List}' cannot be delivered — the list no longer exists; parking.",
            payload.ListName);
        return false;
    }

    var recipients = await _repository.GetRecipientsByListIdAsync(list.Id, cancellationToken);
    if (recipients.Count == 0)
    {
        _logger.LogError("Buffered notification to list '{List}' has no recipients; parking.", payload.ListName);
        return false;
    }

    var smtpConfig = (await _repository.GetAllSmtpConfigurationsAsync(cancellationToken)).FirstOrDefault();
    if (smtpConfig == null)
    {
        _logger.LogError("Buffered notification cannot be delivered — no SMTP configuration available; parking.");
        return false;
    }

    try
    {
        await DeliverAsync(smtpConfig, recipients, payload.Subject, payload.Message, cancellationToken);
        return true;
    }
    catch (SmtpPermanentException ex)
    {
        _logger.LogError(ex, "Buffered notification to list '{List}' failed permanently; parking.", payload.ListName);
        return false;
    }
    // Transient SMTP errors propagate out of DeliverAsync — the S&F engine retries.
}

/// <summary>
/// Shape the buffered message's JSON payload is deserialized into by
/// <see cref="DeliverBufferedAsync"/>.
/// </summary>
private sealed record BufferedNotification(string ListName, string Subject, string Message);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Delivers an email via SMTP. Throws on failure.
|
/// Delivers an email via SMTP. Throws on failure.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -8,11 +8,11 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="MailKit" Version="4.16.0" />
|
<PackageReference Include="MailKit" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Http" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -8,12 +8,12 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
<PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.7" />
|
<PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" />
|
||||||
<PackageReference Include="Microsoft.AspNetCore.Authorization" Version="10.0.7" />
|
<PackageReference Include="Microsoft.AspNetCore.Authorization" />
|
||||||
<PackageReference Include="System.IdentityModel.Tokens.Jwt" Version="8.11.0" />
|
<PackageReference Include="System.IdentityModel.Tokens.Jwt" />
|
||||||
<PackageReference Include="Novell.Directory.Ldap.NETStandard" Version="3.6.0" />
|
<PackageReference Include="Novell.Directory.Ldap.NETStandard" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -8,12 +8,12 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka" Version="1.5.62" />
|
<PackageReference Include="Akka" />
|
||||||
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Data.Sqlite" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -50,6 +50,12 @@ public class AlarmActor : ReceiveActor
|
|||||||
private readonly string? _onTriggerScriptName;
|
private readonly string? _onTriggerScriptName;
|
||||||
private readonly Script<object?>? _onTriggerCompiledScript;
|
private readonly Script<object?>? _onTriggerCompiledScript;
|
||||||
|
|
||||||
|
// Expression trigger: compiled expression + the attribute snapshot it
|
||||||
|
// evaluates against. This field is the single home for the compiled
|
||||||
|
// expression on the hot path.
|
||||||
|
private readonly Script<object?>? _compiledTriggerExpression;
|
||||||
|
private readonly Dictionary<string, object?> _attributeSnapshot = new();
|
||||||
|
|
||||||
// Rate of change tracking
|
// Rate of change tracking
|
||||||
private readonly Queue<(DateTimeOffset Timestamp, double Value)> _rateOfChangeWindow = new();
|
private readonly Queue<(DateTimeOffset Timestamp, double Value)> _rateOfChangeWindow = new();
|
||||||
private readonly TimeSpan _rateOfChangeWindowDuration;
|
private readonly TimeSpan _rateOfChangeWindowDuration;
|
||||||
@@ -65,6 +71,8 @@ public class AlarmActor : ReceiveActor
|
|||||||
SharedScriptLibrary sharedScriptLibrary,
|
SharedScriptLibrary sharedScriptLibrary,
|
||||||
SiteRuntimeOptions options,
|
SiteRuntimeOptions options,
|
||||||
ILogger logger,
|
ILogger logger,
|
||||||
|
Script<object?>? compiledTriggerExpression = null,
|
||||||
|
IReadOnlyDictionary<string, object?>? initialAttributes = null,
|
||||||
ISiteHealthCollector? healthCollector = null)
|
ISiteHealthCollector? healthCollector = null)
|
||||||
{
|
{
|
||||||
_alarmName = alarmName;
|
_alarmName = alarmName;
|
||||||
@@ -77,6 +85,16 @@ public class AlarmActor : ReceiveActor
|
|||||||
_priority = alarmConfig.PriorityLevel;
|
_priority = alarmConfig.PriorityLevel;
|
||||||
_onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName;
|
_onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName;
|
||||||
_onTriggerCompiledScript = onTriggerCompiledScript;
|
_onTriggerCompiledScript = onTriggerCompiledScript;
|
||||||
|
_compiledTriggerExpression = compiledTriggerExpression;
|
||||||
|
|
||||||
|
// Seed the trigger-expression attribute snapshot from the instance's
|
||||||
|
// initial attribute set so static attributes (which never re-emit an
|
||||||
|
// AttributeValueChanged after deploy) evaluate correctly at startup.
|
||||||
|
if (initialAttributes != null)
|
||||||
|
{
|
||||||
|
foreach (var kvp in initialAttributes)
|
||||||
|
_attributeSnapshot[kvp.Key] = kvp.Value;
|
||||||
|
}
|
||||||
|
|
||||||
// Parse trigger type
|
// Parse trigger type
|
||||||
_triggerType = Enum.TryParse<AlarmTriggerType>(alarmConfig.TriggerType, true, out var tt)
|
_triggerType = Enum.TryParse<AlarmTriggerType>(alarmConfig.TriggerType, true, out var tt)
|
||||||
@@ -126,9 +144,18 @@ public class AlarmActor : ReceiveActor
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
private void HandleAttributeValueChanged(AttributeValueChanged changed)
|
private void HandleAttributeValueChanged(AttributeValueChanged changed)
|
||||||
{
|
{
|
||||||
// Only evaluate if this change is for an attribute we're monitoring
|
// Expression triggers evaluate against a snapshot of every attribute,
|
||||||
if (!IsMonitoredAttribute(changed.AttributeName))
|
// not a single monitored attribute. Keep the snapshot current for every
|
||||||
|
// change before the IsMonitoredAttribute gate (which does not apply).
|
||||||
|
if (_triggerType == AlarmTriggerType.Expression)
|
||||||
|
{
|
||||||
|
_attributeSnapshot[changed.AttributeName] = changed.Value;
|
||||||
|
}
|
||||||
|
else if (!IsMonitoredAttribute(changed.AttributeName))
|
||||||
|
{
|
||||||
|
// Only evaluate if this change is for an attribute we're monitoring
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@@ -143,6 +170,7 @@ public class AlarmActor : ReceiveActor
|
|||||||
AlarmTriggerType.ValueMatch => EvaluateValueMatch(changed.Value),
|
AlarmTriggerType.ValueMatch => EvaluateValueMatch(changed.Value),
|
||||||
AlarmTriggerType.RangeViolation => EvaluateRangeViolation(changed.Value),
|
AlarmTriggerType.RangeViolation => EvaluateRangeViolation(changed.Value),
|
||||||
AlarmTriggerType.RateOfChange => EvaluateRateOfChange(changed.Value, changed.Timestamp),
|
AlarmTriggerType.RateOfChange => EvaluateRateOfChange(changed.Value, changed.Timestamp),
|
||||||
|
AlarmTriggerType.Expression => EvaluateExpression(),
|
||||||
_ => false
|
_ => false
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -337,6 +365,44 @@ public class AlarmActor : ReceiveActor
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the compiled trigger expression over the current attribute snapshot
/// and reports whether the alarm condition currently holds. The returned bool
/// feeds the existing binary Normal↔Active state path: the alarm is active
/// while the expression yields <c>true</c>. Every other outcome — no compiled
/// expression, a non-bool return value, a thrown exception, or a timeout —
/// yields <c>false</c>, so the state machine still runs and an Active alarm
/// clears if its expression starts failing. Only exceptions (including the
/// timeout's cancellation) are counted and logged as alarm errors; a non-bool
/// result is simply treated as false.
/// </summary>
/// <returns><c>true</c> only when the expression evaluated to a bool <c>true</c>.</returns>
private bool EvaluateExpression()
{
    var expression = _compiledTriggerExpression;
    if (expression == null)
    {
        return false;
    }

    try
    {
        var globals = new TriggerExpressionGlobals(_attributeSnapshot);

        // The evaluation is bounded by a short timeout. Cancellation only
        // interrupts cooperative/async work, so a pathological CPU-bound
        // expression cannot be fully interrupted. That is accepted because
        // trigger expressions are authored by trusted Design-role users and
        // are compile-checked before deployment.
        using var timeout = new CancellationTokenSource(TimeSpan.FromSeconds(2));

        var evaluation = expression.RunAsync(globals, cancellationToken: timeout.Token);
        var outcome = evaluation.GetAwaiter().GetResult().ReturnValue;

        // Anything other than a boxed bool true counts as "not triggered".
        return outcome is true;
    }
    catch (Exception ex)
    {
        // A timeout surfaces as OperationCanceledException and lands here
        // as well; it is deliberately handled like any other failure.
        _healthCollector?.IncrementAlarmError();
        _logger.LogError(
            ex,
            "Alarm {Alarm} trigger expression evaluation failed on {Instance}; treated as false",
            _alarmName,
            _instanceName);
        return false;
    }
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// HiLo level evaluator: returns the most-severe matching band for the
|
/// HiLo level evaluator: returns the most-severe matching band for the
|
||||||
/// given value. Severity order checked from highest to lowest so that a
|
/// given value. Severity order checked from highest to lowest so that a
|
||||||
@@ -473,6 +539,14 @@ public class AlarmActor : ReceiveActor
|
|||||||
HiMessage: TryReadString(root, "hiMessage"),
|
HiMessage: TryReadString(root, "hiMessage"),
|
||||||
HiHiMessage: TryReadString(root, "hiHiMessage")),
|
HiHiMessage: TryReadString(root, "hiHiMessage")),
|
||||||
|
|
||||||
|
// Expression triggers have no single monitored attribute; they
|
||||||
|
// evaluate the compiled expression (passed into the actor and
|
||||||
|
// cached in _compiledTriggerExpression) over the full attribute
|
||||||
|
// snapshot. MonitoredAttributeName is unused.
|
||||||
|
AlarmTriggerType.Expression => new ExpressionEvalConfig(
|
||||||
|
"",
|
||||||
|
TriggerExpressionGlobals.ExtractExpression(triggerConfigJson) ?? ""),
|
||||||
|
|
||||||
_ => new ValueMatchEvalConfig(attr, null)
|
_ => new ValueMatchEvalConfig(attr, null)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -535,6 +609,17 @@ internal record RateOfChangeEvalConfig(
|
|||||||
TimeSpan WindowDuration,
|
TimeSpan WindowDuration,
|
||||||
RateOfChangeDirection Direction) : AlarmEvalConfig(MonitoredAttributeName);
|
RateOfChangeDirection Direction) : AlarmEvalConfig(MonitoredAttributeName);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Expression evaluation config: a read-only boolean C# expression evaluated
|
||||||
|
/// over the full attribute snapshot. Has no single monitored attribute
|
||||||
|
/// (<see cref="AlarmEvalConfig.MonitoredAttributeName"/> is empty). The
|
||||||
|
/// compiled expression itself lives on the actor's <c>_compiledTriggerExpression</c>
|
||||||
|
/// field, the single source for the hot path.
|
||||||
|
/// </summary>
|
||||||
|
internal record ExpressionEvalConfig(
|
||||||
|
string MonitoredAttributeName,
|
||||||
|
string Expression) : AlarmEvalConfig(MonitoredAttributeName);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// HiLo evaluation config: any subset of the four setpoints may be set; null
|
/// HiLo evaluation config: any subset of the four setpoints may be set; null
|
||||||
/// means "don't evaluate that band". Per-setpoint priorities override the
|
/// means "don't evaluate that band". Per-setpoint priorities override the
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
using Akka.Actor;
|
using Akka.Actor;
|
||||||
|
using Microsoft.CodeAnalysis.Scripting;
|
||||||
using Microsoft.Extensions.Logging;
|
using Microsoft.Extensions.Logging;
|
||||||
using ScadaLink.Commons.Messages.DataConnection;
|
using ScadaLink.Commons.Messages.DataConnection;
|
||||||
using ScadaLink.Commons.Messages.DebugView;
|
using ScadaLink.Commons.Messages.DebugView;
|
||||||
@@ -515,6 +516,10 @@ public class InstanceActor : ReceiveActor
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compile the trigger expression for Expression-triggered scripts.
|
||||||
|
var triggerExpression = CompileTriggerExpression(
|
||||||
|
script.TriggerType, script.TriggerConfiguration, $"script-trigger-{script.CanonicalName}");
|
||||||
|
|
||||||
var props = Props.Create(() => new ScriptActor(
|
var props = Props.Create(() => new ScriptActor(
|
||||||
script.CanonicalName,
|
script.CanonicalName,
|
||||||
_instanceUniqueName,
|
_instanceUniqueName,
|
||||||
@@ -524,6 +529,8 @@ public class InstanceActor : ReceiveActor
|
|||||||
_sharedScriptLibrary,
|
_sharedScriptLibrary,
|
||||||
_options,
|
_options,
|
||||||
_logger,
|
_logger,
|
||||||
|
triggerExpression,
|
||||||
|
_attributes,
|
||||||
_healthCollector,
|
_healthCollector,
|
||||||
_serviceProvider));
|
_serviceProvider));
|
||||||
|
|
||||||
@@ -534,7 +541,7 @@ public class InstanceActor : ReceiveActor
|
|||||||
// Create Alarm Actors
|
// Create Alarm Actors
|
||||||
foreach (var alarm in _configuration.Alarms)
|
foreach (var alarm in _configuration.Alarms)
|
||||||
{
|
{
|
||||||
Microsoft.CodeAnalysis.Scripting.Script<object?>? onTriggerScript = null;
|
Script<object?>? onTriggerScript = null;
|
||||||
|
|
||||||
// Compile on-trigger script if defined
|
// Compile on-trigger script if defined
|
||||||
if (!string.IsNullOrEmpty(alarm.OnTriggerScriptCanonicalName))
|
if (!string.IsNullOrEmpty(alarm.OnTriggerScriptCanonicalName))
|
||||||
@@ -559,6 +566,10 @@ public class InstanceActor : ReceiveActor
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compile the trigger expression for Expression-triggered alarms.
|
||||||
|
var triggerExpression = CompileTriggerExpression(
|
||||||
|
alarm.TriggerType, alarm.TriggerConfiguration, $"alarm-trigger-expr-{alarm.CanonicalName}");
|
||||||
|
|
||||||
var props = Props.Create(() => new AlarmActor(
|
var props = Props.Create(() => new AlarmActor(
|
||||||
alarm.CanonicalName,
|
alarm.CanonicalName,
|
||||||
_instanceUniqueName,
|
_instanceUniqueName,
|
||||||
@@ -568,6 +579,8 @@ public class InstanceActor : ReceiveActor
|
|||||||
_sharedScriptLibrary,
|
_sharedScriptLibrary,
|
||||||
_options,
|
_options,
|
||||||
_logger,
|
_logger,
|
||||||
|
triggerExpression,
|
||||||
|
_attributes,
|
||||||
_healthCollector));
|
_healthCollector));
|
||||||
|
|
||||||
var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}");
|
var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}");
|
||||||
@@ -581,6 +594,32 @@ public class InstanceActor : ReceiveActor
|
|||||||
_instanceUniqueName, _scriptActors.Count, _alarmActors.Count);
|
_instanceUniqueName, _scriptActors.Count, _alarmActors.Count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Produces the compiled boolean trigger expression for a script or alarm
/// whose trigger type is "Expression" (compared case-insensitively). Yields
/// <c>null</c> — leaving the trigger inert while the actor still starts —
/// when the trigger is of another type, when no usable expression can be
/// extracted from the trigger configuration, or when compilation fails (the
/// compilation errors are logged).
/// </summary>
/// <param name="triggerType">Configured trigger type name.</param>
/// <param name="triggerConfigJson">Trigger configuration JSON holding the expression.</param>
/// <param name="compileName">Name passed to the compiler and used in the failure log entry.</param>
/// <returns>The compiled expression, or <c>null</c> when there is nothing to evaluate.</returns>
private Script<object?>? CompileTriggerExpression(
    string? triggerType, string? triggerConfigJson, string compileName)
{
    var isExpressionTrigger =
        string.Equals(triggerType, "Expression", StringComparison.OrdinalIgnoreCase);

    // Only Expression triggers carry an expression worth extracting.
    var expression = isExpressionTrigger
        ? TriggerExpressionGlobals.ExtractExpression(triggerConfigJson)
        : null;
    if (expression == null)
    {
        return null;
    }

    var compilation = _compilationService.CompileTriggerExpression(compileName, expression);
    if (!compilation.IsSuccess)
    {
        _logger.LogError(
            "Trigger expression for {Name} on {Instance} failed to compile: {Errors}",
            compileName, _instanceUniqueName, string.Join("; ", compilation.Errors));
        return null;
    }

    return compilation.CompiledScript;
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Read-only access to current attribute count (for testing/diagnostics).
|
/// Read-only access to current attribute count (for testing/diagnostics).
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
using Akka.Actor;
|
using Akka.Actor;
|
||||||
using Microsoft.CodeAnalysis.Scripting;
|
using Microsoft.CodeAnalysis.Scripting;
|
||||||
|
using Microsoft.Extensions.DependencyInjection;
|
||||||
using Microsoft.Extensions.Logging;
|
using Microsoft.Extensions.Logging;
|
||||||
using ScadaLink.Commons.Messages.ScriptExecution;
|
using ScadaLink.Commons.Messages.ScriptExecution;
|
||||||
using ScadaLink.Commons.Messages.Streaming;
|
using ScadaLink.Commons.Messages.Streaming;
|
||||||
using ScadaLink.Commons.Types.Flattening;
|
using ScadaLink.Commons.Types.Flattening;
|
||||||
using ScadaLink.HealthMonitoring;
|
using ScadaLink.HealthMonitoring;
|
||||||
|
using ScadaLink.SiteEventLogging;
|
||||||
using ScadaLink.SiteRuntime.Scripts;
|
using ScadaLink.SiteRuntime.Scripts;
|
||||||
using System.Text.Json;
|
using System.Text.Json;
|
||||||
|
|
||||||
@@ -40,6 +42,12 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
private int _executionCounter;
|
private int _executionCounter;
|
||||||
private readonly Commons.Types.Scripts.ScriptScope _scope;
|
private readonly Commons.Types.Scripts.ScriptScope _scope;
|
||||||
|
|
||||||
|
// Expression trigger state: compiled expression, edge-tracking, and the
|
||||||
|
// attribute snapshot the expression evaluates against.
|
||||||
|
private readonly Script<object?>? _compiledTriggerExpression;
|
||||||
|
private bool _lastExpressionResult;
|
||||||
|
private readonly Dictionary<string, object?> _attributeSnapshot = new();
|
||||||
|
|
||||||
public ITimerScheduler Timers { get; set; } = null!;
|
public ITimerScheduler Timers { get; set; } = null!;
|
||||||
|
|
||||||
public ScriptActor(
|
public ScriptActor(
|
||||||
@@ -51,6 +59,8 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
SharedScriptLibrary sharedScriptLibrary,
|
SharedScriptLibrary sharedScriptLibrary,
|
||||||
SiteRuntimeOptions options,
|
SiteRuntimeOptions options,
|
||||||
ILogger logger,
|
ILogger logger,
|
||||||
|
Script<object?>? compiledTriggerExpression = null,
|
||||||
|
IReadOnlyDictionary<string, object?>? initialAttributes = null,
|
||||||
ISiteHealthCollector? healthCollector = null,
|
ISiteHealthCollector? healthCollector = null,
|
||||||
IServiceProvider? serviceProvider = null)
|
IServiceProvider? serviceProvider = null)
|
||||||
{
|
{
|
||||||
@@ -65,6 +75,16 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
_serviceProvider = serviceProvider;
|
_serviceProvider = serviceProvider;
|
||||||
_minTimeBetweenRuns = scriptConfig.MinTimeBetweenRuns;
|
_minTimeBetweenRuns = scriptConfig.MinTimeBetweenRuns;
|
||||||
_scope = scriptConfig.Scope;
|
_scope = scriptConfig.Scope;
|
||||||
|
_compiledTriggerExpression = compiledTriggerExpression;
|
||||||
|
|
||||||
|
// Seed the trigger-expression attribute snapshot from the instance's
|
||||||
|
// initial attribute set so static attributes (which never re-emit an
|
||||||
|
// AttributeValueChanged after deploy) evaluate correctly at startup.
|
||||||
|
if (initialAttributes != null)
|
||||||
|
{
|
||||||
|
foreach (var kvp in initialAttributes)
|
||||||
|
_attributeSnapshot[kvp.Key] = kvp.Value;
|
||||||
|
}
|
||||||
|
|
||||||
// Parse trigger configuration
|
// Parse trigger configuration
|
||||||
_triggerConfig = ParseTriggerConfig(scriptConfig.TriggerType, scriptConfig.TriggerConfiguration);
|
_triggerConfig = ParseTriggerConfig(scriptConfig.TriggerType, scriptConfig.TriggerConfiguration);
|
||||||
@@ -143,10 +163,15 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Handles attribute value changes — triggers script if configured for value-change or conditional.
|
/// Handles attribute value changes — triggers script if configured for
|
||||||
|
/// value-change, conditional, or expression. The attribute snapshot is
|
||||||
|
/// updated for every change before any trigger logic runs.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private void HandleAttributeValueChanged(AttributeValueChanged changed)
|
private void HandleAttributeValueChanged(AttributeValueChanged changed)
|
||||||
{
|
{
|
||||||
|
// Keep the snapshot current for every change, regardless of trigger type.
|
||||||
|
_attributeSnapshot[changed.AttributeName] = changed.Value;
|
||||||
|
|
||||||
if (_triggerConfig is ValueChangeTriggerConfig valueTrigger)
|
if (_triggerConfig is ValueChangeTriggerConfig valueTrigger)
|
||||||
{
|
{
|
||||||
if (valueTrigger.AttributeName == changed.AttributeName)
|
if (valueTrigger.AttributeName == changed.AttributeName)
|
||||||
@@ -165,6 +190,65 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (_triggerConfig is ExpressionTriggerConfig)
|
||||||
|
{
|
||||||
|
EvaluateExpressionTrigger();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the compiled trigger expression over the current attribute snapshot
/// and starts the script on a rising edge only — once for each false→true
/// transition of the expression result. An expression that throws, times
/// out, or returns something other than a bool counts as false; exceptions
/// are reported through <c>LogExpressionError</c> instead of escaping this
/// method. Does nothing (and leaves the remembered result untouched) when no
/// compiled expression is available.
/// </summary>
private void EvaluateExpressionTrigger()
{
    var expression = _compiledTriggerExpression;
    if (expression == null)
    {
        return;
    }

    // Stays false unless the evaluation completes and yields bool true.
    var isTrue = false;
    try
    {
        var globals = new TriggerExpressionGlobals(_attributeSnapshot);

        // The evaluation is bounded by a short timeout. Cancellation only
        // interrupts cooperative/async work, so a pathological CPU-bound
        // expression cannot be fully interrupted. That is accepted because
        // trigger expressions are authored by trusted Design-role users and
        // are compile-checked before deployment.
        using var timeout = new CancellationTokenSource(TimeSpan.FromSeconds(2));

        var evaluation = expression.RunAsync(globals, cancellationToken: timeout.Token);
        var outcome = evaluation.GetAwaiter().GetResult().ReturnValue;
        isTrue = outcome is true;
    }
    catch (Exception ex)
    {
        // A timeout surfaces as OperationCanceledException and lands here
        // as well; the result simply remains false.
        LogExpressionError(ex);
    }

    // Edge-triggered: fire only when the previous evaluation was false.
    var risingEdge = isTrue && !_lastExpressionResult;
    if (risingEdge)
    {
        TrySpawnExecution(null);
    }

    _lastExpressionResult = isTrue;
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Records a trigger-expression evaluation failure to the site event log,
|
||||||
|
/// mirroring how ScriptExecutionActor reports script errors.
|
||||||
|
/// </summary>
|
||||||
|
private void LogExpressionError(Exception ex)
|
||||||
|
{
|
||||||
|
_healthCollector?.IncrementScriptError();
|
||||||
|
var errorMsg = $"Trigger expression for script '{_scriptName}' on instance '{_instanceName}' failed: {ex.Message}";
|
||||||
|
_logger.LogError(ex, "Trigger expression evaluation failed: {Script} on {Instance}", _scriptName, _instanceName);
|
||||||
|
|
||||||
|
_ = _serviceProvider?.GetService<ISiteEventLogger>()?.LogEventAsync(
|
||||||
|
"script", "Error", _instanceName, $"ScriptActor:{_scriptName}", errorMsg, ex.ToString());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -264,11 +348,18 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
"interval" => ParseIntervalTrigger(triggerConfigJson),
|
"interval" => ParseIntervalTrigger(triggerConfigJson),
|
||||||
"valuechange" => ParseValueChangeTrigger(triggerConfigJson),
|
"valuechange" => ParseValueChangeTrigger(triggerConfigJson),
|
||||||
"conditional" => ParseConditionalTrigger(triggerConfigJson),
|
"conditional" => ParseConditionalTrigger(triggerConfigJson),
|
||||||
|
"expression" => ParseExpressionTrigger(triggerConfigJson),
|
||||||
"call" => null, // No automatic trigger — invoked only via Instance.CallScript()
|
"call" => null, // No automatic trigger — invoked only via Instance.CallScript()
|
||||||
_ => null
|
_ => null
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the Expression trigger configuration from the trigger JSON.
/// Returns <c>null</c> when <see cref="TriggerExpressionGlobals.ExtractExpression"/>
/// finds no usable expression.
/// </summary>
private static ExpressionTriggerConfig? ParseExpressionTrigger(string? json)
{
    if (TriggerExpressionGlobals.ExtractExpression(json) is string expression)
    {
        return new ExpressionTriggerConfig(expression);
    }

    return null;
}
|
||||||
|
|
||||||
private static IntervalTriggerConfig? ParseIntervalTrigger(string? json)
|
private static IntervalTriggerConfig? ParseIntervalTrigger(string? json)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrEmpty(json)) return null;
|
if (string.IsNullOrEmpty(json)) return null;
|
||||||
@@ -323,4 +414,5 @@ public class ScriptActor : ReceiveActor, IWithTimers
|
|||||||
internal record IntervalTriggerConfig(TimeSpan Interval) : ScriptTriggerConfig;
|
internal record IntervalTriggerConfig(TimeSpan Interval) : ScriptTriggerConfig;
|
||||||
internal record ValueChangeTriggerConfig(string AttributeName) : ScriptTriggerConfig;
|
internal record ValueChangeTriggerConfig(string AttributeName) : ScriptTriggerConfig;
|
||||||
internal record ConditionalTriggerConfig(string AttributeName, string Operator, double Threshold) : ScriptTriggerConfig;
|
internal record ConditionalTriggerConfig(string AttributeName, string Operator, double Threshold) : ScriptTriggerConfig;
|
||||||
|
internal record ExpressionTriggerConfig(string Expression) : ScriptTriggerConfig;
|
||||||
internal abstract record ScriptTriggerConfig;
|
internal abstract record ScriptTriggerConfig;
|
||||||
|
|||||||
@@ -8,16 +8,16 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka" Version="1.5.62" />
|
<PackageReference Include="Akka" />
|
||||||
<PackageReference Include="Akka.Cluster" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster" />
|
||||||
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.62" />
|
<PackageReference Include="Akka.Cluster.Tools" />
|
||||||
<PackageReference Include="Akka.Streams" Version="1.5.62" />
|
<PackageReference Include="Akka.Streams" />
|
||||||
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="5.0.0" />
|
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" />
|
||||||
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Data.Sqlite" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -87,11 +87,45 @@ public class ScriptCompilationService
|
|||||||
return violations;
|
return violations;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the Roslyn scripting options — assembly references plus default
/// namespace imports — shared by full script compilation and
/// trigger-expression compilation, so both compile against the same surface.
/// </summary>
private static ScriptOptions BuildScriptOptions()
{
    // Assemblies scripts and expressions are allowed to bind against.
    var referencedAssemblies = new[]
    {
        typeof(object).Assembly,
        typeof(Enumerable).Assembly,
        typeof(Math).Assembly,
        typeof(Microsoft.CSharp.RuntimeBinder.CSharpArgumentInfo).Assembly,
        typeof(Commons.Types.DynamicJsonElement).Assembly,
    };

    // Namespaces available without an explicit using directive.
    var importedNamespaces = new[]
    {
        "System",
        "System.Collections.Generic",
        "System.Linq",
        "System.Threading.Tasks",
    };

    var options = ScriptOptions.Default.WithReferences(referencedAssemblies);
    return options.WithImports(importedNamespaces);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Compiles a script into a reusable delegate that takes a ScriptRuntimeContext
|
/// Compiles a script into a reusable delegate that takes a ScriptRuntimeContext
|
||||||
/// and parameters dictionary, and returns an object? result.
|
/// and parameters dictionary, and returns an object? result.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public ScriptCompilationResult Compile(string scriptName, string code)
|
public ScriptCompilationResult Compile(string scriptName, string code)
|
||||||
|
=> CompileCore(scriptName, code, typeof(ScriptGlobals));
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Compiles a bare C# boolean trigger expression against the restricted
|
||||||
|
/// read-only <see cref="TriggerExpressionGlobals"/>. The expression is a
|
||||||
|
/// trailing expression (no <c>return</c>); Roslyn scripting yields its
|
||||||
|
/// value, which the caller coerces to <c>bool</c>. Reuses the same script
|
||||||
|
/// options and forbidden-API trust validation as <see cref="Compile"/>.
|
||||||
|
/// </summary>
|
||||||
|
public ScriptCompilationResult CompileTriggerExpression(string name, string expression)
|
||||||
|
=> CompileCore(name, expression, typeof(TriggerExpressionGlobals));
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Shared compilation path: validates the trust model, builds the script
|
||||||
|
/// against the given globals type, and returns the compiled result.
|
||||||
|
/// </summary>
|
||||||
|
private ScriptCompilationResult CompileCore(string name, string code, Type globalsType)
|
||||||
{
|
{
|
||||||
// Validate trust model
|
// Validate trust model
|
||||||
var violations = ValidateTrustModel(code);
|
var violations = ValidateTrustModel(code);
|
||||||
@@ -99,29 +133,16 @@ public class ScriptCompilationService
|
|||||||
{
|
{
|
||||||
_logger.LogWarning(
|
_logger.LogWarning(
|
||||||
"Script {Script} failed trust validation: {Violations}",
|
"Script {Script} failed trust validation: {Violations}",
|
||||||
scriptName, string.Join("; ", violations));
|
name, string.Join("; ", violations));
|
||||||
return ScriptCompilationResult.Failed(violations);
|
return ScriptCompilationResult.Failed(violations);
|
||||||
}
|
}
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var scriptOptions = ScriptOptions.Default
|
|
||||||
.WithReferences(
|
|
||||||
typeof(object).Assembly,
|
|
||||||
typeof(Enumerable).Assembly,
|
|
||||||
typeof(Math).Assembly,
|
|
||||||
typeof(Microsoft.CSharp.RuntimeBinder.CSharpArgumentInfo).Assembly,
|
|
||||||
typeof(Commons.Types.DynamicJsonElement).Assembly)
|
|
||||||
.WithImports(
|
|
||||||
"System",
|
|
||||||
"System.Collections.Generic",
|
|
||||||
"System.Linq",
|
|
||||||
"System.Threading.Tasks");
|
|
||||||
|
|
||||||
var script = CSharpScript.Create<object?>(
|
var script = CSharpScript.Create<object?>(
|
||||||
code,
|
code,
|
||||||
scriptOptions,
|
BuildScriptOptions(),
|
||||||
globalsType: typeof(ScriptGlobals));
|
globalsType: globalsType);
|
||||||
|
|
||||||
var diagnostics = script.Compile();
|
var diagnostics = script.Compile();
|
||||||
var errors = diagnostics
|
var errors = diagnostics
|
||||||
@@ -133,16 +154,16 @@ public class ScriptCompilationService
|
|||||||
{
|
{
|
||||||
_logger.LogWarning(
|
_logger.LogWarning(
|
||||||
"Script {Script} compilation failed: {Errors}",
|
"Script {Script} compilation failed: {Errors}",
|
||||||
scriptName, string.Join("; ", errors));
|
name, string.Join("; ", errors));
|
||||||
return ScriptCompilationResult.Failed(errors);
|
return ScriptCompilationResult.Failed(errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
_logger.LogDebug("Script {Script} compiled successfully", scriptName);
|
_logger.LogDebug("Script {Script} compiled successfully", name);
|
||||||
return ScriptCompilationResult.Succeeded(script);
|
return ScriptCompilationResult.Succeeded(script);
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
_logger.LogError(ex, "Unexpected error compiling script {Script}", scriptName);
|
_logger.LogError(ex, "Unexpected error compiling script {Script}", name);
|
||||||
return ScriptCompilationResult.Failed([$"Compilation exception: {ex.Message}"]);
|
return ScriptCompilationResult.Failed([$"Compilation exception: {ex.Message}"]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,99 @@
|
|||||||
|
using System.Text.Json;
|
||||||
|
|
||||||
|
namespace ScadaLink.SiteRuntime.Scripts;
|
||||||
|
|
||||||
|
/// <summary>
/// Read-only globals a trigger expression is compiled against. Exposes only
/// attribute reads, backed by an in-memory snapshot — no I/O, no actor Ask,
/// no side-effecting APIs. A missing attribute key reads as <c>null</c> and
/// never throws.
///
/// Canonical attribute keys are dotted (e.g. "TempSensor.Reading"); the prefix
/// logic here mirrors <see cref="AttributeAccessor.Resolve"/>.
/// </summary>
public sealed class TriggerExpressionGlobals
{
    /// <summary>
    /// Extracts the <c>"expression"</c> field from an Expression-trigger config
    /// JSON document. Returns <c>null</c> for a missing, blank, or malformed
    /// config — the single parsing idiom shared by InstanceActor, ScriptActor,
    /// and AlarmActor.
    /// </summary>
    /// <param name="triggerConfigJson">Raw trigger configuration JSON; may be null or empty.</param>
    /// <returns>
    /// The expression text, or <c>null</c> when the input is null/empty, is not valid
    /// JSON, is not a JSON object, has no string-valued <c>"expression"</c> property,
    /// or the expression is blank.
    /// </returns>
    public static string? ExtractExpression(string? triggerConfigJson)
    {
        if (string.IsNullOrEmpty(triggerConfigJson)) return null;

        try
        {
            using var doc = JsonDocument.Parse(triggerConfigJson);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on a non-object root and
            // GetString throws on a non-string value; both are "malformed config" here
            // and must read as null rather than escape the JsonException catch below.
            if (root.ValueKind != JsonValueKind.Object
                || !root.TryGetProperty("expression", out var e)
                || e.ValueKind != JsonValueKind.String)
            {
                return null;
            }

            var expr = e.GetString();
            return string.IsNullOrWhiteSpace(expr) ? null : expr;
        }
        catch (JsonException)
        {
            return null;
        }
    }

    private readonly IReadOnlyDictionary<string, object?> _snapshot;

    /// <summary>Creates globals over an attribute snapshot keyed by canonical name.</summary>
    /// <param name="snapshot">Attribute values keyed by canonical (dotted) attribute name.</param>
    public TriggerExpressionGlobals(IReadOnlyDictionary<string, object?> snapshot)
        => _snapshot = snapshot;

    /// <summary>Attributes in the expression's own scope (root prefix).</summary>
    public ReadOnlyAttributes Attributes => new(_snapshot, "");

    /// <summary>Indexed access to child compositions' attributes.</summary>
    public ReadOnlyChildren Children => new(_snapshot);

    /// <summary>
    /// Parent composition (null at root). Set by the caller for derived/composed
    /// scopes; the runtime actors evaluate at root scope, so this stays null.
    /// </summary>
    public ReadOnlyComposition? Parent { get; init; }

    /// <summary>
    /// Read-only attribute view anchored at a canonical-name prefix. Indexing
    /// resolves to the canonical key ("" → key, "TempSensor" → "TempSensor.key").
    /// </summary>
    public sealed class ReadOnlyAttributes
    {
        private readonly IReadOnlyDictionary<string, object?> _s;
        private readonly string _prefix;

        /// <summary>Creates a view over <paramref name="s"/> anchored at <paramref name="prefix"/>.</summary>
        public ReadOnlyAttributes(IReadOnlyDictionary<string, object?> s, string prefix)
        {
            _s = s;
            _prefix = prefix;
        }

        /// <summary>Value stored under the prefixed key, or <c>null</c> when the key is absent.</summary>
        public object? this[string key] =>
            _s.TryGetValue(_prefix.Length == 0 ? key : _prefix + "." + key, out var v) ? v : null;
    }

    /// <summary>A read-only view of one composition at a canonical-name path.</summary>
    public sealed class ReadOnlyComposition
    {
        private readonly IReadOnlyDictionary<string, object?> _s;
        private readonly string _path;

        /// <summary>Creates a composition view over <paramref name="s"/> at <paramref name="path"/>.</summary>
        public ReadOnlyComposition(IReadOnlyDictionary<string, object?> s, string path)
        {
            _s = s;
            _path = path;
        }

        /// <summary>Attributes of this composition, resolved under its path prefix.</summary>
        public ReadOnlyAttributes Attributes => new(_s, _path);
    }

    /// <summary>Dictionary-style accessor for child compositions.</summary>
    public sealed class ReadOnlyChildren
    {
        private readonly IReadOnlyDictionary<string, object?> _s;

        /// <summary>Creates the accessor over the shared attribute snapshot.</summary>
        public ReadOnlyChildren(IReadOnlyDictionary<string, object?> s) => _s = s;

        /// <summary>View of the child composition whose path is <paramref name="compositionName"/>.</summary>
        public ReadOnlyComposition this[string compositionName] => new(_s, compositionName);
    }
}
|
||||||
@@ -8,11 +8,11 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Akka" Version="1.5.62" />
|
<PackageReference Include="Akka" />
|
||||||
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Data.Sqlite" />
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -22,7 +22,8 @@ public static class ServiceCollectionExtensions
|
|||||||
var storage = sp.GetRequiredService<StoreAndForwardStorage>();
|
var storage = sp.GetRequiredService<StoreAndForwardStorage>();
|
||||||
var options = sp.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
|
var options = sp.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
|
||||||
var logger = sp.GetRequiredService<ILogger<StoreAndForwardService>>();
|
var logger = sp.GetRequiredService<ILogger<StoreAndForwardService>>();
|
||||||
return new StoreAndForwardService(storage, options, logger);
|
var replication = sp.GetRequiredService<ReplicationService>();
|
||||||
|
return new StoreAndForwardService(storage, options, logger, replication);
|
||||||
});
|
});
|
||||||
|
|
||||||
services.AddSingleton<ReplicationService>(sp =>
|
services.AddSingleton<ReplicationService>(sp =>
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ public class StoreAndForwardService
|
|||||||
{
|
{
|
||||||
private readonly StoreAndForwardStorage _storage;
|
private readonly StoreAndForwardStorage _storage;
|
||||||
private readonly StoreAndForwardOptions _options;
|
private readonly StoreAndForwardOptions _options;
|
||||||
|
private readonly ReplicationService? _replication;
|
||||||
private readonly ILogger<StoreAndForwardService> _logger;
|
private readonly ILogger<StoreAndForwardService> _logger;
|
||||||
private Timer? _retryTimer;
|
private Timer? _retryTimer;
|
||||||
private int _retryInProgress;
|
private int _retryInProgress;
|
||||||
@@ -48,11 +49,13 @@ public class StoreAndForwardService
|
|||||||
public StoreAndForwardService(
|
public StoreAndForwardService(
|
||||||
StoreAndForwardStorage storage,
|
StoreAndForwardStorage storage,
|
||||||
StoreAndForwardOptions options,
|
StoreAndForwardOptions options,
|
||||||
ILogger<StoreAndForwardService> logger)
|
ILogger<StoreAndForwardService> logger,
|
||||||
|
ReplicationService? replication = null)
|
||||||
{
|
{
|
||||||
_storage = storage;
|
_storage = storage;
|
||||||
_options = options;
|
_options = options;
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
|
_replication = replication;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -109,7 +112,8 @@ public class StoreAndForwardService
|
|||||||
string payloadJson,
|
string payloadJson,
|
||||||
string? originInstanceName = null,
|
string? originInstanceName = null,
|
||||||
int? maxRetries = null,
|
int? maxRetries = null,
|
||||||
TimeSpan? retryInterval = null)
|
TimeSpan? retryInterval = null,
|
||||||
|
bool attemptImmediateDelivery = true)
|
||||||
{
|
{
|
||||||
var message = new StoreAndForwardMessage
|
var message = new StoreAndForwardMessage
|
||||||
{
|
{
|
||||||
@@ -125,8 +129,10 @@ public class StoreAndForwardService
|
|||||||
OriginInstanceName = originInstanceName
|
OriginInstanceName = originInstanceName
|
||||||
};
|
};
|
||||||
|
|
||||||
// Attempt immediate delivery
|
// Attempt immediate delivery — unless the caller has already made a
|
||||||
if (_deliveryHandlers.TryGetValue(category, out var handler))
|
// delivery attempt of its own (attemptImmediateDelivery: false). In that
|
||||||
|
// case re-invoking the handler here would dispatch the request twice.
|
||||||
|
if (attemptImmediateDelivery && _deliveryHandlers.TryGetValue(category, out var handler))
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@@ -136,11 +142,9 @@ public class StoreAndForwardService
|
|||||||
RaiseActivity("Delivered", category, $"Immediate delivery to {target}");
|
RaiseActivity("Delivered", category, $"Immediate delivery to {target}");
|
||||||
return new StoreAndForwardResult(true, message.Id, false);
|
return new StoreAndForwardResult(true, message.Id, false);
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
// Permanent failure — do not buffer
|
||||||
// Permanent failure — do not buffer
|
return new StoreAndForwardResult(false, message.Id, false);
|
||||||
return new StoreAndForwardResult(false, message.Id, false);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
@@ -152,19 +156,39 @@ public class StoreAndForwardService
|
|||||||
message.LastAttemptAt = DateTimeOffset.UtcNow;
|
message.LastAttemptAt = DateTimeOffset.UtcNow;
|
||||||
message.RetryCount = 1;
|
message.RetryCount = 1;
|
||||||
message.LastError = ex.Message;
|
message.LastError = ex.Message;
|
||||||
await _storage.EnqueueAsync(message);
|
await BufferAsync(message);
|
||||||
|
|
||||||
RaiseActivity("Queued", category, $"Buffered for retry: {target} ({ex.Message})");
|
RaiseActivity("Queued", category, $"Buffered for retry: {target} ({ex.Message})");
|
||||||
return new StoreAndForwardResult(true, message.Id, true);
|
return new StoreAndForwardResult(true, message.Id, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// No handler registered — buffer for later
|
// Either no handler is registered yet, or the caller already attempted
|
||||||
await _storage.EnqueueAsync(message);
|
// delivery itself — buffer for the background retry sweep to deliver.
|
||||||
RaiseActivity("Queued", category, $"No handler registered, buffered: {target}");
|
if (!attemptImmediateDelivery)
|
||||||
|
{
|
||||||
|
// The caller made (and failed) one attempt before handing the
|
||||||
|
// message over, so it counts as the first retry.
|
||||||
|
message.RetryCount = 1;
|
||||||
|
message.LastAttemptAt = DateTimeOffset.UtcNow;
|
||||||
|
}
|
||||||
|
await BufferAsync(message);
|
||||||
|
RaiseActivity("Queued", category, attemptImmediateDelivery
|
||||||
|
? $"No handler registered, buffered: {target}"
|
||||||
|
: $"Buffered for retry: {target}");
|
||||||
return new StoreAndForwardResult(true, message.Id, true);
|
return new StoreAndForwardResult(true, message.Id, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Writes <paramref name="message"/> to the local SQLite buffer, then (WP-11)
/// forwards the add to the standby node when a replication service is
/// configured, so a failover does not lose the buffered message.
/// </summary>
/// <param name="message">The message to persist and replicate.</param>
private async Task BufferAsync(StoreAndForwardMessage message)
{
    await _storage.EnqueueAsync(message);

    // Replication is optional; without it the message is only buffered locally.
    if (_replication is not null)
    {
        _replication.ReplicateEnqueue(message);
    }
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// WP-10: Background retry sweep. Processes all pending messages that are due for retry.
|
/// WP-10: Background retry sweep. Processes all pending messages that are due for retry.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -210,6 +234,7 @@ public class StoreAndForwardService
|
|||||||
if (success)
|
if (success)
|
||||||
{
|
{
|
||||||
await _storage.RemoveMessageAsync(message.Id);
|
await _storage.RemoveMessageAsync(message.Id);
|
||||||
|
_replication?.ReplicateRemove(message.Id);
|
||||||
RaiseActivity("Delivered", message.Category,
|
RaiseActivity("Delivered", message.Category,
|
||||||
$"Delivered to {message.Target} after {message.RetryCount} retries");
|
$"Delivered to {message.Target} after {message.RetryCount} retries");
|
||||||
return;
|
return;
|
||||||
@@ -220,6 +245,7 @@ public class StoreAndForwardService
|
|||||||
message.LastAttemptAt = DateTimeOffset.UtcNow;
|
message.LastAttemptAt = DateTimeOffset.UtcNow;
|
||||||
message.LastError = "Permanent failure (handler returned false)";
|
message.LastError = "Permanent failure (handler returned false)";
|
||||||
await _storage.UpdateMessageAsync(message);
|
await _storage.UpdateMessageAsync(message);
|
||||||
|
_replication?.ReplicatePark(message);
|
||||||
RaiseActivity("Parked", message.Category,
|
RaiseActivity("Parked", message.Category,
|
||||||
$"Permanent failure for {message.Target}: handler returned false");
|
$"Permanent failure for {message.Target}: handler returned false");
|
||||||
}
|
}
|
||||||
@@ -234,6 +260,7 @@ public class StoreAndForwardService
|
|||||||
{
|
{
|
||||||
message.Status = StoreAndForwardMessageStatus.Parked;
|
message.Status = StoreAndForwardMessageStatus.Parked;
|
||||||
await _storage.UpdateMessageAsync(message);
|
await _storage.UpdateMessageAsync(message);
|
||||||
|
_replication?.ReplicatePark(message);
|
||||||
RaiseActivity("Parked", message.Category,
|
RaiseActivity("Parked", message.Category,
|
||||||
$"Max retries ({message.MaxRetries}) reached for {message.Target}");
|
$"Max retries ({message.MaxRetries}) reached for {message.Target}");
|
||||||
_logger.LogWarning(
|
_logger.LogWarning(
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ public class StoreAndForwardStorage
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public async Task InitializeAsync()
|
public async Task InitializeAsync()
|
||||||
{
|
{
|
||||||
|
EnsureDatabaseDirectoryExists();
|
||||||
|
|
||||||
await using var connection = new SqliteConnection(_connectionString);
|
await using var connection = new SqliteConnection(_connectionString);
|
||||||
await connection.OpenAsync();
|
await connection.OpenAsync();
|
||||||
|
|
||||||
@@ -53,6 +55,32 @@ public class StoreAndForwardStorage
|
|||||||
_logger.LogInformation("Store-and-forward SQLite storage initialized");
|
_logger.LogInformation("Store-and-forward SQLite storage initialized");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Creates the parent directory of a file-backed SQLite database when it is
/// missing. SQLite creates the database file on demand but not its directory, so
/// a path such as "./data/store-and-forward.db" fails with "unable to open
/// database file" until "data" exists. In-memory databases and connection strings
/// without a data source are left untouched.
/// </summary>
private void EnsureDatabaseDirectoryExists()
{
    var settings = new SqliteConnectionStringBuilder(_connectionString);
    var dataSource = settings.DataSource;

    // Nothing on disk to prepare for in-memory or unspecified data sources.
    var hasNoFile = settings.Mode == SqliteOpenMode.Memory
        || string.IsNullOrEmpty(dataSource)
        || dataSource == ":memory:";
    if (hasNoFile)
        return;

    var directory = System.IO.Path.GetDirectoryName(System.IO.Path.GetFullPath(dataSource));
    if (string.IsNullOrEmpty(directory) || System.IO.Directory.Exists(directory))
        return;

    System.IO.Directory.CreateDirectory(directory);
    _logger.LogInformation("Created store-and-forward database directory: {Directory}", directory);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// WP-9: Enqueues a new message with Pending status.
|
/// WP-9: Enqueues a new message with Pending status.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -12,8 +12,8 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
|
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -12,8 +12,13 @@ namespace ScadaLink.TemplateEngine.Validation;
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public class ScriptCompiler
|
public class ScriptCompiler
|
||||||
{
|
{
|
||||||
// Forbidden namespace patterns - scripts must not use these
|
/// <summary>
|
||||||
private static readonly string[] ForbiddenPatterns =
|
/// Forbidden namespace patterns — scripts (and trigger expressions, via
|
||||||
|
/// <see cref="ValidationService"/>) must not use these. Trigger expressions run
|
||||||
|
/// under the same trust model as scripts, so the list is shared from here rather
|
||||||
|
/// than duplicated.
|
||||||
|
/// </summary>
|
||||||
|
internal static readonly string[] ForbiddenPatterns =
|
||||||
[
|
[
|
||||||
"System.IO.",
|
"System.IO.",
|
||||||
"System.Diagnostics.Process",
|
"System.Diagnostics.Process",
|
||||||
|
|||||||
@@ -13,8 +13,9 @@ namespace ScadaLink.TemplateEngine.Validation;
|
|||||||
/// 3. Script compilation (via ScriptCompiler)
|
/// 3. Script compilation (via ScriptCompiler)
|
||||||
/// 4. Alarm trigger references exist (referenced attributes must be in the flattened config)
|
/// 4. Alarm trigger references exist (referenced attributes must be in the flattened config)
|
||||||
/// 5. Script trigger references exist (referenced attributes must be in the flattened config)
|
/// 5. Script trigger references exist (referenced attributes must be in the flattened config)
|
||||||
/// 6. Connection binding completeness (all data-sourced attributes must have a binding)
|
/// 6. Expression triggers — blank check, syntax check, and attribute-reference scan
|
||||||
/// 7. Does NOT verify tag path resolution on devices
|
/// 7. Connection binding completeness (all data-sourced attributes must have a binding)
|
||||||
|
/// 8. Does NOT verify tag path resolution on devices
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class ValidationService
|
public class ValidationService
|
||||||
{
|
{
|
||||||
@@ -48,6 +49,7 @@ public class ValidationService
|
|||||||
ValidateScriptCompilation(configuration),
|
ValidateScriptCompilation(configuration),
|
||||||
ValidateAlarmTriggerReferences(configuration),
|
ValidateAlarmTriggerReferences(configuration),
|
||||||
ValidateScriptTriggerReferences(configuration),
|
ValidateScriptTriggerReferences(configuration),
|
||||||
|
ValidateExpressionTriggers(configuration),
|
||||||
ValidateConnectionBindingCompleteness(configuration),
|
ValidateConnectionBindingCompleteness(configuration),
|
||||||
_semanticValidator.Validate(configuration, sharedScripts)
|
_semanticValidator.Validate(configuration, sharedScripts)
|
||||||
};
|
};
|
||||||
@@ -178,6 +180,293 @@ public class ValidationService
|
|||||||
: ValidationResult.Success();
|
: ValidationResult.Success();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Pre-deployment validation of every script and alarm whose trigger type is
/// "Expression".
///
/// Each such entity's <c>{ "expression": "..." }</c> trigger configuration goes
/// through three checks:
/// <list type="bullet">
/// <item>Blank expression → warning (the trigger will never fire).</item>
/// <item>Syntax check → error for a forbidden API or unbalanced brackets/quotes.
/// The TemplateEngine project does not reference a Roslyn compiler (see
/// <see cref="ScriptCompiler"/>), so this is the same string-based check rather
/// than a full compile.</item>
/// <item>Attribute-reference scan → error for any <c>Attributes["X"]</c> literal
/// whose key is absent from the flattened configuration, mirroring
/// <see cref="ValidateScriptTriggerReferences"/> for the structured triggers.</item>
/// </list>
/// </summary>
/// <param name="configuration">The flattened configuration to validate.</param>
/// <returns>A result carrying the collected errors and warnings, scripts first, then alarms.</returns>
public static ValidationResult ValidateExpressionTriggers(FlattenedConfiguration configuration)
{
    var findingsAsErrors = new List<ValidationEntry>();
    var findingsAsWarnings = new List<ValidationEntry>();
    var knownAttributes = configuration.Attributes
        .Select(attribute => attribute.CanonicalName)
        .ToHashSet(StringComparer.Ordinal);

    foreach (var script in configuration.Scripts)
    {
        if (IsExpressionTrigger(script.TriggerType))
        {
            CheckExpressionTrigger(
                ValidationCategory.ScriptTriggerReference, "script",
                script.CanonicalName, script.TriggerConfiguration,
                knownAttributes, findingsAsErrors, findingsAsWarnings);
        }
    }

    foreach (var alarm in configuration.Alarms)
    {
        if (IsExpressionTrigger(alarm.TriggerType))
        {
            CheckExpressionTrigger(
                ValidationCategory.AlarmTriggerReference, "alarm",
                alarm.CanonicalName, alarm.TriggerConfiguration,
                knownAttributes, findingsAsErrors, findingsAsWarnings);
        }
    }

    return new ValidationResult { Errors = findingsAsErrors, Warnings = findingsAsWarnings };
}
|
||||||
|
|
||||||
|
/// <summary>
/// True when <paramref name="triggerType"/> is "Expression", compared
/// case-insensitively; a null trigger type is not an expression trigger.
/// </summary>
private static bool IsExpressionTrigger(string? triggerType)
{
    return "Expression".Equals(triggerType, StringComparison.OrdinalIgnoreCase);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Applies the blank, syntax, and attribute-reference checks to one
/// Expression-trigger entity, appending findings to the caller's lists.
/// </summary>
/// <param name="category">
/// The <see cref="ValidationCategory"/> every finding is filed under
/// (<see cref="ValidationCategory.ScriptTriggerReference"/> for scripts,
/// <see cref="ValidationCategory.AlarmTriggerReference"/> for alarms). One category
/// covers blank, syntax, and attribute-reference findings so an alarm's syntax
/// error is not miscategorised as script compilation.
/// </param>
/// <param name="entityLabel">
/// Human-readable entity-type label (<c>"script"</c>/<c>"alarm"</c>) used in
/// message text only.
/// </param>
/// <param name="entityName">Canonical name of the script or alarm being checked.</param>
/// <param name="triggerConfigJson">The entity's raw trigger configuration JSON.</param>
/// <param name="attributeNames">Canonical names of the attributes in the flattened configuration.</param>
/// <param name="errors">Receives error findings.</param>
/// <param name="warnings">Receives warning findings.</param>
private static void CheckExpressionTrigger(
    ValidationCategory category,
    string entityLabel,
    string entityName,
    string? triggerConfigJson,
    HashSet<string> attributeNames,
    List<ValidationEntry> errors,
    List<ValidationEntry> warnings)
{
    var expression = ExtractExpressionFromTriggerConfig(triggerConfigJson);

    // A blank expression is only a warning, and leaves nothing further to check.
    if (string.IsNullOrWhiteSpace(expression))
    {
        warnings.Add(ValidationEntry.Warning(category,
            $"The {entityLabel} '{entityName}' has an expression trigger with no expression; it will never fire.",
            entityName));
        return;
    }

    // A syntax failure does not stop the reference scan: both kinds of finding are reported.
    if (CheckExpressionSyntax(expression) is { } syntaxError)
    {
        errors.Add(ValidationEntry.Error(category,
            $"The {entityLabel} '{entityName}' expression trigger failed validation: {syntaxError}",
            entityName));
    }

    var unknownReferences = ExtractAttributeReferences(expression)
        .Where(referenced => !attributeNames.Contains(referenced));

    foreach (var attrName in unknownReferences)
    {
        errors.Add(ValidationEntry.Error(category,
            $"The {entityLabel} '{entityName}' expression trigger references attribute '{attrName}' which does not exist in the flattened configuration.",
            entityName));
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reads the "expression" string from a <c>{ "expression": "..." }</c> trigger
/// configuration.
/// </summary>
/// <param name="triggerConfigJson">Raw trigger configuration JSON; may be null or blank.</param>
/// <returns>
/// The expression text, or <c>null</c> when the input is blank, is not valid JSON,
/// is not a JSON object, or has no string-valued "expression" property.
/// </returns>
internal static string? ExtractExpressionFromTriggerConfig(string? triggerConfigJson)
{
    if (string.IsNullOrWhiteSpace(triggerConfigJson))
        return null;

    try
    {
        using var doc = JsonDocument.Parse(triggerConfigJson);
        var root = doc.RootElement;

        // TryGetProperty throws InvalidOperationException when the root is not an
        // object (e.g. "[]" or "5"); such a config is malformed and reads as null.
        if (root.ValueKind == JsonValueKind.Object
            && root.TryGetProperty("expression", out var prop)
            && prop.ValueKind == JsonValueKind.String)
        {
            return prop.GetString();
        }
    }
    catch (JsonException)
    {
        // Not valid JSON — treated as a blank expression by the caller.
    }

    return null;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Lightweight string-based syntax check for a trigger expression. Mirrors the
/// approach in <see cref="ScriptCompiler"/> (the TemplateEngine project has no
/// Roslyn compiler reference): rejects forbidden APIs and unbalanced
/// brackets/quotes. Regular strings, verbatim strings (<c>@"..."</c>), character
/// literals, and comments are skipped when counting brackets.
/// </summary>
/// <param name="expression">The expression text to check.</param>
/// <returns>An error message, or <c>null</c> when the expression looks well-formed.</returns>
internal static string? CheckExpressionSyntax(string expression)
{
    // The forbidden-API scan runs over the raw text, literals and comments included.
    foreach (var pattern in ScriptCompiler.ForbiddenPatterns)
    {
        if (expression.Contains(pattern, StringComparison.Ordinal))
        {
            return $"uses forbidden API '{pattern.TrimEnd('.')}'. " +
                   "Trigger expressions cannot use System.IO, Process, Threading, Reflection, or raw network APIs.";
        }
    }

    var parenDepth = 0;
    var bracketDepth = 0;
    var braceDepth = 0;
    var inString = false;
    var inVerbatimString = false;
    var inChar = false;
    var inLineComment = false;
    var inBlockComment = false;

    for (int i = 0; i < expression.Length; i++)
    {
        var c = expression[i];
        var next = i + 1 < expression.Length ? expression[i + 1] : '\0';

        if (inLineComment)
        {
            if (c == '\n') inLineComment = false;
            continue;
        }

        if (inBlockComment)
        {
            if (c == '*' && next == '/')
            {
                inBlockComment = false;
                i++;
            }
            continue;
        }

        if (inVerbatimString)
        {
            // In a verbatim string a backslash is literal and "" is an escaped quote,
            // so only a lone '"' terminates the literal.
            if (c == '"')
            {
                if (next == '"') i++;
                else inVerbatimString = false;
            }
            continue;
        }

        if (inString)
        {
            if (c == '\\') { i++; continue; }
            if (c == '"') inString = false;
            continue;
        }

        if (inChar)
        {
            if (c == '\\') { i++; continue; }
            if (c == '\'') inChar = false;
            continue;
        }

        if (c == '/' && next == '/')
        {
            inLineComment = true;
            i++;
            continue;
        }

        if (c == '/' && next == '*')
        {
            inBlockComment = true;
            i++;
            continue;
        }

        switch (c)
        {
            case '"':
                // A quote directly after '@' (or the "@$" interpolated-verbatim prefix)
                // opens a verbatim string; treating it as a regular string would
                // misread a trailing backslash (e.g. @"C:\") as an escape.
                var verbatim = i > 0
                    && (expression[i - 1] == '@'
                        || (expression[i - 1] == '$' && i > 1 && expression[i - 2] == '@'));
                if (verbatim) inVerbatimString = true;
                else inString = true;
                break;
            case '\'': inChar = true; break;
            case '(': parenDepth++; break;
            case ')':
                parenDepth--;
                if (parenDepth < 0) return "mismatched parentheses (unexpected ')').";
                break;
            case '[': bracketDepth++; break;
            case ']':
                bracketDepth--;
                if (bracketDepth < 0) return "mismatched brackets (unexpected ']').";
                break;
            case '{': braceDepth++; break;
            case '}':
                braceDepth--;
                if (braceDepth < 0) return "mismatched braces (unexpected '}').";
                break;
        }
    }

    if (inBlockComment) return "unterminated block comment.";
    if (inString || inVerbatimString) return "unterminated string literal.";
    if (inChar) return "unterminated character literal.";
    if (parenDepth != 0) return $"mismatched parentheses ({parenDepth} unclosed).";
    if (bracketDepth != 0) return $"mismatched brackets ({bracketDepth} unclosed).";
    if (braceDepth != 0) return $"mismatched braces ({braceDepth} unclosed).";

    return null;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Scans an expression for <c>Attributes["..."]</c> string-literal accessor keys.
/// Best-effort: only matches double-quoted literals (the form the editor emits)
/// and skips keys built dynamically.
/// </summary>
/// <param name="expression">The expression text to scan.</param>
/// <returns>
/// Each distinct non-empty key, in order of first appearance. Keys are returned as
/// written in the source text (escape sequences are not decoded).
/// </returns>
internal static IEnumerable<string> ExtractAttributeReferences(string expression)
{
    var seen = new HashSet<string>(StringComparer.Ordinal);
    const string marker = "Attributes[";
    var index = 0;

    while ((index = expression.IndexOf(marker, index, StringComparison.Ordinal)) >= 0)
    {
        var matchStart = index;
        index += marker.Length; // resume the search just past this occurrence

        if (matchStart > 0)
        {
            var previous = expression[matchStart - 1];

            // Only a bare `Attributes["X"]` resolves against the flattened
            // configuration. `Children["Pump"].Attributes["X"]` and
            // `Parent.Attributes["X"]` are member accesses (preceded by '.') whose
            // composed canonical names cannot be checked against the flat
            // self-attribute set — skip them rather than emit a false positive.
            if (previous == '.')
                continue;

            // An identifier that merely ends in "Attributes" (e.g. `MyAttributes[`)
            // is not the Attributes accessor either.
            if (previous == '_' || char.IsLetterOrDigit(previous))
                continue;
        }

        var cursor = matchStart + marker.Length;

        // Skip whitespace between '[' and the literal.
        while (cursor < expression.Length && char.IsWhiteSpace(expression[cursor]))
            cursor++;

        if (cursor >= expression.Length || expression[cursor] != '"')
            continue; // dynamically built key — nothing to check

        var keyStart = cursor + 1;
        var keyEnd = keyStart;
        while (keyEnd < expression.Length && expression[keyEnd] != '"')
        {
            if (expression[keyEnd] == '\\') keyEnd++; // skip escaped char
            keyEnd++;
        }

        // keyEnd beyond the text means the literal was never closed — ignore it.
        if (keyEnd < expression.Length)
        {
            var key = expression.Substring(keyStart, keyEnd - keyStart);
            if (key.Length > 0 && seen.Add(key))
                yield return key;
        }
    }
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Validates that all data-sourced attributes have connection bindings.
|
/// Validates that all data-sourced attributes have connection bindings.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
using System.CommandLine;
|
||||||
|
using ScadaLink.CLI;
|
||||||
|
using ScadaLink.CLI.Commands;
|
||||||
|
|
||||||
|
namespace ScadaLink.CLI.Tests;
|
||||||
|
|
||||||
|
/// <summary>
/// Tests for <c>CommandHelpers.ResolveFormat</c>: an explicit <c>--format</c> flag
/// takes precedence, otherwise the configured default format is used.
/// </summary>
public class FormatResolutionTests
{
    /// <summary>Builds a root command carrying a recursive <c>--format</c> option.</summary>
    private static (Option<string> formatOption, RootCommand root) BuildHarness()
    {
        var root = new RootCommand();
        var formatOption = new Option<string>("--format") { Recursive = true };
        root.Add(formatOption);
        return (formatOption, root);
    }

    [Fact]
    public void ResolveFormat_ExplicitFlag_OverridesConfig()
    {
        var harness = BuildHarness();
        string[] args = ["--format", "table"];
        var parsed = harness.root.Parse(args);
        var config = new CliConfig { DefaultFormat = "json" };

        var format = CommandHelpers.ResolveFormat(parsed, harness.formatOption, config);

        Assert.Equal("table", format);
    }

    [Fact]
    public void ResolveFormat_FlagAbsent_UsesConfigDefaultFormat()
    {
        // Regression for CLI-001: with no --format on the command line, the
        // DefaultFormat from the config file / env var must win over the "json" fallback.
        var harness = BuildHarness();
        string[] args = [];
        var parsed = harness.root.Parse(args);
        var config = new CliConfig { DefaultFormat = "table" };

        var format = CommandHelpers.ResolveFormat(parsed, harness.formatOption, config);

        Assert.Equal("table", format);
    }

    [Fact]
    public void ResolveFormat_FlagAbsent_AndNoConfig_DefaultsToJson()
    {
        var harness = BuildHarness();
        string[] args = [];
        var parsed = harness.root.Parse(args);
        var config = new CliConfig { DefaultFormat = "json" };

        var format = CommandHelpers.ResolveFormat(parsed, harness.formatOption, config);

        Assert.Equal("json", format);
    }
}
|
||||||
@@ -9,11 +9,11 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="coverlet.collector" Version="6.0.4" />
|
<PackageReference Include="coverlet.collector" />
|
||||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
|
<PackageReference Include="Microsoft.NET.Test.Sdk" />
|
||||||
<PackageReference Include="NSubstitute" Version="5.3.0" />
|
<PackageReference Include="NSubstitute" />
|
||||||
<PackageReference Include="xunit" Version="2.9.3" />
|
<PackageReference Include="xunit" />
|
||||||
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.4" />
|
<PackageReference Include="xunit.runner.visualstudio" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
+5
-5
@@ -9,11 +9,11 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="coverlet.collector" Version="6.0.4" />
|
<PackageReference Include="coverlet.collector" />
|
||||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
|
<PackageReference Include="Microsoft.NET.Test.Sdk" />
|
||||||
<PackageReference Include="Microsoft.Playwright" Version="1.58.0" />
|
<PackageReference Include="Microsoft.Playwright" />
|
||||||
<PackageReference Include="xunit" Version="2.9.3" />
|
<PackageReference Include="xunit" />
|
||||||
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.4" />
|
<PackageReference Include="xunit.runner.visualstudio" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|||||||
@@ -0,0 +1,79 @@
|
|||||||
|
using System.Security.Claims;
|
||||||
|
using Microsoft.AspNetCore.Http;
|
||||||
|
using ScadaLink.CentralUI.Auth;
|
||||||
|
|
||||||
|
namespace ScadaLink.CentralUI.Tests.Auth;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Regression tests for CentralUI-004. The provider used to read
|
||||||
|
/// <see cref="IHttpContextAccessor.HttpContext"/> on every call; once the Blazor
|
||||||
|
/// circuit is established that context is gone, so later re-evaluations saw an
|
||||||
|
/// unauthenticated principal. The provider must snapshot the principal once at
|
||||||
|
/// construction (during the initial HTTP request) and serve it for the circuit.
|
||||||
|
/// </summary>
|
||||||
|
public class CookieAuthenticationStateProviderTests
{
    /// <summary>
    /// Creates a principal with a single name claim and the "TestCookie"
    /// authentication type, so its identity reports as authenticated.
    /// </summary>
    private static ClaimsPrincipal AuthenticatedUser(string name)
    {
        var nameClaim = new Claim(ClaimTypes.Name, name);
        return new ClaimsPrincipal(
            new ClaimsIdentity(new[] { nameClaim }, authenticationType: "TestCookie"));
    }

    /// <summary>Creates an HTTP context whose user is an authenticated principal named <paramref name="userName"/>.</summary>
    private static DefaultHttpContext ContextFor(string userName)
    {
        return new DefaultHttpContext { User = AuthenticatedUser(userName) };
    }

    /// <summary>Creates an accessor whose current context belongs to <paramref name="userName"/>.</summary>
    private static HttpContextAccessor AccessorFor(string userName)
    {
        return new HttpContextAccessor { HttpContext = ContextFor(userName) };
    }

    [Fact]
    public async Task GetAuthenticationStateAsync_ReturnsAuthenticatedUser_WhenHttpContextPresent()
    {
        var contextAccessor = AccessorFor("alice");

        var sut = new CookieAuthenticationStateProvider(contextAccessor);
        var authState = await sut.GetAuthenticationStateAsync();

        var identity = authState.User.Identity;
        Assert.True(identity?.IsAuthenticated);
        Assert.Equal("alice", identity?.Name);
    }

    [Fact]
    public async Task GetAuthenticationStateAsync_StillReturnsUser_AfterHttpContextIsGone()
    {
        // The provider is constructed while the HTTP request (and its context) is live.
        var contextAccessor = AccessorFor("bob");
        var sut = new CookieAuthenticationStateProvider(contextAccessor);

        // Once the request has completed, IHttpContextAccessor.HttpContext stays null
        // for the rest of the long-lived SignalR circuit.
        contextAccessor.HttpContext = null;

        var authState = await sut.GetAuthenticationStateAsync();

        // Before the fix, an anonymous principal came back at this point.
        var identity = authState.User.Identity;
        Assert.True(identity?.IsAuthenticated);
        Assert.Equal("bob", identity?.Name);
    }

    [Fact]
    public async Task GetAuthenticationStateAsync_IsStableAcrossCalls_IgnoringStaleForeignContext()
    {
        var contextAccessor = AccessorFor("carol");
        var sut = new CookieAuthenticationStateProvider(contextAccessor);

        // If a stale/foreign context leaks through the AsyncLocal accessor, it must NOT
        // alter what this circuit's provider reports.
        contextAccessor.HttpContext = ContextFor("intruder");

        var firstState = await sut.GetAuthenticationStateAsync();
        var secondState = await sut.GetAuthenticationStateAsync();

        Assert.Equal("carol", firstState.User.Identity?.Name);
        Assert.Equal("carol", secondState.User.Identity?.Name);
    }
}
|
||||||
@@ -0,0 +1,93 @@
|
|||||||
|
using System.Security.Claims;
|
||||||
|
using Microsoft.AspNetCore.Components.Authorization;
|
||||||
|
using ScadaLink.CentralUI.Auth;
|
||||||
|
using ScadaLink.Commons.Entities.Sites;
|
||||||
|
using ScadaLink.Security;
|
||||||
|
|
||||||
|
namespace ScadaLink.CentralUI.Tests.Auth;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Regression tests for CentralUI-002. Site-scoped Deployment permissions are
|
||||||
|
/// written as <c>SiteId</c> claims at login but were never read — Deployment
|
||||||
|
/// pages listed and acted on every site. <see cref="SiteScopeService"/> is the
|
||||||
|
/// shared helper that reads those claims; these tests pin its behaviour.
|
||||||
|
/// </summary>
|
||||||
|
public class SiteScopeServiceTests
{
    /// <summary>
    /// Authentication state provider that always reports the principal it was built with.
    /// </summary>
    private sealed class FixedAuthStateProvider : AuthenticationStateProvider
    {
        private readonly ClaimsPrincipal _principal;

        public FixedAuthStateProvider(ClaimsPrincipal principal)
        {
            _principal = principal;
        }

        public override Task<AuthenticationState> GetAuthenticationStateAsync()
        {
            return Task.FromResult(new AuthenticationState(_principal));
        }
    }

    /// <summary>
    /// Builds a <see cref="SiteScopeService"/> for a user holding the "Deployment" role
    /// claim followed by one site-id claim per entry in <paramref name="siteIds"/>.
    /// Passing no ids produces a user with the role claim only.
    /// </summary>
    private static SiteScopeService DeploymentUser(params int[] siteIds)
    {
        var claims = new List<Claim>
        {
            new Claim(JwtTokenService.RoleClaimType, "Deployment")
        };

        foreach (var siteId in siteIds)
        {
            claims.Add(new Claim(JwtTokenService.SiteIdClaimType, siteId.ToString()));
        }

        var identity = new ClaimsIdentity(claims, authenticationType: "TestCookie");
        var principal = new ClaimsPrincipal(identity);
        return new SiteScopeService(new FixedAuthStateProvider(principal));
    }

    /// <summary>
    /// Creates one <see cref="Site"/> per id, constructed with the strings
    /// <c>Site{id}</c> and <c>SITE-{id}</c>, in the order given.
    /// </summary>
    private static List<Site> Sites(params int[] ids)
    {
        var sites = new List<Site>();

        foreach (var id in ids)
        {
            sites.Add(new Site($"Site{id}", $"SITE-{id}") { Id = id });
        }

        return sites;
    }

    [Fact]
    public async Task DeploymentUserWithNoSiteClaims_IsSystemWide()
    {
        var scope = DeploymentUser();

        var systemWide = await scope.IsSystemWideAsync();
        var permittedIds = await scope.PermittedSiteIdsAsync();

        Assert.True(systemWide);
        Assert.Empty(permittedIds);
    }

    [Fact]
    public async Task SystemWideUser_FilterSites_ReturnsAllSites()
    {
        var scope = DeploymentUser();

        var visibleSites = await scope.FilterSitesAsync(Sites(1, 2, 3));

        var visibleIds = visibleSites.Select(site => site.Id);
        Assert.Equal(new[] { 1, 2, 3 }, visibleIds);
    }

    [Fact]
    public async Task ScopedUser_FilterSites_ReturnsOnlyPermittedSites()
    {
        // Regression: a Deployment user scoped to sites 1 and 3 must NOT see site 2.
        var scope = DeploymentUser(1, 3);

        var visibleSites = await scope.FilterSitesAsync(Sites(1, 2, 3, 4));

        var visibleIds = visibleSites.Select(site => site.Id).OrderBy(id => id);
        Assert.Equal(new[] { 1, 3 }, visibleIds);
    }

    [Fact]
    public async Task ScopedUser_IsSiteAllowed_OnlyForGrantedSites()
    {
        var scope = DeploymentUser(5);

        var grantedSiteAllowed = await scope.IsSiteAllowedAsync(5);
        Assert.True(grantedSiteAllowed);

        var otherSiteAllowed = await scope.IsSiteAllowedAsync(6);
        Assert.False(otherSiteAllowed);
    }

    [Fact]
    public async Task ScopedUser_IsNotSystemWide_AndReportsItsPermittedIds()
    {
        var scope = DeploymentUser(7, 9);

        var systemWide = await scope.IsSystemWideAsync();
        Assert.False(systemWide);

        var permittedIds = await scope.PermittedSiteIdsAsync();
        Assert.Equal(new[] { 7, 9 }, permittedIds.OrderBy(id => id));
    }

    [Fact]
    public async Task SystemWideUser_IsSiteAllowed_ForAnySite()
    {
        var scope = DeploymentUser();

        var firstAllowed = await scope.IsSiteAllowedAsync(1);
        Assert.True(firstAllowed);

        var arbitraryAllowed = await scope.IsSiteAllowedAsync(999);
        Assert.True(arbitraryAllowed);
    }
}
|
||||||
@@ -7,7 +7,7 @@ using Microsoft.Extensions.DependencyInjection;
|
|||||||
using NSubstitute;
|
using NSubstitute;
|
||||||
using ScadaLink.Commons.Entities.Sites;
|
using ScadaLink.Commons.Entities.Sites;
|
||||||
using ScadaLink.Commons.Interfaces.Repositories;
|
using ScadaLink.Commons.Interfaces.Repositories;
|
||||||
using DataConnectionForm = ScadaLink.CentralUI.Components.Pages.Admin.DataConnectionForm;
|
using DataConnectionForm = ScadaLink.CentralUI.Components.Pages.Design.DataConnectionForm;
|
||||||
|
|
||||||
namespace ScadaLink.CentralUI.Tests;
|
namespace ScadaLink.CentralUI.Tests;
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ using NSubstitute;
|
|||||||
using ScadaLink.CentralUI.Components.Shared;
|
using ScadaLink.CentralUI.Components.Shared;
|
||||||
using ScadaLink.Commons.Entities.Sites;
|
using ScadaLink.Commons.Entities.Sites;
|
||||||
using ScadaLink.Commons.Interfaces.Repositories;
|
using ScadaLink.Commons.Interfaces.Repositories;
|
||||||
using DataConnectionsPage = ScadaLink.CentralUI.Components.Pages.Admin.DataConnections;
|
using DataConnectionsPage = ScadaLink.CentralUI.Components.Pages.Design.DataConnections;
|
||||||
|
|
||||||
namespace ScadaLink.CentralUI.Tests;
|
namespace ScadaLink.CentralUI.Tests;
|
||||||
|
|
||||||
@@ -147,16 +147,18 @@ public class DataConnectionsPageTests : BunitContext
|
|||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void LegacyDataConnectionsRoute_IsDeclaredOnListPage()
|
public void DataConnectionsRoutes_AreDeclaredOnListPage()
|
||||||
{
|
{
|
||||||
// Old bookmarks to /admin/data-connections must still resolve.
|
// The page moved from Admin to Design; both the canonical
|
||||||
|
// /design/connections route and the /design/data-connections alias
|
||||||
|
// must resolve to the list page.
|
||||||
var routes = typeof(DataConnectionsPage).GetCustomAttributes(
|
var routes = typeof(DataConnectionsPage).GetCustomAttributes(
|
||||||
typeof(Microsoft.AspNetCore.Components.RouteAttribute), inherit: false)
|
typeof(Microsoft.AspNetCore.Components.RouteAttribute), inherit: false)
|
||||||
.Cast<Microsoft.AspNetCore.Components.RouteAttribute>()
|
.Cast<Microsoft.AspNetCore.Components.RouteAttribute>()
|
||||||
.Select(a => a.Template)
|
.Select(a => a.Template)
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
Assert.Contains("/admin/connections", routes);
|
Assert.Contains("/design/connections", routes);
|
||||||
Assert.Contains("/admin/data-connections", routes);
|
Assert.Contains("/design/data-connections", routes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user