69 Commits

Author SHA1 Message Date
Joseph Doherty 632d44f38c fix(host,deployment-manager,communication): repair cross-module DI regressions from batch 1-2
- DeploymentManager-008: revert IConfiguration overload (violated OptionsTests
  component-convention); Host now binds the ScadaLink:DeploymentManager section
- SiteStreamGrpcServer: make test-only int ctor internal so DI sees one public
  ctor (resolves ambiguous-constructor failure in SiteCompositionRootTests)
- Host site composition-root test config: supply Cluster:SeedNodes for the new
  ClusterOptionsValidator
2026-05-16 21:28:50 -04:00
Joseph Doherty 49fb85e92e docs(code-reviews): regenerate index after batch 3 medium fixes 2026-05-16 21:22:01 -04:00
Joseph Doherty 30ebbdd183 fix(security): resolve Security-004..007 — configurable user-id attribute, DN escaping, JWT issuer/audience validation, idle-timeout preservation 2026-05-16 21:22:01 -04:00
Joseph Doherty a702cb96a8 fix(notification-service): resolve NotificationService-005..009 — explicit TLS modes, per-credential token cache, timeout/throttle, address validation, credential redaction 2026-05-16 21:22:01 -04:00
Joseph Doherty 57679d49f2 fix(management-service): resolve ManagementService-004,006,007,013 — PipeTo dispatch, JsonDocument disposal, unified serialization, endpoint tests; re-triage MS-009 2026-05-16 21:22:01 -04:00
Joseph Doherty da955042aa fix(inbound-api): resolve InboundAPI-002,004,006,008 — disconnect vs timeout, body size limit, active-node gate; surface InboundAPI-007 2026-05-16 21:22:01 -04:00
Joseph Doherty 6563511b5f fix(host): resolve Host-003,004 — replace plaintext secrets with env placeholders, validate site seed-node ports; re-triage Host-002 2026-05-16 21:22:01 -04:00
Joseph Doherty 016bdf9c3c docs(code-reviews): regenerate index after batch 2 medium fixes 2026-05-16 21:11:24 -04:00
Joseph Doherty 9f634e37c3 fix(health-monitoring): resolve HealthMonitoring-003..009 — central offline grace, register unknown-site heartbeats, test coverage 2026-05-16 21:11:24 -04:00
Joseph Doherty 2502e4d10a fix(external-system-gateway): resolve ExternalSystemGateway-004..010 — honour retry settings, dispose HTTP messages, fix URL building, truncate error bodies, fix connection leak 2026-05-16 21:11:24 -04:00
Joseph Doherty 8c67ffad2a fix(deployment-manager): resolve DeploymentManager-003..011 — atomic status commit, orphan-delete handling, semaphore reclamation, structured diff, options binding, lifecycle test coverage 2026-05-16 21:11:24 -04:00
Joseph Doherty c9b236e507 fix(data-connection): resolve DataConnectionLayer-006..012 — quality-counter reconciliation, per-tag batch reads, configurable failover threshold, dedup retry, stale-callback guard, secure cert default 2026-05-16 21:11:24 -04:00
Joseph Doherty 0c82ffcbe6 fix(configuration-database): resolve ConfigurationDatabase-002..007 — remove hardcoded sa creds, fail-fast no-arg DI, encrypt secret columns, resilient audit serialization 2026-05-16 21:11:24 -04:00
Joseph Doherty 8fc04d43c2 docs(code-reviews): regenerate index after batch 1 medium fixes; fix CentralUI-014 severity field format 2026-05-16 20:58:29 -04:00
Joseph Doherty 31a6995d24 fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection 2026-05-16 20:58:03 -04:00
Joseph Doherty 3e7a3d7e31 fix(commons): resolve Commons-001..004 — stale-fire race, JsonDocument lifetime, GetNullable strictness, registry symmetry 2026-05-16 20:58:03 -04:00
Joseph Doherty dba1a1b25f fix(cluster-infrastructure): resolve ClusterInfrastructure-002..006 — options validation, DI registration, down-if-alone 2026-05-16 20:58:03 -04:00
Joseph Doherty 71b90ba499 fix(central-ui): resolve CentralUI-007..014 — nav authz, UTC date filters, disposal guards, N+1 fix, async script analysis 2026-05-16 20:58:03 -04:00
Joseph Doherty 738e67acc5 fix(cli): resolve CLI-002..007 — robust response rendering, URL/JSON arg validation, credential env-vars, doc refresh 2026-05-16 20:58:03 -04:00
Joseph Doherty 658b659c0c docs(code-reviews): regenerate index — all High findings resolved or re-triaged 2026-05-16 20:12:24 -04:00
Joseph Doherty 305b42ea6d feat(template-engine): resolve TemplateEngine-002 — per-slot alarm override for derived templates
Adds IsInherited/LockedInDerived to the TemplateAlarm entity (mirroring the
attribute/script override model), an EF migration, base-alarm copy-on-derive,
inherited-alarm flattening skip, and LockedInDerived override-rejection validation.
2026-05-16 20:12:24 -04:00
Joseph Doherty bc548e1447 feat(deployment-manager): resolve DeploymentManager-006 — query site deployment state before redeploy and reconcile
Adds DeploymentStateQuery request/response contracts (Commons), a site-side
handler (SiteRuntime), a CommunicationService query method (Communication), and
reconciliation in DeploymentService: when a prior record is InProgress or
Failed-on-timeout, query the site; if it already holds the target revision hash
mark the record Success without re-sending; on query failure fall through to a
normal deploy (site-side stale-rejection is the safety net).
2026-05-16 20:12:24 -04:00
Joseph Doherty cac8aebe9f docs(cluster-infrastructure): resolve ClusterInfrastructure-001 — document that the Host owns the Akka bootstrap 2026-05-16 20:12:24 -04:00
Joseph Doherty 2ba5d5d578 docs(code-reviews): regenerate index after batch 4 High fixes; normalize re-triaged SF-002 severity field 2026-05-16 19:57:54 -04:00
Joseph Doherty 74aae53500 fix(template-engine): resolve TemplateEngine-001/003/004/005, re-triage 002 — recursive composed flattening, fixed-field guard, alarm script refs, dead collision query 2026-05-16 19:57:28 -04:00
Joseph Doherty 71c0564ec0 fix(store-and-forward): resolve StoreAndForward-003, re-triage 002 — fix retry-count off-by-one 2026-05-16 19:57:28 -04:00
Joseph Doherty 09b4bd5dfa fix(site-runtime): resolve SiteRuntime-001/002/003 — route data-sourced writes to DCL, real per-attribute API results, race-free redeploy 2026-05-16 19:57:28 -04:00
Joseph Doherty 1ae11d1135 docs(code-reviews): regenerate index after batch 3 High fixes; fix regen-readme.py to parse the Won't Fix status 2026-05-16 19:48:17 -04:00
Joseph Doherty 0529cf2d40 fix(site-event-logging): resolve SiteEventLogging-001/002/003, re-triage 004 — incremental auto_vacuum, cap-purge guard, write-lock connection access 2026-05-16 19:47:51 -04:00
Joseph Doherty 0d9363766d fix(security): resolve Security-001/002/003 — reachable StartTLS path, Secure cookie, JWT signing key validation 2026-05-16 19:47:17 -04:00
Joseph Doherty 393172f169 fix(notification-service): resolve NotificationService-002/003/004 — error classification by SMTP status code, single SMTP client 2026-05-16 19:47:17 -04:00
Joseph Doherty b249ca3bf7 fix(management-service): resolve ManagementService-001/002/003 — enforce site scope on query/snapshot handlers and DebugStreamHub 2026-05-16 19:47:17 -04:00
Joseph Doherty 6f4efdfa2e fix(inbound-api): resolve InboundAPI-001/003/005 — concurrent handler cache, constant-time API key compare, script trust-model enforcement 2026-05-16 19:47:17 -04:00
Joseph Doherty d30ded7e72 docs(code-reviews): regenerate index after batch 2 High fixes 2026-05-16 19:40:40 -04:00
Joseph Doherty a0e6a36e79 fix(host): resolve Host-001 — exclude leader-only active-node check from /health/ready 2026-05-16 19:40:40 -04:00
Joseph Doherty 7d7214a4ca fix(health-monitoring): resolve HealthMonitoring-001/002 — populate S&F buffer depth, make SiteHealthState immutable 2026-05-16 19:40:40 -04:00
Joseph Doherty 340a70f0e6 fix(external-system-gateway): resolve ExternalSystemGateway-002/003 — apply HTTP call timeout, confirm CachedCall no double-dispatch 2026-05-16 19:40:40 -04:00
Joseph Doherty ab098bf6c8 fix(deployment-manager): resolve DeploymentManager-001/002 — broaden failure catch, persist failure status with non-cancellable token 2026-05-16 19:40:40 -04:00
Joseph Doherty fccd3274d3 fix(data-connection-layer): resolve DataConnectionLayer-002/003/004/005 — Resume supervision, concurrent dicts, subscribe-failure classification, write timeout 2026-05-16 19:40:40 -04:00
Joseph Doherty d7630d80fe docs(code-reviews): regenerate index after batch 1 High fixes 2026-05-16 19:33:11 -04:00
Joseph Doherty db08c6eb38 docs(code-reviews): re-triage ClusterInfrastructure-001 — bootstrap lives in Host, needs design decision 2026-05-16 19:33:09 -04:00
Joseph Doherty 9043f0089b fix(configuration-database): resolve ConfigurationDatabase-001 — remove dead child-template query in GetTemplateWithChildrenAsync 2026-05-16 19:33:09 -04:00
Joseph Doherty 301e7fb854 fix(communication): resolve Communication-002/003 — gRPC reconnect stream cleanup and subscription map safety 2026-05-16 19:33:09 -04:00
Joseph Doherty 87f14c190a fix(central-ui): resolve CentralUI-002/003/004 — site-scope enforcement, per-circuit console capture, cached auth state 2026-05-16 19:33:09 -04:00
Joseph Doherty 5a08b04535 fix(cli): resolve CLI-001 — honor SCADALINK_FORMAT and config-file format precedence 2026-05-16 19:33:09 -04:00
Joseph Doherty d8f99ba781 docs(code-reviews): add regen-readme.py to generate the review index
README.md is now generated from the per-module findings.md files by
code-reviews/regen-readme.py (discovers modules, parses each finding's
severity/status, rebuilds the Pending Findings and Module Status tables).
Run with --check to fail when README.md is stale (CI-friendly).

REVIEW-PROCESS.md section 5 now points to the script instead of describing
a manual edit, and README.md carries a generated-file banner.
2026-05-16 19:18:18 -04:00
Joseph Doherty 91438dcc1b fix(store-and-forward): create the SQLite database directory on init (StoreAndForward-014)
StoreAndForwardStorage.InitializeAsync opened a SqliteConnection against the
configured SqliteDbPath (default ./data/store-and-forward.db) without ensuring
the parent directory exists. SQLite creates the database file but not its
directory, so when data/ was absent the connection failed with
"SQLite Error 14: unable to open database file" — aborting the site host's
RegisterSiteActors at StoreAndForwardService.StartAsync.

This was the root cause of the six failing SiteActorPathTests. Production
masked it because the Docker image / deployment creates data/.

InitializeAsync now calls EnsureDatabaseDirectoryExists, which parses the
connection string and creates the parent directory of a file-backed database
(in-memory databases and bare filenames are skipped).

Regression test InitializeAsync_FileInMissingDirectory_CreatesDirectory fails
against the pre-fix code. Host suite now 155/155 green (was 149/155).
2026-05-16 19:13:00 -04:00
Joseph Doherty 61253e3269 fix(store-and-forward): resolve S&F delivery + replication wiring (3 Critical findings)
Resolves StoreAndForward-001, ExternalSystemGateway-001, NotificationService-001
— one systemic gap where buffered messages were persisted but never delivered,
and the active node never replicated its buffer to the standby.

Delivery handlers (ExternalSystemGateway-001 / NotificationService-001):
- AkkaHostedService registers delivery handlers for the ExternalSystem,
  CachedDbWrite and Notification categories after StoreAndForwardService starts;
  each resolves its scoped consumer in a fresh DI scope.
- ExternalSystemClient, DatabaseGateway and NotificationDeliveryService each
  gain a DeliverBufferedAsync method: re-resolve the target and re-attempt
  delivery, returning true/false/throwing per the transient-vs-permanent contract.
- EnqueueAsync gains an attemptImmediateDelivery flag; CachedCallAsync and
  NotificationDeliveryService.SendAsync pass false (they already attempted
  delivery themselves) so registering a handler does not dispatch twice.

Replication (StoreAndForward-001):
- ReplicationService is injected into StoreAndForwardService; a new BufferAsync
  helper replicates every enqueue, and removals after a successful retry and
  park operations are replicated too. Replication is fire-and-forget and is a
  no-op when replication is disabled.

Tests: StoreAndForwardReplicationTests (Add/Remove/Park observed),
attemptImmediateDelivery behaviour, and DeliverBufferedAsync paths for each
consumer. Full solution builds; StoreAndForward/ExternalSystemGateway/
NotificationService suites green.
2026-05-16 18:58:11 -04:00
Joseph Doherty a9bd7ee37c fix(central-ui): resolve CentralUI-001 — enforce script trust model before sandbox execution
ScriptAnalysisService.RunInSandboxAsync compiled and executed arbitrary
user C# in the central host process with no trust-model enforcement — the
forbidden-API set was only a Monaco editor diagnostic. A Design-role user
could run System.IO/Process/Reflection/network code on the central node.

Added a Roslyn semantic gate (EnforceTrustModel) invoked after compilation
and before script.RunAsync, and on nested shared scripts in callSharedFunc;
a script referencing any forbidden API is rejected before it runs.

Reworked FindForbiddenApiUsages: it now resolves every identifier against
the semantic model and checks types and members, so a fully-qualified call
(System.IO.File.WriteAllText) is caught — the pre-fix check only inspected
the leftmost identifier and missed that shape. This is a static semantic
gate, not a process sandbox.

Adds gate regression tests that fail against the pre-fix code, plus a
clean-script test guarding against over-blocking.
2026-05-16 18:41:12 -04:00
Joseph Doherty a9ceba00d0 fix(communication): resolve Communication-001 — early stream termination handling
DebugStreamService.StartStreamAsync awaited the initial debug snapshot inside
a try whose only handler was catch (OperationCanceledException). When the
stream terminated before the snapshot arrived, onTerminatedWrapper completed
the await with an InvalidOperationException that escaped the catch — the
caller got a raw, untranslated exception and the service did no teardown of
its own on that path.

Replaced with catch (Exception): it removes the session entry, sends
StopDebugStream to the bridge actor via the local reference (deterministic
teardown, idempotent), and throws a descriptive exception — TimeoutException
for the 30s timeout, otherwise an InvalidOperationException naming the
instance/site and wrapping the cause.

Re-triaged Critical -> Medium: the originally-claimed multi-minute site-side
resource leak does not occur (the bridge actor self-terminates on every
onTerminated path). Adds the first DebugStreamService test, which fails
against the pre-fix code.
2026-05-16 18:32:52 -04:00
Joseph Doherty 239bee3bc4 fix(data-connection): resolve DataConnectionLayer-001 — off-thread actor state mutation
HandleSubscribe spawned a Task.Run that mutated DataConnectionActor private
state (_subscriptionIds, _subscriptionsByInstance, _totalSubscribed,
_resolvedTags, _unresolvedTags) from a thread-pool thread, racing the actor's
own message loop — a data race on non-thread-safe Dictionary/HashSet and
non-atomic counters.

Restructured HandleSubscribe to follow the actor's existing PipeTo(Self)
pattern: the background task now performs only adapter I/O and pipes a
SubscribeCompleted message to Self; all subscription-state mutation happens
in the new HandleSubscribeCompleted handler on the actor thread (wired into
the Connected, Connecting and Reconnecting states).

Adds DCL001_ConcurrentSubscribes_DoNotCorruptSubscriptionCounters (30x30
concurrent subscribes) which fails against the pre-fix code and passes after.
2026-05-16 18:26:43 -04:00
Joseph Doherty 977d7369a7 docs: add code review process and baseline review of all 19 modules
Establishes a per-module code review workflow under code-reviews/ and
records the 2026-05-16 baseline review (commit 9c60592): 241 findings
across all src/ modules (6 Critical, 46 High, 100 Medium, 89 Low).
This is the clean starting point for remediation work.
2026-05-16 18:09:09 -04:00
Joseph Doherty 9c60592632 build: adopt NuGet Central Package Management
Move all package versions into Directory.Packages.props so every project
resolves a single consistent version. Consolidates the Roslyn packages
(Microsoft.CodeAnalysis.CSharp.Scripting/Workspaces) onto 5.0.0, which
resolves the pre-existing NU1608 version-skew error in the test projects.
2026-05-16 15:56:30 -04:00
Joseph Doherty fd1518f4f4 test(central-ui): remove vacuous tests for removed analyzer diagnostics
Six tests asserted DoesNotContain(SCADA004/SCADA005) or an empty InlayHints
result — all pass for the wrong reason now that those diagnostics and the
positional InlayHints were removed in the analyzer realignment. They also
used the obsolete top-level CallScript syntax. Removed.
2026-05-16 15:06:30 -04:00
Joseph Doherty b949dc4183 test(central-ui): realign analyzer tests with the reworked script-call API 2026-05-16 15:04:06 -04:00
Joseph Doherty 3cc174c3cd test(central-ui): fix the CentralUI.Tests build
Two stale references blocked compilation: the DataConnection page tests
still pointed at Components.Pages.Admin (the pages moved to .Design), and
ScriptAnalysisServiceTests constructed ScriptAnalysisService without the
IServiceProvider parameter. The project now compiles.
2026-05-16 14:44:30 -04:00
Joseph Doherty d030153378 test(site-runtime): fix stale SetStaticAttribute tests
HandleSetStaticAttribute was made fire-and-forget (commit 2951507) — it no
longer replies with SetStaticAttributeResponse — but three InstanceActor
tests still called ExpectMsg<SetStaticAttributeResponse> and timed out. The tests now verify the
mutation via the GetAttributeRequest round-trip instead, which the FIFO
mailbox makes a sound sync point. Test intent (in-memory update, SQLite
persistence, serialized ordering) is unchanged.
2026-05-16 14:33:09 -04:00
Joseph Doherty d63d412461 test(triggers): expect AlarmTriggerType.Expression in the enum membership test 2026-05-16 06:42:17 -04:00
Joseph Doherty 0a535cd4a5 fix(triggers): don't false-flag Children/Parent attribute refs in expression validation 2026-05-16 06:08:06 -04:00
Joseph Doherty 5065384305 fix(triggers): use explicit ValidationCategory + tighten expression syntax validation 2026-05-16 05:57:39 -04:00
Joseph Doherty bf3f572ad9 feat(triggers): validate expression triggers pre-deployment 2026-05-16 05:52:25 -04:00
Joseph Doherty 3499d76f14 feat(ui/triggers): expression trigger panel in the script & alarm editors 2026-05-16 05:46:27 -04:00
Joseph Doherty 78b10d00d8 fix(triggers): bound expression evaluation, align AlarmActor error handling, dedupe config parsing 2026-05-16 05:43:18 -04:00
Joseph Doherty 41c3fa3d84 fix(triggers): seed the trigger-expression attribute snapshot at actor startup 2026-05-16 05:38:50 -04:00
Joseph Doherty 9e21b47080 feat(triggers): runtime expression trigger evaluation for scripts and alarms 2026-05-16 05:35:02 -04:00
Joseph Doherty f789ab4a91 docs(triggers): list the Expression config shape in the codec summaries 2026-05-16 05:30:12 -04:00
Joseph Doherty 199cdbe798 feat(triggers): add Expression to the script & alarm trigger codecs 2026-05-16 05:27:33 -04:00
Joseph Doherty 8050a1996f docs(plans): implementation plan for expression triggers 2026-05-16 05:25:10 -04:00
Joseph Doherty c94d3b7570 docs(plans): design for expression-based script & alarm triggers
Captures the brainstormed design for a new Expression trigger: a read-only
boolean C# expression evaluated on attribute updates, edge-triggered for
scripts and level-based for alarms, compiled against a restricted read-only
globals type.
2026-05-16 05:21:57 -04:00
283 changed files with 28523 additions and 1414 deletions
+4
View File
@@ -32,3 +32,7 @@ TestResults/
**/logs/
site_events.db
data/
# Claude Code local files
.claude/settings.local.json
.claude/scheduled_tasks.lock
+69
View File
@@ -0,0 +1,69 @@
<Project>
<PropertyGroup>
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
</PropertyGroup>
<ItemGroup>
<PackageVersion Include="Akka" Version="1.5.62" />
<PackageVersion Include="Akka.Cluster" Version="1.5.62" />
<PackageVersion Include="Akka.Cluster.Hosting" Version="1.5.62" />
<PackageVersion Include="Akka.Cluster.Tools" Version="1.5.62" />
<PackageVersion Include="Akka.Hosting" Version="1.5.62" />
<PackageVersion Include="Akka.Remote" Version="1.5.62" />
<PackageVersion Include="Akka.Remote.Hosting" Version="1.5.62" />
<PackageVersion Include="Akka.Streams" Version="1.5.62" />
<PackageVersion Include="Akka.Streams.TestKit" Version="1.5.62" />
<PackageVersion Include="Akka.TestKit.Xunit2" Version="1.5.62" />
<PackageVersion Include="AspNetCore.HealthChecks.UI.Client" Version="9.0.0" />
<PackageVersion Include="bunit" Version="2.0.33-preview" />
<PackageVersion Include="coverlet.collector" Version="6.0.4" />
<PackageVersion Include="FluentAssertions" Version="8.3.0" />
<PackageVersion Include="Google.Protobuf" Version="3.29.3" />
<PackageVersion Include="Grpc.AspNetCore" Version="2.71.0" />
<PackageVersion Include="Grpc.Net.Client" Version="2.71.0" />
<PackageVersion Include="Grpc.Tools" Version="2.71.0" />
<PackageVersion Include="MailKit" Version="4.16.0" />
<PackageVersion Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.Authorization" Version="10.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.DataProtection" Version="10.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" Version="10.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.Mvc.Testing" Version="10.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.SignalR.Client" Version="9.0.3" />
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="5.0.0" />
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Workspaces" Version="5.0.0" />
<PackageVersion Include="Microsoft.Data.SqlClient" Version="6.0.2" />
<PackageVersion Include="Microsoft.Data.Sqlite" Version="10.0.7" />
<PackageVersion Include="Microsoft.EntityFrameworkCore" Version="10.0.7" />
<PackageVersion Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.7" />
<PackageVersion Include="Microsoft.EntityFrameworkCore.InMemory" Version="10.0.7" />
<PackageVersion Include="Microsoft.EntityFrameworkCore.Sqlite" Version="10.0.7" />
<PackageVersion Include="Microsoft.EntityFrameworkCore.SqlServer" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Configuration.Json" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.DependencyInjection" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Hosting.Abstractions" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Hosting.WindowsServices" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Http" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Logging" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Options" Version="10.0.7" />
<PackageVersion Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="10.0.7" />
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
<PackageVersion Include="Microsoft.Playwright" Version="1.58.0" />
<PackageVersion Include="Moq" Version="4.20.72" />
<PackageVersion Include="Novell.Directory.Ldap.NETStandard" Version="3.6.0" />
<PackageVersion Include="NSubstitute" Version="5.3.0" />
<PackageVersion Include="OPCFoundation.NetStandard.Opc.Ua.Client" Version="1.5.378.106" />
<PackageVersion Include="OpenTelemetry.Api" Version="1.15.3" />
<PackageVersion Include="Serilog" Version="4.3.1" />
<PackageVersion Include="Serilog.AspNetCore" Version="10.0.0" />
<PackageVersion Include="Serilog.Sinks.Console" Version="6.1.1" />
<PackageVersion Include="Serilog.Sinks.File" Version="7.0.0" />
<PackageVersion Include="System.CommandLine" Version="2.0.5" />
<PackageVersion Include="System.IdentityModel.Tokens.Jwt" Version="8.11.0" />
<PackageVersion Include="xunit" Version="2.9.3" />
<PackageVersion Include="xunit.runner.visualstudio" Version="3.1.4" />
</ItemGroup>
</Project>
+1 -1
View File
@@ -46,7 +46,7 @@ This document serves as the master index for the SCADA system design. The system
| 10 | Security & Auth | [docs/requirements/Component-Security.md](docs/requirements/Component-Security.md) | Direct LDAP bind (LDAPS/StartTLS), JWT sessions (HMAC-SHA256, 15-min refresh, 30-min idle), role-based authorization, site-scoped permissions. |
| 11 | Health Monitoring | [docs/requirements/Component-HealthMonitoring.md](docs/requirements/Component-HealthMonitoring.md) | 30s report interval, 60s offline threshold, monotonic sequence numbers, raw error counts, tag resolution counts, dead letter monitoring. |
| 12 | Site Event Logging | [docs/requirements/Component-SiteEventLogging.md](docs/requirements/Component-SiteEventLogging.md) | SQLite storage, 30-day retention + 1GB cap, daily purge, paginated remote queries with keyword search. |
| 13 | Cluster Infrastructure | [docs/requirements/Component-ClusterInfrastructure.md](docs/requirements/Component-ClusterInfrastructure.md) | Akka.NET cluster, keep-oldest SBR with down-if-alone, min-nr-of-members=1, 2s/10s/15s failure detection, CoordinatedShutdown, automatic dual-node recovery. |
| 13 | Cluster Infrastructure | [docs/requirements/Component-ClusterInfrastructure.md](docs/requirements/Component-ClusterInfrastructure.md) | Akka.NET cluster, keep-oldest SBR with down-if-alone, min-nr-of-members=1, 2s/10s/15s failure detection, CoordinatedShutdown, automatic dual-node recovery. The `ClusterInfrastructure` project owns the `ClusterOptions` config model; the Akka bootstrap/SBR/CoordinatedShutdown wiring lives in the Host. |
| 14 | Inbound API | [docs/requirements/Component-InboundAPI.md](docs/requirements/Component-InboundAPI.md) | POST /api/{methodName}, X-API-Key header, flat JSON, extended type system (Object/List), script-based implementations, failures-only logging. |
| 15 | Host | [docs/requirements/Component-Host.md](docs/requirements/Component-Host.md) | Single deployable binary, role-based component registration, per-component config binding (Options pattern), readiness gating, dead letter monitoring, Akka.NET bootstrap, ASP.NET Core hosting for central. |
| 16 | Commons | [docs/requirements/Component-Commons.md](docs/requirements/Component-Commons.md) | Namespace/folder convention (Types/Interfaces/Entities/Messages), shared data types, POCOs, repository interfaces, message contracts with additive-only versioning, UTC timestamp convention. |
+490
View File
@@ -0,0 +1,490 @@
# Code Review — CLI
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.CLI` |
| Design doc | `docs/requirements/Component-CLI.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 6 |
## Summary
The CLI is a small, well-structured HTTP client over the Management API. The command-tree
construction is consistent and repetitive in a good way: every subcommand funnels through
`CommandHelpers.ExecuteCommandAsync`, which centralizes URL/credential resolution, HTTP
dispatch, and response handling. There are no Akka.NET concerns (the CLI is a pure HTTP
client) and no concurrency-sensitive code apart from the `debug stream` SignalR handler.
The dominant theme is **graceful-degradation gaps**: several user-supplied inputs (malformed
URLs, malformed `--bindings`/`--overrides` JSON, non-JSON success bodies) are deserialized
or constructed without `try/catch`, so a normal user mistake surfaces as an unhandled
exception with a stack trace instead of a clean error message and exit code 1. A second
theme is **dead configuration**: the `SCADALINK_FORMAT` environment variable and the
`defaultFormat` config-file field are loaded by `CliConfig` but never consulted by any
command, so the documented format-precedence chain does not work. The third theme is
**substantial design-document drift**: `Component-CLI.md` describes a name-keyed,
`--file`-based command surface that bears little resemblance to the implemented
ID-keyed, flag-based surface. Test coverage exercises `OutputFormatter`, `CliConfig`, and
`CommandHelpers.HandleResponse`, but the HTTP client, the `debug stream` path, the JSON
argument parsing, and the command-tree wiring are untested.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | Format precedence is broken (CLI-001); empty/non-JSON success bodies crash table rendering (CLI-002, CLI-003). |
| 2 | Akka.NET conventions | ☑ | Not applicable — CLI is a pure HTTP/SignalR client with no Akka.NET runtime (design doc confirms). No issues. |
| 3 | Concurrency & thread safety | ☑ | Only `debug stream` is concurrent; `CancellationTokenSource` is never disposed (CLI-011). Exit-code resolution after Ctrl+C is loose (CLI-012). |
| 4 | Error handling & resilience | ☑ | Unhandled exceptions on malformed URL (CLI-004) and malformed JSON arguments (CLI-005); `StartAsync` cancellation is misreported (CLI-010). |
| 5 | Security | ☑ | `--password` on the command line leaks into process listings / shell history with no env-var or prompt alternative (CLI-006). |
| 6 | Performance & resource management | ☑ | `HttpClient` per invocation is acceptable for a one-shot CLI. `CancellationTokenSource` leak noted in CLI-011. |
| 7 | Design-document adherence | ☑ | `Component-CLI.md` is heavily stale relative to the implemented command surface (CLI-007). |
| 8 | Code organization & conventions | ☑ | Consistent and clean; `CliConfig.DefaultFormat` is loaded but unused (covered by CLI-001). Minor: `--format` not validated (CLI-008). |
| 9 | Testing coverage | ☑ | No tests for `ManagementHttpClient`, `DebugCommands`, command-tree wiring, or JSON argument parsing (CLI-013). |
| 10 | Documentation & comments | ☑ | `Component-CLI.md` mismatch (CLI-007); the in-repo `README.md` is reasonably accurate. Minor exit-code doc mismatch (CLI-009). |
## Findings
### CLI-001 — `SCADALINK_FORMAT` env var and config-file format are dead; format precedence broken
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:18`, `src/ScadaLink.CLI/Commands/DebugCommands.cs:45`, `src/ScadaLink.CLI/CliConfig.cs:37-39` |
**Description**
`CliConfig.Load()` reads `SCADALINK_FORMAT` and the `defaultFormat` config-file field into
`CliConfig.DefaultFormat`, and `Component-CLI.md` documents a format-precedence chain
(command-line option → env var → config file). However, every command resolves the format
with `var format = result.GetValue(formatOption) ?? "json";` and `formatOption` is created
in `Program.cs:11` with `DefaultValueFactory = _ => "json"`. `GetValue` therefore always
returns a non-null value ("json" when the flag is absent), so the `?? "json"` fallback never
fires and `config.DefaultFormat` is never consulted. The env var and config-file format
settings are dead code: `scadalink site list` always outputs JSON regardless of
`SCADALINK_FORMAT=table` or a `defaultFormat` entry in `~/.scadalink/config.json`. The
documented behaviour silently does not work.
**Recommendation**
Either remove the `--format` option's `DefaultValueFactory` and have `CommandHelpers`
resolve precedence explicitly (`result.GetValue(formatOption)` → `config.DefaultFormat`),
or detect whether the option was explicitly supplied (`result.GetResult(formatOption)`) and
only then override the config value. Apply the same fix to `DebugCommands.BuildStream`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Removed the `--format` option's
`DefaultValueFactory` in `Program.cs` and added `CommandHelpers.ResolveFormat`, which uses
`ParseResult.GetResult(formatOption)` to detect an explicitly supplied flag and resolves
precedence explicitly: explicit `--format` → `CliConfig.DefaultFormat` (env var / config
file) → `"json"`. Both `CommandHelpers.ExecuteCommandAsync` and `DebugCommands.BuildStream`
now call `ResolveFormat`. Regression tests added in `FormatResolutionTests`.
### CLI-002 — Empty success body crashes table rendering with an unhandled exception
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:59-68`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:78-80` |
**Description**
`ManagementHttpClient.SendCommandAsync` returns `JsonData = responseBody` for any
success status code, including a 200/204 with an empty body. `HandleResponse` then tests
`response.JsonData != null` — an empty string is non-null — and for `--format table`
calls `WriteAsTable(response.JsonData)`, which immediately does `JsonDocument.Parse(json)`.
`JsonDocument.Parse("")` throws `JsonException`, which is not caught anywhere, so a
command that legitimately returns no body (e.g. a delete that returns 204) terminates with
a stack trace instead of a clean success message.
**Recommendation**
In `HandleResponse`, treat a null-or-whitespace `JsonData` as a "command succeeded, no
output" case (print nothing or `(ok)`), and return 0 before attempting to parse.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source —
`HandleResponse` tested `JsonData != null`, so an empty success body fell through to
`WriteAsTable` → `JsonDocument.Parse("")` and threw an uncaught `JsonException`.
`HandleResponse` now treats a null-or-whitespace `JsonData` as a "succeeded, no output"
case, prints `(ok)`, and returns 0 before any parse. Regression tests added in
`ResponseRenderingTests` (`HandleResponse_EmptyBody_TableFormat_DoesNotThrow_ReturnsZero`,
`HandleResponse_EmptyBody_JsonFormat_DoesNotThrow_ReturnsZero`).
### CLI-003 — Non-JSON success body crashes table rendering
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Commands/CommandHelpers.cs:80` |
**Description**
`WriteAsTable` calls `JsonDocument.Parse(json)` with no `try/catch`. If the server returns
a success status but a body that is not valid JSON (a proxy/HTML error page returned with
a 200, a plain-text message, etc.), the CLI throws an unhandled `JsonException`. The
error-path code in `ManagementHttpClient` (lines 52-61) already defensively wraps
`JsonDocument.Parse` in a `try/catch`; the success path and `WriteAsTable` do not get the
same treatment.
**Recommendation**
Wrap the `JsonDocument.Parse` in `WriteAsTable` in a `try/catch`; on failure, fall back to
printing the raw body verbatim (as the JSON path already does at line 66).
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed — `WriteAsTable` parsed the
body with no `try/catch`. The `JsonDocument.Parse` call is now wrapped in a
`try/catch (JsonException)` that prints the raw body verbatim on failure, mirroring the
raw-body fallback on the JSON path. Regression test
`ResponseRenderingTests.HandleResponse_NonJsonBody_TableFormat_FallsBackToRaw_ReturnsZero`.
### CLI-004 — Malformed `--url` throws an unhandled `UriFormatException`
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/ManagementHttpClient.cs:13` |
**Description**
The `ManagementHttpClient` constructor does `new Uri(baseUrl.TrimEnd('/') + "/")` with no
validation. If the user passes a malformed URL (e.g. `--url localhost:9001` without a
scheme, or `--url ""`), `new Uri(...)` throws `UriFormatException`. This call is not
guarded by the `try/catch` in `SendCommandAsync` (it happens in the constructor at
`CommandHelpers.cs:50`), so a common typo terminates the CLI with a stack trace rather
than the documented "connection failure → exit 1 with a descriptive message".
**Recommendation**
Validate the URL before constructing the client — e.g. `Uri.TryCreate(url, UriKind.Absolute, out _)` in `CommandHelpers.ExecuteCommandAsync` and `DebugCommands.BuildStream` — and emit a
clean `INVALID_URL` error with exit code 1 on failure.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed — the
`ManagementHttpClient` constructor's `new Uri(...)` ran outside the `SendCommandAsync`
`try/catch`. Added `CommandHelpers.IsValidManagementUrl`, which checks for an absolute
http/https URL via `Uri.TryCreate`. Both `CommandHelpers.ExecuteCommandAsync` and
`DebugCommands.BuildStream` now validate the resolved URL up front and emit a clean
`INVALID_URL` error with exit code 1. Regression tests in `UrlValidationTests`.
### CLI-005 — Malformed `--bindings` / `--overrides` JSON throws unhandled exceptions
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Commands/InstanceCommands.cs:55-58`, `src/ScadaLink.CLI/Commands/InstanceCommands.cs:181-182` |
**Description**
`set-bindings` deserializes the `--bindings` argument with
`JsonSerializer.Deserialize<List<List<JsonElement>>>(...)` and then indexes `p[0]`/`p[1]`
and calls `p[0].GetString()!` / `p[1].GetInt32()`. `set-overrides` deserializes `--overrides`
with `JsonSerializer.Deserialize<Dictionary<string, string?>>(...)`. None of this is wrapped
in a `try/catch`. Invalid JSON throws `JsonException`; a pair with fewer than two elements
throws `ArgumentOutOfRangeException`; a non-string/non-int element throws `InvalidOperationException`. All of these surface as raw stack traces, so a user typo in a JSON argument
crashes the CLI instead of producing a clean validation error and exit code 1.
**Recommendation**
Wrap the parsing in `try/catch (JsonException ...)` (and guard the pair length / element
kinds), and on failure call `OutputFormatter.WriteError(...)` with an `INVALID_ARGUMENT`
code and return 1.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed — both `set-bindings` and
`set-overrides` deserialized and indexed JSON inline with no `try/catch`. Extracted the
parsing into testable `InstanceCommands.TryParseBindings` / `TryParseOverrides` helpers
that catch `JsonException`, guard against null results, and (for bindings) validate pair
arity and element kinds (`JsonValueKind`) instead of letting `ArgumentOutOfRangeException`
/ `InvalidOperationException` escape. The command actions now emit a clean
`INVALID_ARGUMENT` error and return 1 on failure. Regression tests in
`InstanceArgumentParsingTests` (8 tests covering valid input, malformed JSON, short pairs,
wrong element types, and JSON null).
### CLI-006 — Password is passed as a command-line argument with no safer alternative
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.CLI/Program.cs:9`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:36-44` |
**Description**
Credentials are supplied only via `--username` / `--password`. A password on the command
line is visible to any local user via the process list (`ps`, `/proc/<pid>/cmdline`) and is
typically persisted into shell history. Unlike the management URL — which can also come
from `SCADALINK_MANAGEMENT_URL` or the config file — there is no environment-variable
fallback, no `--password-stdin`, and no interactive prompt for the password. For a tool
explicitly intended for CI/CD automation this materially increases the chance of credential
leakage.
**Recommendation**
Add a `SCADALINK_PASSWORD` environment variable fallback and/or a `--password-stdin`
option (read the password from stdin), and document that `--password` on the command line
is discouraged. Optionally prompt interactively when stdin is a TTY and no password was
supplied.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed — credentials had no
non-command-line source. Added `SCADALINK_USERNAME` / `SCADALINK_PASSWORD` environment
fallbacks: `CliConfig.Load` now reads them into new `CliConfig.Username` / `Password`
properties (credentials are sourced from environment variables only, never the config
file, so they are not persisted). `CommandHelpers.ResolveCredential` resolves precedence
(explicit `--username`/`--password` → env var); both `ExecuteCommandAsync` and
`DebugCommands.BuildStream` use it. The design doc and the in-repo `README.md` now
document that `--password` on the command line is discouraged. The `--password-stdin`
option / interactive prompt was not added — the env-var fallback fully satisfies the
CI/CD safe-credential need; a stdin/prompt variant can be a follow-up if interactive use
demands it. Regression tests in `CredentialResolutionTests`.
### CLI-007 — `Component-CLI.md` command surface is substantially stale
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `docs/requirements/Component-CLI.md:51-211` (vs. all files under `src/ScadaLink.CLI/Commands/`) |
**Description**
The "Command Structure" section of the design doc no longer matches the implemented CLI.
Examples of the drift:
- The doc keys most operations by **name** (`template get <name>`, `instance get <code>`,
`site get <site-id>`); the implementation keys everything by integer **ID** via `--id`
(`TemplateCommands.cs:40`, `InstanceCommands.cs:31`, `SiteCommands.cs:26`).
- The doc shows `template create ... --file <path>` and `site update <site-id> --file <path>`;
the implementation has no `--file` option anywhere and instead takes individual flags
(`TemplateCommands.cs:52-72`, `SiteCommands.cs:83-115`).
- The doc lists commands that do not exist (`template diff`, `instance bind-connections`,
`instance assign-area`, `template attribute add --tag-path`, `data-connection assign/unassign`,
`security api-key enable/disable` as separate commands) and omits commands that do exist
(`instance alarm-override set/delete/list`, `external-system method` subgroup).
- The doc's `notification smtp update --file` differs from the implemented
`--server/--port/--auth-mode/--from-address` flags (`NotificationCommands.cs:72-94`).
- The doc uses `--site` for site identification in several places where the implementation
uses `--site-id` or `--identifier`.
A reader following the design doc would be unable to drive the CLI.
**Recommendation**
Regenerate the "Command Structure" section of `Component-CLI.md` from the actual command
tree (the in-repo `src/ScadaLink.CLI/README.md` is much closer to reality and could be the
source), or mark the doc's command list as illustrative and point to the README as
authoritative.
**Resolution**
Resolved 2026-05-16 (commit pending). Drift confirmed against every file under
`src/ScadaLink.CLI/Commands/`. Regenerated the entire "Command Structure" section of
`Component-CLI.md` from the actual command tree: all entities are now keyed by integer
`--id`; the non-existent `--file` option is removed; create/update commands list their
real individual flags; non-existent commands (`template diff`, `instance
bind-connections`/`assign-area`, `data-connection assign/unassign`, `security api-key
enable/disable`) are removed; previously-omitted commands (`instance alarm-override
set/delete/list`, `external-system method` subgroup, `site deploy-artifacts`) are added.
A note now points to `src/ScadaLink.CLI/README.md` as the authoritative reference. The
Configuration section also documents the new `SCADALINK_USERNAME`/`SCADALINK_PASSWORD`
env vars (see CLI-006).
### CLI-008 — `--format` value is not validated
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.CLI/Program.cs:10-11`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:60` |
**Description**
The `--format` option accepts any string. `HandleResponse` only checks
`string.Equals(format, "table", ...)`; any other value — including a typo like
`--format tabel` or `--format xml` — silently falls through to JSON output. The user gets
no feedback that their requested format was not honoured.
**Recommendation**
Restrict the option to the accepted values, e.g. `formatOption.AcceptOnlyFromAmong("json", "table")`, so `System.CommandLine` rejects invalid input with a clear parse error.
**Resolution**
_Unresolved._
### CLI-009 — Exit-code documentation does not match `HandleResponse` behaviour
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `docs/requirements/Component-CLI.md:238-249`, `src/ScadaLink.CLI/Commands/CommandHelpers.cs:75` |
**Description**
The design doc's Exit Codes table defines code 2 as "Authorization failure (insufficient
role)" and the Error Handling section says "If the server returns HTTP 403, the CLI exits
with code 2." `HandleResponse` implements `return response.StatusCode == 403 ? 2 : 1;`,
which is correct for the HTTP error path. However, the `NO_URL`, `NO_CREDENTIALS`,
`INVALID_OPERATION` (from `set-bindings`/`set-overrides`) and any other client-side failure
all return 1, and a connection failure carries `StatusCode == 0` — none of which the doc
enumerates. More importantly, an authorization failure that the server signals with a body
`code` of `UNAUTHORIZED` but an HTTP status other than 403 would be classified as a generic
error (exit 1). The mapping is purely status-driven and the doc does not state that.
**Recommendation**
Either document precisely that exit code 2 is determined solely by HTTP 403, or key the
"authorization failure" exit code off the response `code` field as well. Align the doc
with whichever is chosen.
**Resolution**
_Unresolved._
### CLI-010 — `debug stream` reports Ctrl+C during connect as a connection failure
| | |
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.CLI/Commands/DebugCommands.cs:181-189` |
**Description**
`StreamDebugAsync` calls `await connection.StartAsync(cts.Token)` inside a
`try { } catch (Exception ex)` that unconditionally reports
`"Connection failed: {ex.Message}"` with code `CONNECTION_FAILED` and returns 1. If the
user presses Ctrl+C while the connection is still being established, `cts` is cancelled and
`StartAsync` throws `OperationCanceledException`; this is caught by the generic handler and
misreported as a connection failure (with exit code 1) rather than a clean user-initiated
cancellation (exit code 0).
**Recommendation**
Catch `OperationCanceledException` separately (return 0 quietly) before the generic
`catch (Exception)` handler, mirroring how the `exitTcs.Task.WaitAsync(cts.Token)` path at
lines 209-215 already treats cancellation as graceful.
**Resolution**
_Unresolved._
### CLI-011 — `CancellationTokenSource` in `debug stream` is never disposed
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.CLI/Commands/DebugCommands.cs:89` |
**Description**
`var cts = new CancellationTokenSource();` is created in `StreamDebugAsync` but never
disposed; there is no `using` declaration and no explicit `Dispose()` call on any exit
path. `CancellationTokenSource` owns a `WaitHandle` and should be disposed. The impact is
small because the process exits shortly after, but it is an `IDisposable` left undisposed,
contrary to the review checklist's resource-management expectation.
**Recommendation**
Declare it as `using var cts = new CancellationTokenSource();` (or wrap the method body in
a `try/finally`).
**Resolution**
_Unresolved._
### CLI-012 — `debug stream` exit code is unreliable after stream termination
| | |
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.CLI/Commands/DebugCommands.cs:208-227` |
**Description**
After `await exitTcs.Task.WaitAsync(cts.Token)`, the method returns
`exitTcs.Task.IsCompletedSuccessfully ? exitTcs.Task.Result : 0`. When the user cancels
with Ctrl+C, `WaitAsync` throws `OperationCanceledException` and `exitTcs` is typically
still incomplete, so the method returns 0 — correct. However, the `OnStreamTerminated`
handler and the `Closed` handler both call `exitTcs.TrySetResult`, and these run on
SignalR callback threads concurrently with the Ctrl+C path. If a stream termination and a
Ctrl+C race, the final exit code depends on which `TrySetResult` won and whether
`WaitAsync` observed completion before cancellation — the result is not deterministic. A
stream the server terminated abnormally can end up returning 0.
**Recommendation**
Resolve the exit code from a single authoritative source: after the `try/catch` around
`WaitAsync`, check `exitTcs.Task` completion explicitly and treat a Ctrl+C with no prior
result as 0, but always prefer a result that was set by `OnStreamTerminated`/`Closed`.
Consider awaiting `exitTcs.Task` without the cancellation token after a brief grace period.
**Resolution**
_Unresolved._
### CLI-013 — HTTP client, `debug stream`, and JSON-argument parsing are untested
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.CLI.Tests/` (vs. `src/ScadaLink.CLI/ManagementHttpClient.cs`, `src/ScadaLink.CLI/Commands/DebugCommands.cs`, `src/ScadaLink.CLI/Commands/InstanceCommands.cs:55-58`) |
**Description**
The test project covers `OutputFormatter`, `CliConfig.Load`, and
`CommandHelpers.HandleResponse`. It does not cover:
- `ManagementHttpClient.SendCommandAsync` — the timeout (504), connection-failure (code 0),
and error-body-parsing paths are untested.
- The `debug stream` SignalR command — no tests at all.
- The JSON-argument parsing in `InstanceCommands` (`set-bindings`, `set-overrides`) — the
paths most likely to crash on bad input (CLI-005) have no coverage.
- Command-tree wiring — there is no test asserting that each `Build` produces the expected
subcommands/options or that the command-name derivation
(`ManagementCommandRegistry.GetCommandName`) resolves for every command type the CLI
constructs.
**Recommendation**
Add tests for `ManagementHttpClient` (using a stub `HttpMessageHandler`), for the
JSON-argument parsing helpers (extracting the parsing into testable methods), and a
smoke test that walks the root command tree and asserts every leaf command's payload type
resolves via `ManagementCommandRegistry`.
**Resolution**
_Unresolved._
+801
View File
@@ -0,0 +1,801 @@
# Code Review — CentralUI
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.CentralUI` |
| Design doc | `docs/requirements/Component-CentralUI.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 7 |
## Summary
The Central UI is a sizeable, generally well-structured Blazor Server module:
custom Bootstrap components only (no third-party UI frameworks, as required),
consistent list/form page patterns, careful disposal in most components, and a
thoughtful Roslyn-backed script editor. The most serious problem is the
**Test Run sandbox** (`ScriptAnalysisService.RunInSandboxAsync`): it compiles
and executes arbitrary user C# *in the central process* with no enforcement of
the documented script trust model — the forbidden-API list is only a Monaco
editor diagnostic, never applied before execution — so a Design user can run
`System.IO`/`Process`/`Reflection` code on the central node. Several other
themes recur: (1) per-circuit security drift — site-scoped Deployment claims
are written at login but never read, so site scoping is not enforced anywhere;
(2) Blazor render-thread and disposal hazards — background `Timer` / `Task.Delay`
callbacks and stream callbacks touch component state and `@ref` children that
may already be disposed; (3) process-global mutation (`Console.SetOut`) shared
across concurrent circuits; (4) drift from the design doc on session expiry and
on the "deployment status pushes via SignalR" claim (the page actually polls).
Testing coverage is thin for a module this large: only the script analyzer,
TreeView, schema model, and a few data-connection pages have unit tests; most
pages and the auth bridge are untested.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | DebugView cap logic, audit-log timezone, toast race — see findings. |
| 2 | Akka.NET conventions | ☑ | Module is mostly UI; `DebugStreamService` actor usage reviewed (in Communication but driven from here). No actor-convention violations in CentralUI proper. |
| 3 | Concurrency & thread safety | ☑ | `Console.SetOut` global mutation, stream/timer callbacks on non-render threads, toast `_ = Task.Delay`. |
| 4 | Error handling & resilience | ☑ | Broad `catch {}` swallowing, dangling `TaskCompletionSource` on dialog disposal. |
| 5 | Security | ☑ | Sandbox not enforcing trust model (Critical); site scoping never enforced; auth bridge reads stale HttpContext; logout CSRF. |
| 6 | Performance & resource management | ☑ | N+1 site-connection query, repeated `FilteredMessages` recomputation, full-page paginators rendering all page buttons. |
| 7 | Design-document adherence | ☑ | Session expiry diverges from "15-min sliding + 30-min idle"; Deployments polls despite "push via SignalR"; nav exposes Deployment-only pages to all roles. |
| 8 | Code organization & conventions | ☑ | Generally good; options classes absent (no appsettings binding here); no major violations. |
| 9 | Testing coverage | ☑ | Auth, sandbox-run, DebugView, Health, ParkedMessages, most pages untested. |
| 10 | Documentation & comments | ☑ | Comments are accurate and helpful; a few stale claims noted. |
## Findings
### CentralUI-001 — Test Run sandbox executes arbitrary C# with no trust-model enforcement
| | |
|--|--|
| Severity | Critical |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:171-424` |
**Description**
`RunInSandboxAsync` compiles user-supplied script code with `CSharpScript.Create`
and executes it (`script.RunAsync`) directly inside the central process. The
"sandbox" applies only a wall-clock timeout and an output-size cap. It does
**not** enforce the documented script trust model: the forbidden-API set
(`System.IO`, `System.Diagnostics`/`Process`, `System.Reflection`, `System.Net`,
threading) is checked only in `FindForbiddenApiUsages`, which feeds Monaco
editor diagnostics — it is never consulted before `RunInSandboxAsync` executes.
`DefaultOptions` references `typeof(object).Assembly` (the full BCL), so a
Design-role user can submit `System.IO.File.WriteAllText(...)`,
`System.Diagnostics.Process.Start(...)`, reflection, or raw socket code via
`POST /api/script-analysis/run` and it runs with the central host process's
full privileges. The endpoint is gated only by `RequireDesign`. This is a
remote code execution path on the central cluster node.
**Recommendation**
Before executing, run the same forbidden-API analysis used for diagnostics and
reject any script with a `SCADA001`/`SCADA002` (severity-8) marker; additionally
restrict the compilation's metadata references to the curated script API
surface, and ideally execute in an isolated `AssemblyLoadContext`/process with
constrained permissions. Treat the trust model as an execution-time gate, not
an editor hint.
**Resolution**
Resolved 2026-05-16. A Roslyn semantic trust-model gate was added. `RunInSandboxAsync`
now calls `EnforceTrustModel` after compilation and before `script.RunAsync`; if the
script references any forbidden API the run is rejected (`SandboxErrorKind.CompileError`)
with the offending markers, and the same gate is applied to nested shared scripts in
`callSharedFunc`. `FindForbiddenApiUsages` was reworked so it resolves every identifier
(not just the leftmost) against the semantic model and checks types **and** members —
so a fully-qualified call such as `System.IO.File.WriteAllText(...)` is now caught, not
only `using`-directive or bare-type forms. This is a static semantic gate consistent
with the documented trust model; it is not a process sandbox — reflection-based
indirection remains out of its reach, and full isolation would require running scripts
in a separate constrained process (a larger change deliberately not taken here).
Regression tests `RunInSandbox_FullyQualifiedForbiddenApi_IsBlockedBeforeExecution`,
`RunInSandbox_ForbiddenUsingDirective_IsBlockedBeforeExecution` and
`Diagnose_FullyQualifiedForbiddenCall_RaisesSCADA002` fail against the pre-fix code and
pass after; `RunInSandbox_CleanScript_StillRuns` guards against over-blocking. Fixed by
the commit whose message references `CentralUI-001`.
### CentralUI-002 — Site-scoped Deployment permissions are issued but never enforced
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Auth/AuthEndpoints.cs:63-69`; `src/ScadaLink.CentralUI/Components/Pages/Deployment/*.razor` |
**Description**
Login adds `SiteId` claims (`JwtTokenService.SiteIdClaimType`) for non-system-wide
Deployment users, and the design doc (Component-CentralUI "Responsibilities" and
CLAUDE.md Security & Auth) requires the Deployment role to be site-scoped. A
repo-wide search shows the `SiteId` claim is written at login and **never read
anywhere in CentralUI**. Deployment pages — `DebugView.razor`, `Deployments.razor`,
`InstanceCreate.razor`, `InstanceConfigure.razor`, `Topology.razor`,
`ParkedMessages.razor`, `EventLogs.razor` — list and act on every site with no
filtering by the user's permitted sites. A Deployment user scoped to one site
can deploy to, debug, and manage instances at any site.
**Recommendation**
Enforce site scoping: filter site/instance lists by the user's `SiteId` claims
(or treat the absence of `SiteId` claims as system-wide), and re-check the claim
server-side before any mutating cross-site command (deploy, enable/disable/delete,
debug stream, parked-message retry/discard). A shared helper that reads the
claims from `AuthenticationStateProvider` and exposes "permitted site ids" would
keep this consistent.
**Resolution**
Resolved 2026-05-16. Confirmed: the `SiteId` claim was written at login
(`AuthEndpoints`, `RoleMapper`) but never read by any CentralUI page — site
scoping was unenforced. Added a scoped `SiteScopeService` (`Auth/SiteScopeService.cs`)
that reads the current circuit's `SiteId` claims and exposes `IsSystemWideAsync`,
`PermittedSiteIdsAsync`, `FilterSitesAsync`, and `IsSiteAllowedAsync` (absence of
claims = system-wide, matching `SiteScopeAuthorizationHandler`). All seven
Deployment/Monitoring pages now consume it: `Topology`, `DebugView`,
`InstanceCreate`, `Deployments` filter their site/instance lists; `InstanceConfigure`
rejects direct navigation to an instance on a non-permitted site; `DebugView`,
`InstanceCreate`, and `ParkedMessages` re-check the claim server-side before any
mutating/streaming command. Regression tests: `SiteScopeServiceTests` (6 tests
pinning the helper logic) and `TopologyPageTests.SiteScoping_ScopedDeploymentUser_OnlySeesPermittedSites`
/ `SiteScoping_SystemWideDeploymentUser_SeesAllSites`. Fixed by the commit whose
message references `CentralUI-002`.
### CentralUI-003 — `Console.SetOut`/`SetError` mutates process-global state across concurrent circuits
| | |
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:359-423` |
**Description**
`RunInSandboxAsync` redirects `Console.Out`/`Console.Error` to a per-call
`StringWriter`, runs the script, then restores them in `finally`. `Console.Out`
is process-global. If two users (two Blazor circuits) run Test Run concurrently,
their captured outputs interleave or cross over, and the `finally` of whichever
finishes first restores `Console.Out` to the *original* writer while the other
run is still executing — so the second run's script output is lost or written
to the real console. `RunInSandboxAsync` is `async` and the script runs on a
thread-pool thread, so concurrent execution is fully expected.
**Recommendation**
Do not redirect process-global `Console`. Provide console capture through the
script globals surface (e.g. a `TextWriter` exposed on `SandboxScriptHost` that
the sandbox API writes to), or serialize Test Run executions with a semaphore if
global redirection must be kept. Capturing per-call without global mutation is
the correct fix.
**Resolution**
Resolved 2026-05-16. Confirmed: `RunInSandboxAsync` redirected the process-global
`Console.Out`/`Console.Error` per call and restored them in `finally`, so a
concurrent run's `finally` could restore the writer while another run was still
executing — the long run silently lost output (reproduced by the regression
test, 74 of 80 expected lines captured). Added `SandboxConsoleCapture`, a routing
`TextWriter` installed into `Console.Out`/`Console.Error` exactly once for the
process; each run pushes its own `StringWriter` onto an `AsyncLocal` capture
scope via `BeginCapture`, so writes are routed per logical call-tree with no
per-run mutation of global `Console` state. `RunInSandboxAsync` now opens the
scope with `using` declarations instead of calling `Console.SetOut`. Regression
tests `RunInSandbox_CapturesConsoleOutput` and
`RunInSandbox_ConcurrentRuns_DoNotCrossContaminateConsoleOutput` fail against the
pre-fix code and pass after. Fixed by the commit whose message references
`CentralUI-003`.
### CentralUI-004 — `CookieAuthenticationStateProvider` reads `HttpContext` for the life of the circuit
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Auth/CookieAuthenticationStateProvider.cs:22-28` |
**Description**
`GetAuthenticationStateAsync` returns `_httpContextAccessor.HttpContext?.User`.
In Blazor Server, `HttpContext` is only valid during the initial HTTP request
that establishes the circuit; for the lifetime of the long-lived SignalR circuit
`IHttpContextAccessor.HttpContext` is `null` (or, worse, a stale/foreign context
if the accessor's `AsyncLocal` leaks). Any later call to
`GetAuthenticationStateAsync` — e.g. an `<AuthorizeView>` re-evaluating, or pages
that call it directly (`Sites.razor`, `Templates.razor`) — then sees an
unauthenticated principal and may render the wrong UI, or returns a stale
identity that never reflects role changes. The class derives from
`ServerAuthenticationStateProvider`, which is designed to be seeded once via
`SetAuthenticationState`; overriding `GetAuthenticationStateAsync` to read
`HttpContext` defeats that design.
**Recommendation**
Capture the authenticated principal once when the circuit is created (e.g. via
the root component / `AuthenticationStateProvider` seeding pattern used by the
Blazor Web App template) and store it on the scoped provider, instead of reading
`IHttpContextAccessor` on every call. Do not depend on `HttpContext` after the
circuit is established.
**Resolution**
Resolved 2026-05-16. Confirmed: `GetAuthenticationStateAsync` read
`_httpContextAccessor.HttpContext?.User` on every call; the provider is
registered `Scoped`, so it is constructed within the initial HTTP request's DI
scope while `HttpContext` is still valid, but every later call (an
`<AuthorizeView>` re-evaluating, or a page calling it directly) over the
long-lived SignalR circuit saw `HttpContext == null` and returned an anonymous
principal. The provider now snapshots the principal once in the constructor into
a cached `Task<AuthenticationState>` and serves that for the life of the
circuit, never touching `IHttpContextAccessor` again. Regression tests
`CookieAuthenticationStateProviderTests.GetAuthenticationStateAsync_StillReturnsUser_AfterHttpContextIsGone`
and `..._IsStableAcrossCalls_IgnoringStaleForeignContext` fail against the
pre-fix code (they would see an anonymous / foreign principal) and pass after.
Fixed by the commit whose message references `CentralUI-004`.
### CentralUI-005 — Session expiry implementation diverges from the documented policy
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.CentralUI/Auth/AuthEndpoints.cs:47-81`; `src/ScadaLink.CentralUI/Components/Shared/SessionExpiry.razor:18-30` |
**Description**
CLAUDE.md (Security & Auth) specifies "15-minute expiry with sliding refresh,
30-minute idle timeout." `AuthEndpoints` instead sets a single fixed
`expires_at = UtcNow + 30 minutes` claim and a 30-minute cookie `ExpiresUtc`,
with no sliding refresh and no separate idle vs absolute timeout.
`SessionExpiry.razor` schedules a single hard redirect at that fixed time. The
result is a hard 30-minute cap with no sliding renewal — an active user is
logged out mid-session, and there is no 15-minute component at all.
**Recommendation**
Either implement the documented policy (sliding 15-minute token with refresh on
activity, plus a 30-minute idle cutoff) or update the design docs to match the
fixed 30-minute model. The code and the documented decision must agree.
**Resolution**
_Unresolved — requires a cross-module change plus a design decision, both out of
scope for a CentralUI-only fix._ Verified 2026-05-16: the discrepancy is real.
The sliding-expiration mechanism, however, is owned by the cookie
authentication middleware configured in **`ScadaLink.Security`**
(`ServiceCollectionExtensions.AddCookie` — currently sets neither
`ExpireTimeSpan` nor `SlidingExpiration`); `AuthEndpoints` (CentralUI) only sets
the absolute `ExpiresUtc`/`expires_at`. Implementing "15-minute sliding token"
means editing `ScadaLink.Security`, which this module's review cannot touch, and
the alternative — amending the documented decision to a fixed 30-minute model —
is a design decision, not a code fix. Left Open and surfaced for a follow-up
that spans CentralUI + Security, or a design-doc amendment.
### CentralUI-006 — Deployment status page polls every 10s despite the documented SignalR-push design
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Deployment/Deployments.razor:196-216` |
**Description**
Component-CentralUI "Real-Time Updates" states: "Deployment status:
Pending/in-progress/success/failed transitions push to the UI immediately via
SignalR (built into Blazor Server). No polling required for deployment
tracking." `Deployments.razor` instead runs a `Timer` that reloads all
deployment records and instance names from the database every 10 seconds. This
is a full N-record + instance-map reload per tick for every open circuit, and
contradicts the design. It also re-issues two repository round-trips on each
tick regardless of whether anything changed.
**Recommendation**
Implement push-based updates (an injected event/observable raised by the
Deployment Manager that the page subscribes to and renders via
`InvokeAsync(StateHasChanged)`), or amend the design doc to acknowledge polling.
If polling is kept as a fallback, fetch only changed/in-progress records.
**Resolution**
_Unresolved — a genuine SignalR-push fix requires an event source in another
module._ Verified 2026-05-16: `Deployments.razor` does poll every 10s, contrary
to the design doc. But a real push implementation needs the **Deployment
Manager** module (`ScadaLink.DeploymentManager` — `DeploymentService` /
`ArtifactDeploymentService` write the `DeploymentRecord` rows) to raise a
status-change event/observable that the page subscribes to; there is no such
event today and no CentralUI-only seam to subscribe to. Building that event
source is out of scope for a CentralUI-only review. Left Open and surfaced for a
follow-up that adds a deployment-status broadcaster in DeploymentManager (or a
design-doc amendment acknowledging the polling fallback).
### CentralUI-007 — Monitoring nav links to Deployment-only pages are shown to all roles
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Layout/NavMenu.razor:69-78`; `src/ScadaLink.CentralUI/Components/Pages/Monitoring/EventLogs.razor:2`; `src/ScadaLink.CentralUI/Components/Pages/Monitoring/ParkedMessages.razor:2` |
**Description**
`NavMenu` renders the "Event Logs" and "Parked Messages" links inside the
all-authenticated-users Monitoring section. The design doc classifies both the
Site Event Log Viewer and Parked Message Management as **Deployment Role**.
Two inconsistencies result: (a) an Admin- or Design-only user sees nav links
they cannot use; (b) the pages themselves are annotated only `[Authorize]`
(any authenticated user), not `[Authorize(Policy = RequireDeployment)]`, so a
non-Deployment user who follows the link is *not* blocked — they can query site
event logs and retry/discard parked messages. The authorization attribute and
the nav visibility both contradict the design.
**Recommendation**
Add `[Authorize(Policy = AuthorizationPolicies.RequireDeployment)]` to
`EventLogs.razor` and `ParkedMessages.razor`, and move their nav links into a
`<AuthorizeView Policy="RequireDeployment">` block (consistent with the Topology
/ Deployments / Debug View links). Confirm Health Dashboard is intentionally
all-roles (it is, per the design).
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: both `EventLogs.razor` and
`ParkedMessages.razor` carried a bare `[Authorize]`, so any authenticated user
could query site event logs and retry/discard parked messages — contrary to the
design doc's Deployment-Role classification. Both pages now use
`[Authorize(Policy = AuthorizationPolicies.RequireDeployment)]`, and the
"Event Logs" / "Parked Messages" nav links were moved out of the all-roles
Monitoring block into an `<AuthorizeView Policy="RequireDeployment">` (Health
Dashboard stays all-roles, as the design intends). Regression tests
`MonitoringAuthorizationTests.{EventLogsPage,ParkedMessagesPage}_RequiresDeploymentPolicy`
fail against the pre-fix code and pass after;
`HealthDashboard_IsIntentionallyAllAuthenticatedRoles` guards the all-roles page.
### CentralUI-008 — Audit-log date filters treat browser-local datetimes as UTC
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Monitoring/AuditLog.razor:242-243` |
**Description**
The `From`/`To` filters bind `<input type="datetime-local">` to `DateTime?`
fields. A `datetime-local` input yields the value the user typed in their
*browser-local* time zone. `FetchPage` converts them with
`new DateTimeOffset(_filterFrom.Value, TimeSpan.Zero)` — i.e. it labels the
local wall-clock value as UTC. For any non-UTC user the audit query window is
shifted by their UTC offset, silently returning the wrong rows. CLAUDE.md
mandates UTC throughout, but that requires converting the local input *to* UTC,
not relabelling it.
**Recommendation**
Convert the picked local time to UTC before querying — capture the browser
offset (JS interop) and apply it, or document the inputs as UTC and label them
in the UI. The same issue should be checked in `EventLogs.razor` if it has
time-range filters.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: `FetchPage` wrapped the
`datetime-local` value with `new DateTimeOffset(value, TimeSpan.Zero)`,
relabelling the browser-local wall-clock value as UTC and shifting the audit
query window by the user's offset. Added a pure helper
`Components/BrowserTime.LocalInputToUtc(DateTime?, int)` that converts a
local-input value to UTC using the browser's `Date.getTimezoneOffset()`;
`AuditLog.razor` now fetches that offset once via JS interop in
`OnAfterRenderAsync` (defaulting to 0/UTC on prerender or a disconnected
circuit) and runs both `from`/`to` filters through the helper. Regression suite
`BrowserTimeTests` (5 tests) fails against the naive relabelling and passes
after — including `LocalInputToUtc_NonUtcBrowser_DoesNotEqualNaiveRelabelling`,
which pins the exact pre-fix bug. `EventLogs.razor` was checked and has no
time-range filters, so it is unaffected.
### CentralUI-009 — `DebugView` stream callbacks touch a possibly-disposed `ToastNotification`
| | |
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Deployment/DebugView.razor:400-409,538-544` |
**Description**
The `onTerminated` callback passed to `DebugStreamService.StartStreamAsync`
captures `_toast` and `this` and runs on an Akka/gRPC thread. If the user
navigates away, `Dispose()` calls `StopStream`, but a stream-termination event
already in flight can still invoke `onTerminated`, which calls
`_toast.ShowError(...)` and `StateHasChanged()` on a disposed component. The
component does not guard callbacks with a disposed flag or a
`CancellationTokenSource`. The same applies to the `onEvent` callbacks at
lines 391-398 that call `InvokeAsync(StateHasChanged)`.
**Recommendation**
Track a `_disposed`/`CancellationTokenSource` on the component, check it at the
top of every stream callback, and stop the stream synchronously before marking
disposed. `InvokeAsync` after disposal throws `ObjectDisposedException`; the
callbacks should no-op once disposed.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: the `onEvent`/`onTerminated`
callbacks captured `this` and `_toast` and ran on an Akka/gRPC thread with no
disposal guard. Added a `volatile bool _disposed` flag, set first thing in
`Dispose()` before the stream is stopped. Every callback now checks `_disposed`
and no-ops if set; the render dispatch goes through a new `SafeInvokeAsync`
helper that re-checks the flag and swallows `ObjectDisposedException` should the
component be disposed between the guard and the dispatch. Regression tests
`DebugViewDisposalTests.{DebugView_HasDisposalGuardField,
DebugView_Dispose_SetsDisposedFlag_AndIsIdempotent}` pin the observable contract
(the guard field exists; `Dispose()` sets it and is idempotent) — the first
fails against the pre-fix code, which had no `_disposed` field. The Akka-thread
timing race itself is not deterministically reproducible in a unit test:
`DebugStreamService` is a non-virtual concrete class with no seam to inject and
later fire the callbacks, so the closest meaningful tests pin the guard
mechanism rather than the race window.
### CentralUI-010 — `ToastNotification` auto-dismiss continuation runs after component disposal
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Shared/ToastNotification.razor:62-71,90` |
**Description**
`AddToast` schedules `Task.Delay(dismissMs).ContinueWith(...)` with the result
discarded (`_ =`). The continuation calls `InvokeAsync(StateHasChanged)`. If the
host page is disposed before the 5-second delay elapses (common — navigate away
right after an action), the continuation runs against a disposed component and
`InvokeAsync` throws `ObjectDisposedException` on a thread-pool thread with no
catch, producing an unobserved task exception. `Dispose()` is an empty body and
cancels nothing.
**Recommendation**
Hold a `CancellationTokenSource`, pass its token to `Task.Delay`, cancel it in
`Dispose()`, and guard the continuation. Alternatively wrap the continuation
body in a try/catch for `ObjectDisposedException`.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: `AddToast` scheduled
`Task.Delay(...).ContinueWith(...)` with no cancellation and `Dispose()` was an
empty body, so the continuation ran `InvokeAsync(StateHasChanged)` against a
disposed component. Added a `CancellationTokenSource _disposalCts` cancelled in
`Dispose()`; the auto-dismiss is now an `AutoDismissAsync` method that awaits
`Task.Delay(dismissMs, token)`, returns on `OperationCanceledException`, and
wraps the post-delay `InvokeAsync(StateHasChanged)` in a try/catch for
`ObjectDisposedException`. `AddToast` also short-circuits if the component is
already disposed. Regression tests:
`ToastNotificationTests.ShowToast_AfterDisposal_IsNoOp_AndSchedulesNothing`
fails against the pre-fix code (which still added the toast / mis-scheduled
after disposal) and passes after;
`AutoDismiss_AfterDisposal_DoesNotThrowUnobservedException` and
`AutoDismiss_BeforeDisposal_StillRemovesToast` guard the no-throw and
still-works behaviours.
### CentralUI-011 — `DiffDialog` leaves a dangling `TaskCompletionSource` when disposed while open
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Shared/DiffDialog.razor:89-95,151-157` |
**Description**
`OpenAsync` creates `_tcs` and returns `_tcs.Task` to the caller, which
typically `await`s it. The task is completed only by `Close()`. If the user
navigates away while the dialog is open, `DisposeAsync` runs but never completes
`_tcs`, so the awaiting caller's continuation never resumes — a permanently
suspended `Task` (and any `using`/cleanup after the await is skipped). The
`IDialogService.Confirm/Prompt` path has the same shape but at least its host
is a single long-lived `DialogHost`; `DiffDialog` is per-page.
**Recommendation**
In `DisposeAsync`, call `_tcs?.TrySetResult(false)` (or `TrySetCanceled`) so any
awaiter completes deterministically.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: `OpenAsync` returned
`_tcs.Task`, completed only by `Close()`; `DisposeAsync` never touched the TCS,
so disposing the dialog while open left the awaiting caller suspended forever.
`DisposeAsync` now calls `_tcs?.TrySetResult(false)` before unlocking the body,
so a dialog disposed while open resolves its caller to `false` (not confirmed).
Regression test `DiffDialogTests.DisposeAsync_WhileOpen_CompletesPendingTask`
fails against the pre-fix code (the pending task stays `WaitingForActivation`)
and passes after; `Close_CompletesPendingTaskWithTrue` guards the normal close
path.
### CentralUI-012 — N+1 query loading data connections for the Sites page
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/Components/Pages/Admin/Sites.razor:196-205` |
**Description**
`LoadDataAsync` fetches all sites, then issues
`SiteRepository.GetDataConnectionsBySiteIdAsync(site.Id)` once per site in a
loop. With N sites this is N+1 database round-trips on every page load and every
post-delete refresh. The connection lists are only used for a small per-card
summary.
**Recommendation**
Add a repository method that returns all data connections (or connections for a
set of site ids) in one query and group them client-side, or project the small
summary in a single query.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: `LoadDataAsync` looped
`GetDataConnectionsBySiteIdAsync(site.Id)` once per site (N+1). `ISiteRepository`
already exposes `GetAllDataConnectionsAsync()` and `DataConnection` carries a
`SiteId`, so the loop was replaced with a single `GetAllDataConnectionsAsync()`
call grouped client-side by `SiteId` — one query regardless of site count, on
every load and post-delete refresh. Regression tests
`SitesPageTests.{LoadData_FetchesAllConnectionsInOneQuery_NoPerSiteQueries,
LoadData_GroupsConnectionsBySite_AndRendersThem}` fail against the pre-fix code
(`GetDataConnectionsBySiteIdAsync` was called per site) and pass after.
### CentralUI-013 — `ScriptAnalysisService` blocks on async shared-script lookups
| | |
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:951-952` (actual call at `:975`) |
**Description**
`ResolveCalledShape` calls `_sharedScripts.GetShapesAsync().GetAwaiter().GetResult()`
to resolve a shared-script shape synchronously. `GetShapesAsync` ultimately hits
`SharedScriptService` and its EF Core repository. Sync-over-async on a request
thread risks thread-pool starvation under load and can deadlock if any awaited
continuation needs a captured context. `Hover` and `SignatureHelp` (which call
`ResolveCalledShape`) are themselves synchronous methods, so the blocking call
is structural.
**Recommendation**
Make `Hover` and `SignatureHelp` async and `await` `GetShapesAsync`, or have the
catalog expose a cached synchronous snapshot that is refreshed asynchronously.
The `IMemoryCache` is already present — caching the shapes there and reading
them synchronously would remove the blocking call.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed (the sync-over-async call is at
`:975`, not `:951-952` as originally cited — `ResolveCalledShape`'s
`Scripts.CallShared` branch). Took the recommended root-cause fix: `Hover` and
`SignatureHelp` are now `async Task<...>` and `ResolveCalledShape` is
`async Task<ScriptShape?>` which `await`s `_sharedScripts.GetShapesAsync()`
instead of `.GetAwaiter().GetResult()`. The two minimal-API endpoints
(`/hover`, `/signature-help`) were updated to `await` the methods. Regression
suite `ScriptAnalysisAsyncResolveTests` (3 tests): the structural test
`HoverAndSignatureHelp_AreAsync_NotSyncOverAsync` fails against the pre-fix
synchronous signatures, and two behavioural tests resolve shared-script shapes
through a catalog that only completes after `Task.Yield()` (a genuinely async
source). The five existing `Hover`/`SignatureHelp` tests in
`ScriptAnalysisServiceTests` were updated to `await` the now-async methods.
### CentralUI-014 — Test Run side effects (HTTP/SQL/SMTP) fire against production services
| | |
|--|--|
| Severity | Low (re-triaged from Medium 2026-05-16 — see Resolution) |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs:254-259`; `src/ScadaLink.CentralUI/ScriptAnalysis/SandboxHostHelpers.cs:26-117` |
**Description**
By design (documented in the XML comments) Test Run wires `ExternalSystem`,
`Database`, and `Notify` to central's *real* `IExternalSystemClient`,
`IDatabaseGateway`, and `INotificationDeliveryService`, so a Test Run that calls
`Notify.To(...).Send(...)` actually emails recipients, `Database.Connection(...)`
opens a real DB connection, and `External.Call(...)` makes real HTTP calls —
with production-equivalent side effects. There is no dry-run mode, no
confirmation, and (combined with CentralUI-001) no restriction on what a script
can do. A Design user testing a draft script can dispatch real notifications or
mutate external databases. The behaviour is intentional but the blast radius is
not surfaced to the user.
**Recommendation**
At minimum, surface a clear warning in the Test Run UI that side effects are
real, and require explicit opt-in for side-effecting calls. Preferably offer a
dry-run mode that stubs the helpers, defaulting to dry-run.
**Resolution**
Resolved 2026-05-16 (commit pending) — **re-triaged**. Re-verified against the
reviewed commit `9c60592`: the finding's premise that "the blast radius is not
surfaced to the user" is **inaccurate**. Both Test Run surfaces that can produce
real side effects — `SharedScriptForm.razor` and the script Test Run in
`TemplateEdit.razor` — already carry a prominent `Real I/O` badge on the panel
header and an `alert-warning` block stating `External`/`Database`/`Notify` calls
"fire for real … real HTTP, real SQL, real emails. Side effects are permanent"
(present since commit `2951507`, an ancestor of the reviewed commit, confirmed
via `git merge-base`). `ApiMethodForm.razor` (Inbound API kind) has **no**
real-I/O surface at all — `SandboxInboundScriptHost` exposes only
`Parameters`/`Route` (Route throws) — and correctly omits the badge while still
warning. Revealing the panel ("Test Run" toggle) then clicking "Run" is itself a
two-step explicit opt-in. The minimum recommendation is therefore already met;
the optional dry-run mode is a separate feature decision the design doc does not
mandate. Severity re-triaged Medium → Low (intentional, documented, clearly
warned behaviour — not a bug). Regression suite `TestRunWarningTests` (3 tests)
pins the `Real I/O` badge + warning text in `SharedScriptForm`/`TemplateEdit`
and the deliberate absence of the badge in `ApiMethodForm`, so the warning
cannot silently regress.
### CentralUI-015 — `DialogService` continuations resolve off the render thread
| | |
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.CentralUI/ServiceCollectionExtensions.cs:24`; `src/ScadaLink.CentralUI/Components/Shared/DialogService.cs:18-69` |
**Description**
`DialogService` is `AddScoped` (one per circuit, correct) but
`ConfirmAsync`/`PromptAsync` complete via `ContinueWith(..., TaskScheduler.Default)`,
so a caller awaiting them resumes on a thread-pool thread. Any subsequent
component state mutation by the caller is then off the render thread unless the
caller wraps it in `InvokeAsync`. Call sites are not consistently doing so,
which can produce non-deterministic render glitches.
**Recommendation**
Either resolve continuations on the circuit's sync context or document that
callers must `InvokeAsync` after awaiting `ConfirmAsync`/`PromptAsync`. Audit
call sites for off-thread state mutation.
**Resolution**
_Unresolved._
### CentralUI-016 — Pagers render one button per page with no windowing
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.CentralUI/Components/Shared/DataTable.razor:62-68`; `src/ScadaLink.CentralUI/Components/Pages/Deployment/Deployments.razor:167-173` |
**Description**
The `DataTable` and `Deployments` paginators loop `for i = 1..totalPages` and
emit a `<li>` button for every page. With a few thousand records at page size 25
that is hundreds of buttons rendered into the diff on every state change. It is
not a correctness bug but degrades render performance and usability on large
datasets.
**Recommendation**
Window the pager (first / prev / a few around current / next / last) or switch
large lists to a "load more" / numeric jump input.
**Resolution**
_Unresolved._
### CentralUI-017 — `/auth/logout` POST disables antiforgery, enabling logout CSRF
| | |
|--|--|
| Severity | Low |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.CentralUI/Auth/AuthEndpoints.cs:127-138` |
**Description**
The `POST /auth/logout` endpoint calls `.DisableAntiforgery()`, and a plain
`GET /logout` endpoint also signs the user out. Either can be triggered
cross-site (an `<img src="/logout">` or an auto-submitting form) to forcibly log
a user out. Login itself reasonably disables antiforgery (pre-auth), but logout
is a state-changing authenticated action and should be CSRF-protected.
**Recommendation**
Require an antiforgery token on `POST /auth/logout` (the `NavMenu` sign-out form
can include the antiforgery token), and remove or protect the state-changing
`GET /logout` route.
**Resolution**
_Unresolved._
### CentralUI-018 — Broad `catch {}` blocks swallow JS interop and storage errors silently
| | |
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.CentralUI/Components/Shared/MonacoEditor.razor:116-118,123,142,164,170,176,182,189`; `src/ScadaLink.CentralUI/Components/Shared/TreeView.razor:129,139`; `src/ScadaLink.CentralUI/Components/Pages/Admin/Sites.razor:316-319` |
**Description**
Numerous `try { ... } catch { }` blocks swallow every exception with no logging.
The prerender-time JS-unavailable case is legitimate, but these catches also
hide real failures: a genuine Monaco init failure or a clipboard permission
error becomes invisible. In `TreeView.razor` the storage-restore
`JsonSerializer.Deserialize` (line 139) is not inside a try at all and would
throw uncaught on a corrupt `treeviewStorage` payload. Debugging UI issues in
production is then guesswork.
**Recommendation**
Catch the specific expected exception type (e.g. `JSDisconnectedException`,
`InvalidOperationException` during prerender) and log anything else via
`ILogger`. Wrap the TreeView storage `Deserialize` in its own guarded block.
**Resolution**
_Unresolved._
### CentralUI-019 — Sparse unit-test coverage for a large module; critical paths untested
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.CentralUI.Tests/` |
**Description**
The module has ~65 source files but unit tests cover only the script analyzer,
TreeView, schema model, and two data-connection pages. Untested critical paths
include: the auth bridge (`CookieAuthenticationStateProvider`,
`AuthEndpoints`), `RunInSandboxAsync` (timeout, recursion limit, error
classification, side-effect wiring), `DialogService` resolution semantics,
`DebugView` stream lifecycle and the `UpsertWithCap` cap logic, `Health` and
`Deployments` timer behaviour, and `SchemaBuilderModel` round-tripping of nested
schemas. Given findings CentralUI-001/003/009/010 sit on untested code, the gap
is material. The Playwright suite covers login and navigation only.
**Recommendation**
Add bUnit/unit tests for the auth bridge, sandbox-run behaviour (including
forbidden-API rejection once CentralUI-001 is fixed), dialog resolution, and the
DebugView cap/lifecycle logic. Prioritise the paths named in the Critical/High
findings.
**Resolution**
_Unresolved._
@@ -0,0 +1,442 @@
# Code Review — ClusterInfrastructure
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.ClusterInfrastructure` |
| Design doc | `docs/requirements/Component-ClusterInfrastructure.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 3 |
## Summary
The ClusterInfrastructure module is currently a **Phase 0 skeleton**. It contains
only two source files: `ClusterOptions.cs`, a plain options POCO, and
`ServiceCollectionExtensions.cs`, whose two registration methods are explicit no-ops.
None of the responsibilities described in `Component-ClusterInfrastructure.md` —
Akka.NET cluster bootstrap, leader election, failover detection, split-brain
resolution, cluster singleton hosting, Windows service lifecycle — are implemented.
There are therefore no correctness, concurrency, or Akka-convention defects to find
in *behaviour*, because there is no behaviour. The findings below instead concern
(a) the large gap between the design doc and the code, (b) the options class missing
the validation, configuration-binding affordances, and coverage of documented
settings that peer modules provide, and (c) the no-op DI extensions silently
returning success, which is a latent reliability hazard once the Host wires this
module in. The dominant theme is **incompleteness**: this module is the foundation
every other component runs on, yet it presently delivers nothing the design requires.
The single options class is clean and its test covers defaults and setters
adequately for what exists.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | No executable logic exists beyond an options POCO; no logic bugs, but `ServiceCollectionExtensions` returns success while doing nothing (CI-002). |
| 2 | Akka.NET conventions | ✓ | No actors, no `ActorSystem` bootstrap, no supervision, no cluster/singleton wiring exist despite the design doc requiring all of them (CI-001). Nothing to assess against `Tell`/`Ask`, immutability, or `PipeTo`. |
| 3 | Concurrency & thread safety | ✓ | No shared mutable state, no actors, no async code. No issues found in current code. |
| 4 | Error handling & resilience | ✓ | Failover, split-brain, dual-node recovery, and graceful-shutdown logic are entirely absent (CI-001). No exception paths to review in current code. |
| 5 | Security | ✓ | No authn/authz surface in this module. Akka remoting is unconfigured, so transport security cannot be assessed; flagged as part of the missing implementation (CI-001). No secret handling present. |
| 6 | Performance & resource management | ✓ | No streams, connections, timers, or `IDisposable` resources exist yet. No issues found in current code. |
| 7 | Design-document adherence | ✓ | Severe drift: the module implements none of its documented responsibilities (CI-001). `ClusterOptions` also omits remoting host/port, cluster role/site identifier, gRPC port, storage paths, and `down-if-alone` (CI-003). |
| 8 | Code organization & conventions | ✓ | Options class is correctly owned by the component project. Missing config-section-name constant (CI-005) and missing `IValidateOptions`/data-annotation validation (CI-004) versus the Options pattern intent. |
| 9 | Testing coverage | ✓ | `ClusterOptionsTests` covers defaults and setters. No tests for any cluster behaviour because none exists; the test project references nothing else (CI-006). |
| 10 | Documentation & comments | ✓ | `ClusterOptions` has no XML doc comments unlike peer options classes (CI-007). The "Phase 0 skeleton" placeholders are undocumented at the module level — no README or tracking note (CI-008). |
## Findings
### ClusterInfrastructure-001 — Module implements none of its documented responsibilities
| | |
|--|--|
| Severity | High |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:9`, `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:16` |
**Description**
`Component-ClusterInfrastructure.md` assigns this module seven concrete
responsibilities: bootstrap the Akka.NET `ActorSystem`, form the two-node cluster,
manage leader election / active-standby role assignment, detect node failures and
trigger failover, provide remoting, host the cluster singleton, and manage the
Windows service lifecycle. The entire module is two files: a `ClusterOptions` POCO
and a `ServiceCollectionExtensions` whose methods are explicitly commented
`// Phase 0: skeleton only` and `// Phase 0: placeholder for Akka actor registration`
and simply return the unmodified `IServiceCollection`. There is no `Akka.Cluster`,
`Akka.Cluster.Tools`, `Akka.Remote`, or split-brain-resolver dependency in the
`.csproj` at all (it references only `Microsoft.Extensions.DependencyInjection.Abstractions`,
`Microsoft.Extensions.Options`, and `ScadaLink.Commons`). Because every other
ScadaLink component runs inside the actor system this module is responsible for
creating, the absence of any implementation blocks the foundational layer of the
system.
**Recommendation**
Track the gap explicitly (a milestone/issue) and implement the documented behaviour:
add the Akka cluster/remote/cluster-tools and split-brain-resolver package
references, build the cluster bootstrap (HOCON generation from `ClusterOptions`),
the split-brain resolver configuration, cluster-singleton hosting support, and
`CoordinatedShutdown` wiring. Until then, the module's `Status` and the design doc
should clearly state it is unimplemented so callers do not assume otherwise.
**Resolution**
_Re-triaged 2026-05-16 — initially left Open pending a design decision from the user; since resolved, as recorded at the end of this section._
Verified against the source at the reviewed commit: the finding's factual claims hold.
`src/ScadaLink.ClusterInfrastructure` still contains only `ClusterOptions.cs` and a
no-op `ServiceCollectionExtensions.cs`, and the `.csproj` references no Akka packages.
However, the documented cluster behaviour is **not actually absent from the system** —
it has been implemented in the **Host** project rather than in this module:
- `src/ScadaLink.Host/Actors/AkkaHostedService.cs` bootstraps the `ActorSystem`,
generates the HOCON from `ClusterOptions` (it imports `ScadaLink.ClusterInfrastructure`
and injects `IOptions<ClusterOptions>`), and configures the `keep-oldest` split-brain
resolver with `down-if-alone = on` (see `AkkaHostedService.cs:95-96`).
- `src/ScadaLink.Host/Health/AkkaClusterHealthCheck.cs`, `AkkaClusterNodeProvider.cs`,
and `Health/ActiveNodeHealthCheck.cs` cover cluster membership / active-node detection.
- Akka cluster/remote package references live in `ScadaLink.Host.csproj` and the
per-component projects (`SiteRuntime`, `Communication`, etc.).
So the real situation is an **ownership / design-doc drift**, not missing behaviour:
`Component-ClusterInfrastructure.md` assigns the Akka bootstrap, HOCON generation,
split-brain config and `CoordinatedShutdown` wiring to this module, but the
implementation deliberately lives in the Host. `ClusterOptions` is the one piece this
module legitimately owns and it is consumed correctly by the Host.
Resolving CI-001 as literally written is **not a small, well-scoped fix** — it is one
of two substantial decisions, both requiring the user:
1. **Move the bootstrap into this module** — relocate the HOCON generation, split-brain
config, cluster-singleton helpers and `CoordinatedShutdown` wiring out of
`ScadaLink.Host` into `ScadaLink.ClusterInfrastructure`, add the Akka package
references, and re-wire the Host to call into it. This is a cross-module refactor
touching `src/ScadaLink.Host/*` and several other projects — outside the edit scope
permitted for this finding (only `src/ScadaLink.ClusterInfrastructure/`,
`tests/ScadaLink.ClusterInfrastructure.Tests/`, and this file may be edited).
2. **Accept the current placement** — keep the bootstrap in the Host and update
`Component-ClusterInfrastructure.md` (and the README component table) to record that
the Host owns the actor-system/cluster bootstrap and that this module's role is the
shared `ClusterOptions` contract. That fix is a design-doc edit, also outside this
module's permitted edit scope.
Either path is a deliberate architecture decision, not a bug fix. The decision was
surfaced to the user, who chose **option 2 — accept the current placement**: the Akka
bootstrap stays in the Host (the single deployable binary that performs all actor-system
bring-up), and the design docs are corrected to record the true ownership.
**Resolved** — fixing commit `<pending>`, date 2026-05-16. The finding was a design-doc
drift, not missing behaviour. `docs/requirements/Component-ClusterInfrastructure.md` now
carries an "Implementation Note — Code Placement" section stating that the
`ScadaLink.ClusterInfrastructure` project owns the `ClusterOptions` configuration model
while `ScadaLink.Host` owns the Akka bootstrap, HOCON generation, split-brain-resolver
wiring, `CoordinatedShutdown` integration, and active-node health checks. The README
component table (row 13) was updated to match. No code change was required — the
documented cluster behaviour already exists and is exercised; only the doc's
module-ownership claim was wrong. Module test suite green (3 passed).
### ClusterInfrastructure-002 — No-op DI extension methods report success while doing nothing
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:7-17` |
**Description**
`AddClusterInfrastructure` and `AddClusterInfrastructureActors` both accept an
`IServiceCollection` and return it unchanged. A caller (e.g. the Host) that invokes
`services.AddClusterInfrastructure()` receives a fluent, success-looking result but
no actor system, no cluster, and no singleton support is actually registered. This
is a silent failure: the system will appear to start, then fail later and far from
the cause (e.g. when a component resolves an `ActorSystem` that was never added, or
when the cluster singleton never forms). A no-op that masquerades as a completed
registration is worse than an unimplemented method that throws.
**Recommendation**
Until the real implementation exists, make the placeholder loud rather than silent —
either throw `NotImplementedException` from the methods, or have them log a
prominent warning, so an integrating caller fails fast with a clear cause. Replace
with the genuine registration when CI-001 is addressed.
**Resolution**
Confirmed against the source: both methods returned the `IServiceCollection`
unchanged. Verified the consumers — `ScadaLink.Host` calls `AddClusterInfrastructure()`
(`Program.cs:68`, `SiteServiceRegistration.cs:24`); `AddClusterInfrastructureActors`
is dead — it is called nowhere in the solution.
**Resolved** — fixing commit `commit pending`, date 2026-05-16.
`AddClusterInfrastructure` now does real work: it registers the
`ClusterOptionsValidator` (CI-004) via `TryAddEnumerable`, so the method is no longer a
no-op and a misconfigured `ScadaLink:Cluster` section fails fast on the first
`IOptions<ClusterOptions>` resolution. `AddClusterInfrastructureActors` — for which this
component never had any actors to register, since CI-001 established that the Akka
bootstrap lives in `ScadaLink.Host` — now throws `NotImplementedException` with a message
pointing the caller to the Host, rather than masquerading as a completed registration.
Covered by `ServiceCollectionExtensionsTests`
(`AddClusterInfrastructure_RegistersOptionsValidator`,
`AddClusterInfrastructure_ValidatorRejectsBadOptionsAtResolution`,
`AddClusterInfrastructureActors_ThrowsRatherThanSilentlySucceeding`).
### ClusterInfrastructure-003 — ClusterOptions omits several documented node-configuration settings
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3-11` |
**Description**
The "Node Configuration", "Split-Brain Resolution", and "Failure Detection Timing"
sections of the design doc enumerate the settings each node needs. `ClusterOptions`
exposes `SeedNodes`, `SplitBrainResolverStrategy`, `StableAfter`,
`HeartbeatInterval`, `FailureDetectionThreshold`, and `MinNrOfMembers`, but is
missing: the Akka remoting hostname/port (default 8081 central, 8082 site), the
cluster role (Central vs. Site) and the site identifier, the `down-if-alone` flag
(the design explicitly requires `down-if-alone = on` for the keep-oldest resolver),
and — for site nodes — the gRPC port (default 8083) and local SQLite storage paths.
Without these, the options class cannot drive a correct HOCON configuration when
CI-001 is implemented. (Some settings such as remoting host/port may instead belong
in `Host/NodeOptions.cs`; the split of ownership should be decided deliberately, but
at minimum `down-if-alone` belongs with the split-brain settings here.)
**Recommendation**
Add the missing settings — at minimum a `DownIfAlone` boolean (default `true`) and
the cluster role / site identifier — or document explicitly which settings are
owned by `Host/NodeOptions.cs` instead, so the design doc and the options classes
agree on where each value lives.
**Resolution**
Partially re-triaged. Verified against the source: most of the "missing" settings are
**deliberately owned by `ScadaLink.Host.NodeOptions`** — `NodeOptions` already carries
`Role`, `NodeHostname`, `SiteId`, `RemotingPort` and `GrpcPort`, and `AkkaHostedService`
builds the HOCON from `NodeOptions` for exactly those values. Local SQLite storage paths
live in the database / store-and-forward options. This is the ownership split CI-001
established (the Host owns node identity and bootstrap; this project owns the
cluster-formation contract), so those settings do **not** belong in `ClusterOptions`.
The one genuine gap the finding identifies is `down-if-alone`, which the design doc
puts with the split-brain settings.
**Resolved** — fixing commit `commit pending`, date 2026-05-16. Added the
`DownIfAlone` boolean (default `true`) to `ClusterOptions` so the split-brain
configuration contract is complete, and added a class-level XML doc that records the
deliberate ownership split — node identity/remoting/gRPC in `Host.NodeOptions`, storage
paths in the database options, cluster-formation settings here — so the design doc and
the options classes now agree on where each value lives. (`AkkaHostedService` currently
hard-codes `down-if-alone = on` in HOCON; wiring it to read `DownIfAlone` is a one-line
`ScadaLink.Host` change, outside this module's permitted edit scope, and is noted for
the Host's review.) Covered by `ClusterOptionsTests.DefaultValues_AreCorrect` and
`ClusterOptionsTests.DownIfAlone_CanBeSet`.
### ClusterInfrastructure-004 — ClusterOptions has no validation despite safety-critical values
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Resolved |
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3-11` |
**Description**
`ClusterOptions` carries values whose misconfiguration has cluster-wide
consequences. The design doc is emphatic that `min-nr-of-members` must be `1` (a
value of `2` blocks the singleton and therefore all data collection indefinitely
after failover), that `SplitBrainResolverStrategy` must be `keep-oldest` for a
two-node cluster (quorum strategies cause total shutdown), and that the timing
values are interdependent (`HeartbeatInterval` must be well below
`FailureDetectionThreshold`). The class has no data annotations, no
`IValidateOptions<ClusterOptions>`, and no guard logic, so an `appsettings.json`
setting `MinNrOfMembers: 2` or `SplitBrainResolverStrategy: "keep-majority"` (the
exact value the test at `ClusterOptionsTests.cs:35` shows is settable) would be
accepted silently and produce the catastrophic outcomes the design doc warns
against.
**Recommendation**
Add validation — data annotations (`[Range]` for `MinNrOfMembers`, etc.) plus an
`IValidateOptions<ClusterOptions>` implementation that enforces
`MinNrOfMembers == 1`, restricts `SplitBrainResolverStrategy` to a known set,
requires `SeedNodes` non-empty, and asserts `HeartbeatInterval <
FailureDetectionThreshold` and positive `StableAfter`. Register it with
`ValidateOnStart()` so misconfiguration fails fast at boot.
**Resolution**
Confirmed: `ClusterOptions` had no validation of any kind, and the design doc's
catastrophic-misconfiguration values (`MinNrOfMembers: 2`, a quorum split-brain
strategy) would have been bound silently.
**Resolved** — fixing commit `commit pending`, date 2026-05-16. Added
`ClusterOptionsValidator : IValidateOptions<ClusterOptions>`, which enforces
`MinNrOfMembers == 1`, restricts `SplitBrainResolverStrategy` to the
`keep-oldest`-only allowed set, requires a non-empty `SeedNodes`, requires positive
`StableAfter` / `HeartbeatInterval` / `FailureDetectionThreshold`, and asserts
`HeartbeatInterval < FailureDetectionThreshold`. It accumulates every failure into one
result. It is registered by `AddClusterInfrastructure()` (CI-002) as a singleton
`IValidateOptions<ClusterOptions>`, so a misconfigured section throws
`OptionsValidationException` on the first `IOptions<ClusterOptions>.Value` resolution
— which `AkkaHostedService` performs during startup, giving the fail-fast-at-boot
behaviour the recommendation asks for without the src project taking a dependency on
the full `Microsoft.Extensions.DependencyInjection` package needed for the
`ValidateOnStart()` overload. Data annotations were not used — a single
`IValidateOptions` implementation expresses the interdependent timing rules that
attributes cannot. Covered by `ClusterOptionsValidatorTests` (8 cases) and
`ServiceCollectionExtensionsTests.AddClusterInfrastructure_ValidatorRejectsBadOptionsAtResolution`.
### ClusterInfrastructure-005 — No configuration section name constant for the Options pattern binding
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3` |
**Description**
CLAUDE.md specifies per-component configuration via `appsettings.json` sections
bound with the Options pattern. `ClusterOptions` provides no `public const string
SectionName` (or equivalent) for the binding site to reference, so whichever code
binds the section must hard-code the magic string, and there is no single source of
truth for the section name. Because `AddClusterInfrastructure` is itself a no-op
(CI-002), the options class is currently bound nowhere at all, making the missing
constant easy to overlook.
**Recommendation**
Add a `public const string SectionName = "Cluster";` (or the agreed name) to
`ClusterOptions` and have the eventual `AddClusterInfrastructure` bind
`configuration.GetSection(ClusterOptions.SectionName)` against it.
**Resolution**
_Unresolved._
### ClusterInfrastructure-006 — No tests for any cluster behaviour; only the options POCO is covered
| | |
|--|--|
| Severity | Medium |
| Category | Testing coverage |
| Status | Resolved |
| Location | `tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs:1-51` |
**Description**
The test project contains only `ClusterOptionsTests`, exercising default values and
property setters of `ClusterOptions`. There are no tests for cluster formation,
leader election, failover detection, split-brain resolution, singleton handover, or
the `ServiceCollectionExtensions` registration methods — none can exist because the
behaviour itself is absent (CI-001). This is recorded so the testing gap is tracked
alongside the implementation gap: the most safety-critical paths of the entire
system (failover, split-brain, dual-node recovery) are completely untested. The
test at lines 30-50 also asserts that `SplitBrainResolverStrategy` can be set to
`"keep-majority"`, implicitly endorsing a value the design doc forbids for a
two-node cluster — see CI-004.
**Recommendation**
When CI-001 is implemented, add multi-node `Akka.Cluster.TestKit` /
`MultiNodeTestKit` tests covering cluster formation, failover promotion,
split-brain downing, and singleton handover, plus unit tests for HOCON generation
from `ClusterOptions` and for the options validation from CI-004.
**Resolution**
Re-triaged in light of CI-001's resolution. The Akka bootstrap, HOCON generation,
cluster formation, failover and singleton handover are owned by `ScadaLink.Host`, not
this project — multi-node `Akka.Cluster.TestKit` tests for that behaviour belong in the
Host's test suite, outside this module's scope. What this module legitimately owns is
`ClusterOptions`, its validator, and the DI registration, and the testing gap there is
now closed.
**Resolved** — fixing commit `commit pending`, date 2026-05-16. Added two test classes
to `tests/ScadaLink.ClusterInfrastructure.Tests`: `ClusterOptionsValidatorTests`
(8 cases — valid defaults pass; `MinNrOfMembers != 1`, unsupported split-brain
strategies, empty seed nodes, heartbeat not below the failure threshold, non-positive
`StableAfter` all fail; and a multi-failure accumulation case) and
`ServiceCollectionExtensionsTests` (3 cases — `AddClusterInfrastructure` registers the
validator, the validator rejects bad options at `IOptions` resolution, and
`AddClusterInfrastructureActors` throws). The pre-existing `ClusterOptionsTests` was
extended with `DownIfAlone` coverage. The test project gained references to
`Microsoft.Extensions.DependencyInjection` and `Microsoft.Extensions.Options`. Module
test suite green: 16 passed (was 3). Note: the `keep-majority` value used in the
pre-existing `ClusterOptionsTests.Properties_CanBeSetToCustomValues` is intentionally
left — that test exercises the POCO's property setter (the POCO accepts any string by
design); `ClusterOptionsValidator` is the layer that now rejects `keep-majority`, and
`UnsupportedSplitBrainStrategy_FailsValidation` proves it.
### ClusterInfrastructure-007 — ClusterOptions lacks XML documentation comments
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:3-11` |
**Description**
`ClusterOptions` and each of its six properties have no XML doc comments. Peer
options classes such as `StoreAndForward/StoreAndForwardOptions.cs` document the
class and every property (including units and design-doc references). For a class
whose values carry the cluster-wide consequences described in the design doc
(notably `MinNrOfMembers` and `SplitBrainResolverStrategy`), the absence of inline
documentation is a maintainability and safety gap — a future editor has no in-code
warning that `MinNrOfMembers` must stay `1`.
**Recommendation**
Add `<summary>` comments to the class and each property, stating units and the
documented constraints (e.g. that `MinNrOfMembers` must be `1`, that
`HeartbeatInterval` must be well below `FailureDetectionThreshold`), referencing
the relevant design-doc sections as peer modules do.
**Resolution**
_Unresolved._
### ClusterInfrastructure-008 — "Phase 0 skeleton" status is undocumented at the module level
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:9`, `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:16` |
**Description**
The only indication that this foundational module is unimplemented is two inline
comments inside the registration method bodies (`// Phase 0: skeleton only` /
`// Phase 0: placeholder for Akka actor registration`). There is no module README,
no `<!-- TODO -->` in the design doc, and no tracking marker visible to anyone
reading the project structure or the component table. Given that the design doc
(`Component-ClusterInfrastructure.md`) describes a fully featured component with no
caveat, a reader will reasonably assume the module is built. The mismatch between a
complete-looking design doc and an empty implementation is itself a documentation
defect.
**Recommendation**
Add a short note to the design doc (or a module-level `README.md`) stating the
current implementation status and what "Phase 0" delivers, and reference a tracked
issue for the remaining work (CI-001). Keep the README component table accurate
about which components are skeletons versus implemented.
**Resolution**
_Unresolved._
+482
View File
@@ -0,0 +1,482 @@
# Code Review — Commons
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.Commons` |
| Design doc | `docs/requirements/Component-Commons.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 8 |
## Summary
Commons is in good overall health. It is a well-organized, dependency-light library:
the architectural-constraint tests enforce the no-Akka/no-EF/no-ASP.NET rule, the
POCO-entity and message-as-record conventions, and the UTC timestamp rule. The folder
and namespace hierarchy closely matches REQ-COM-5b. No Critical issues were found.
The findings cluster around three themes. First, a handful of files quietly stretch
the REQ-COM-6 "no business logic" boundary — `StaleTagMonitor`, `OpcUaEndpointConfigSerializer`,
`OpcUaEndpointConfigValidator`, `ScriptParameters`, `ValueFormatter`, `DynamicJsonElement`
and `ScriptArgs` all carry non-trivial behavior, and a couple have real correctness or
concurrency defects (the `StaleTagMonitor` stale-fire race, the `DynamicJsonElement`
`JsonDocument`-lifetime hazard, the silent conversion-failure swallowing in
`ScriptParameters.GetNullable`). Second, the `ManagementCommandRegistry` name mapping is
asymmetric and namespace-scoped in a way that does not match the broader set of
`*Command` records elsewhere in `Messages/`. Third, several behavior-bearing types
(`ValueFormatter`, `DynamicJsonElement`, `ScriptArgs`, `ManagementCommandRegistry`,
`Result<T>`, the OPC UA serializer round-trip) have no unit tests despite containing the
kind of edge-case logic that warrants them. Entity and message contracts otherwise look
clean and additive-evolution-friendly, with the exception of one `ValueTuple` use in a
wire command.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | `DynamicJsonElement.TryConvert` returns success for non-convertible types; `Result<T>` allows null error; legacy-config fallback loses data. |
| 2 | Akka.NET conventions | ✓ | Commons has no actors (correct). Message contracts are records and immutable. One wire message uses `ValueTuple` (Commons-008). Correlation IDs present on request/response messages. |
| 3 | Concurrency & thread safety | ✓ | `StaleTagMonitor` has a check-then-act race between the timer callback and `OnValueReceived` (Commons-001). |
| 4 | Error handling & resilience | ✓ | `ScriptParameters.GetNullable` silently swallows conversion failures (Commons-003); OPC UA legacy deserialize discards malformed input (Commons-005). |
| 5 | Security | ✓ | No auth logic here. `SmtpConfiguration.Credentials` / OPC UA passwords are plain-string fields (storage/encryption is a consumer concern) — noted, not a finding. No script-trust violations: Commons defines no forbidden-API surface. |
| 6 | Performance & resource management | ✓ | `StaleTagMonitor` disposes its `Timer` correctly. `DynamicJsonElement` references a `JsonElement` whose backing document lifetime is not owned (Commons-002). |
| 7 | Design-document adherence | ✓ | Several behavior-bearing helper/validator/serializer classes push against REQ-COM-6 "no business logic" (Commons-007). Folder layout matches REQ-COM-5b. |
| 8 | Code organization & conventions | ✓ | `ManagementCommandRegistry` naming is asymmetric/namespace-scoped (Commons-004). `DeployedConfigSnapshot`, `InstanceAlarmOverride`, `TemplateFolder`, `ISiteRepository`, several service interfaces and `Messages/Management` exist but are not listed in Component-Commons.md (Commons-009). |
| 9 | Testing coverage | ✓ | `ValueFormatter`, `DynamicJsonElement`, `ScriptArgs`, `ManagementCommandRegistry`, `Result<T>`, `ConfigurationDiff`, `AlarmContext`, and the OPC UA serializer round-trip have no tests (Commons-010). |
| 10 | Documentation & comments | ✓ | `OpcUaEndpointConfigSerializer.Deserialize` XML doc does not mention the silent data-loss path (Commons-005). `Component-Commons.md` is stale relative to the actual file set (Commons-009). `ValueFormatter` uses current-culture formatting without documenting it (Commons-012). |
## Findings
### Commons-001 — `StaleTagMonitor` stale-fire race between timer and `OnValueReceived`
| | |
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.Commons/Types/StaleTagMonitor.cs:42-46`, `:62-67` |
**Description**
`OnValueReceived` sets `_staleFired = false` then calls `_timer.Change(...)`, while the
timer callback `OnTimerElapsed` reads `_staleFired`, sets it to `true`, and invokes the
`Stale` event. `_staleFired` is `volatile`, which guarantees visibility but not
atomicity of the check-then-set. The two methods run on different threads (a
value-arrival thread and a `ThreadPool` timer thread). If the timer callback has already
passed the `if (_staleFired) return;` check when `OnValueReceived` runs, `Stale` fires
even though a fresh value just arrived — a spurious staleness signal. There is also a
window where `OnValueReceived` resets `_staleFired` and reschedules the timer while a
callback for the previous period is mid-flight, so `Stale` can fire once per period as
documented but at the wrong moment. For a heartbeat monitor feeding connection-health
decisions, a false stale signal can trigger an unnecessary reconnect.
**Recommendation**
Guard the state transition with a lock, or replace the `_staleFired` bool with an
`Interlocked.CompareExchange` on an `int` so only one of "fire" / "reset" wins. The
callback should atomically test-and-set; `OnValueReceived` should atomically reset and
only then reschedule the timer.
**Resolution**
Resolved 2026-05-16 (commit pending) — confirmed the race against the source. Replaced
the `volatile bool` guard with a lock-protected monotonic generation token: `Start`,
`OnValueReceived` and `Stop` each bump the generation under a gate, and the timer
callback only raises `Stale` if its scheduled generation still matches. `OnValueReceived`
now recreates the timer (rather than `Change`-ing it) so the rescheduled callback carries
the new token. A superseded or stopped period can no longer emit a spurious staleness
signal. Regression tests added in `StaleTagMonitorRaceTests` (deterministic via an
internal `CallbackEnteredHook` test seam).
### Commons-002 — `DynamicJsonElement` retains a `JsonElement` whose `JsonDocument` lifetime it does not own
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.Commons/Types/DynamicJsonElement.cs:10-17` |
**Description**
`DynamicJsonElement` stores a `JsonElement` and exposes it for deferred, dynamic access
from scripts. A `JsonElement` is only valid while the `JsonDocument` that produced it has
not been disposed; accessing a `JsonElement` after its document is disposed throws
`ObjectDisposedException`. Nothing in `DynamicJsonElement` keeps the document alive or
documents that the caller must. Because the wrapper is explicitly designed for
"convenient property access in scripts" — i.e. access at an arbitrary later time — a
caller that wraps an element from a `using var doc = JsonDocument.Parse(...)` block (the
exact pattern used in `OpcUaEndpointConfigSerializer`) will hand scripts a wrapper that
faults on first member access.
**Recommendation**
Either clone the element on construction with `JsonElement.Clone()` (which detaches it
from the document and makes it safe to retain), or hold a reference to the owning
`JsonDocument` and implement `IDisposable`. Document the lifetime contract on the type
regardless.
**Resolution**
Resolved 2026-05-16 (commit pending) — confirmed the hazard: `ExternalCallResult.Response`
constructs the wrapper from `JsonDocument.Parse(...).RootElement` with no reference kept
to the document, so deferred script-time access could fault. Fixed at the root by cloning
the element with `JsonElement.Clone()` in the `DynamicJsonElement` constructor, detaching
it from the owning document; the public constructor signature is unchanged. Added a
remarks block documenting the lifetime contract. Regression tests added in
`DynamicJsonElementTests` (access after the source document is disposed / GC-collected).
### Commons-003 — `ScriptParameters.GetNullable` silently swallows conversion failures
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.Commons/Types/ScriptParameters.cs:72-86` |
**Description**
`GetNullable<T>` catches `ScriptParameterException` from `ConvertScalar` and returns
`default!` (null) "on conversion failure for nullable". This conflates two distinct
cases: a parameter that is genuinely absent/null, and a parameter that is *present but
holds an unconvertible value* (e.g. `Get<int?>("count")` when `count` is the string
`"banana"`). The latter is almost always a script or caller bug, and silently mapping it
to `null` hides it — the script then proceeds with a null it interprets as "not
supplied". The non-nullable `Get<T>` and the array/list paths correctly throw with a
descriptive message for the same bad input, so the behavior is also inconsistent across
the API surface. The XML doc states "returns null if missing, null, or unconvertible",
so the behavior is intentional, but it remains a footgun.
**Recommendation**
Distinguish "absent/null" from "present but unconvertible": return null only for the
former and throw `ScriptParameterException` for the latter, mirroring the array/list
element handling. If the swallowing must stay for compatibility, at minimum surface it
(e.g. an out-of-band warning) rather than failing silently.
**Resolution**
Resolved 2026-05-16 (commit pending) — confirmed the silent-swallow path against the
source. Removed the `catch (ScriptParameterException)` block in `GetNullable<T>`: an
absent or explicitly-null parameter still returns `null`, but a parameter that is
*present but holds an unconvertible value* now throws `ScriptParameterException` with a
descriptive message, consistent with `Get<T>()` and the array/list element paths. The
`Get<T>` XML doc was corrected accordingly. This is a deliberate behavioral change toward
correctness — the previous behavior masked caller/script bugs; the type-level public
contract is unchanged. Regression tests added in `ScriptParametersTests`
(`Get_NullableInt_PresentButUnparsable_Throws` and siblings).
### Commons-004 — `ManagementCommandRegistry` name mapping is asymmetric and namespace-scoped
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Resolved |
| Location | `src/ScadaLink.Commons/Messages/Management/ManagementCommandRegistry.cs:14-35` |
**Description**
`BuildRegistry` registers only types in the exact `ScadaLink.Commons.Messages.Management`
namespace whose names end in `Command`. `GetCommandName(Type)`, however, strips a
`Command` suffix from *any* type passed to it. The two halves disagree:
- `GetCommandName` will happily compute a command name for `*Command` records that live
in other `Messages/` sub-namespaces (`DeployInstanceCommand` in `Messages.Deployment`,
`DisableInstanceCommand` in `Messages.Lifecycle`, `SetStaticAttributeCommand` in
`Messages.Instance`, `DeployArtifactsCommand` in `Messages.Artifacts`, etc.), yet
`Resolve` will return `null` for every one of those names because they were never
registered.
- Because of this gap the Management namespace carries deliberately renamed duplicates
(`MgmtDeployInstanceCommand`, `MgmtEnableInstanceCommand`, `MgmtDisableInstanceCommand`,
`MgmtDeleteInstanceCommand` in `InstanceCommands.cs`) whose `Mgmt` prefix exists only
to dodge a collision the registry's namespace filter already prevents — a confusing,
undocumented coupling.
A round-trip `Resolve(GetCommandName(t))` is therefore not guaranteed to return `t`,
which is the implicit contract of a name registry.
**Recommendation**
Make the two methods symmetric: either scan all of `Messages/` (and detect/throw on
duplicate stripped names, since `ToFrozenDictionary` will throw on a collision) or
restrict `GetCommandName` to types the registry actually contains. Document the chosen
scope, and reconsider whether the `Mgmt*` prefixed duplicates are still needed.
**Resolution**
Resolved 2026-05-16 (commit pending) — confirmed the asymmetry: `GetCommandName` stripped
`Command` from any type while `BuildRegistry` only registered the `Messages.Management`
namespace. In practice no defect was observed because every command type the CLI and
ManagementService actually use is in `Messages.Management` (a round-trip test over all
registered commands confirms no name collision). Closed the asymmetry by making
`GetCommandName` registry-bound: it now looks up a reverse `Type→name` frozen dictionary
built from the same registry and throws `ArgumentException` for any unregistered type, so
`Resolve(GetCommandName(t)) == t` holds for every type it accepts. Added an XML remarks
block documenting the registry scope and the symmetry guarantee. The `Mgmt*` prefixed
records were left in place — they are the genuine Management-namespace command types the
CLI constructs and renaming them would change wire command names (out of scope for a
behavior-preserving fix; noted for a future cleanup). CLI, ManagementService, and
SiteRuntime all build clean against the change. Regression tests added in
`ManagementCommandRegistryTests`.
### Commons-005 — `OpcUaEndpointConfigSerializer.Deserialize` discards malformed legacy input and over-reports `IsLegacy`
| | |
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.Commons/Serialization/OpcUaEndpointConfigSerializer.cs:25-51` |
**Description**
When the typed-deserialize path fails or the JSON lacks `endpointUrl`, `Deserialize`
falls through to `LoadLegacy`. If `LoadLegacy` itself throws `JsonException` (genuinely
malformed JSON), the method returns `(new OpcUaEndpointConfig(), IsLegacy: true)` — a
default, empty config with the legacy flag set. The original stored string is silently
discarded, and the caller is told it is a recoverable "legacy" row when in fact the data
was unparseable. A form built on the documented `IsLegacy` contract ("prompt the user to
re-save") will present an empty config as if it were the user's saved configuration,
inviting them to overwrite real (if malformed) data with blanks. The XML doc only
describes the happy legacy path and does not mention this data-loss branch.
**Recommendation**
Distinguish "parsed as legacy" from "could not parse at all" — e.g. return a third state
or throw for genuinely malformed input so the caller can surface an error instead of an
empty form. Update the XML doc to describe the failure branch.
**Resolution**
_Unresolved._
### Commons-006 — `DynamicJsonElement.TryConvert` reports success for unconvertible target types
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.Commons/Types/DynamicJsonElement.cs:47-51`, `:66-76` |
**Description**
`TryConvert` does `result = ConvertTo(binder.Type); return result != null || binder.Type == typeof(object);`.
`ConvertTo` returns `null` for any type/kind pair it does not handle (e.g. requesting
`int` from a JSON string, or `DateTime` from anything). For a non-`object` target this
yields `result == null` and `return false`, which is correct. But the `|| binder.Type == typeof(object)`
clause makes `(object)dynamicElement` succeed with a `null` result even when the wrapped
element is, say, a JSON object or a non-null string — the cast silently produces `null`
instead of the element or its value. Any script doing `object o = jsonThing;` gets `null`
for a present value. The conversion of a present, non-null JSON value should never yield
`null`.
**Recommendation**
For the `object` target, return the element itself (or `Wrap(_element)`) rather than
`null`. Only return `null` when the wrapped element is genuinely `JsonValueKind.Null`.
**Resolution**
_Unresolved._
### Commons-007 — Several Commons types carry non-trivial logic, stretching REQ-COM-6
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.Commons/Types/ScriptParameters.cs`, `src/ScadaLink.Commons/Serialization/OpcUaEndpointConfigSerializer.cs`, `src/ScadaLink.Commons/Validators/OpcUaEndpointConfigValidator.cs`, `src/ScadaLink.Commons/Types/StaleTagMonitor.cs`, `src/ScadaLink.Commons/Types/ScriptArgs.cs` |
**Description**
REQ-COM-6 states Commons "must contain only data structures, interfaces, enums, and
constants" and "must not contain any business logic", with method bodies "limited to
trivial data-access logic". Several files exceed that: `ScriptParameters` performs typed
conversion with reflection and JSON-element unwrapping; `OpcUaEndpointConfigSerializer`
implements a multi-shape (typed + legacy flat-dict) serialization strategy;
`OpcUaEndpointConfigValidator` encodes OPC UA domain rules (e.g. `LifetimeCount` ≥ 3×
`KeepAliveCount`); `StaleTagMonitor` runs a `Timer` and raises events; `ScriptArgs`
reflects over arbitrary objects. The `ArchitecturalConstraintTests` "no service/actor"
heuristic only counts public methods (> 3) and so does not catch these. This is design
drift, not a defect — but it should be a deliberate decision: either move these helpers
into the components that own the behavior (Data Connection Layer, Site Runtime,
Template Engine) or amend Component-Commons.md to explicitly permit "pure stateless
helpers/validators".
**Recommendation**
Decide and document the policy. If these are intentionally allowed in Commons, add a
sentence to REQ-COM-6 carving out pure validators/serializers/parsers; otherwise relocate
them. Tighten the architectural test if the rule is meant to be enforced.
**Resolution**
_Unresolved._
### Commons-008 — `SetConnectionBindingsCommand` uses `ValueTuple` in a wire message contract
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.Commons/Messages/Management/InstanceCommands.cs:10` |
**Description**
`SetConnectionBindingsCommand` declares
`IReadOnlyList<(string AttributeName, int DataConnectionId)> Bindings`. The tuple element
names are compile-time-only; `System.Text.Json` serializes a `ValueTuple` as `Item1` /
`Item2`, and the message is positional with no room for additive evolution (you cannot
add a third field without changing the tuple type, which REQ-COM-5a forbids). Every other
message in `Messages/` uses named records. A management command travels over the
ClusterClient boundary and is exactly the kind of contract REQ-COM-5a's additive-only
rule targets.
**Recommendation**
Replace the tuple with a small named record, e.g.
`record ConnectionBinding(string AttributeName, int DataConnectionId)`, and use
`IReadOnlyList<ConnectionBinding>`.
**Resolution**
_Unresolved._
### Commons-009 — `Component-Commons.md` is stale relative to the actual file set
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `docs/requirements/Component-Commons.md:61-198` |
**Description**
The design doc's entity list, repository list, and folder tree no longer match the code:
- Entities present but undocumented: `DeployedConfigSnapshot`, `InstanceAlarmOverride`,
`TemplateFolder`.
- Repository interface present but undocumented: `ISiteRepository` (the doc lists seven
repositories under REQ-COM-4; the code has eight).
- Service interfaces present but undocumented: `IDatabaseGateway`,
`IExternalSystemClient`, `IInstanceLocator`, `INotificationDeliveryService` — REQ-COM-4a
documents only `IAuditService`.
- Whole namespaces absent from the REQ-COM-5b folder tree: `Messages/Management`,
`Messages/DataConnection`, `Messages/Integration`, `Messages/Instance`,
`Messages/RemoteQuery`, plus `Types/DataConnections`, `Types/Scripts`, `Serialization/`,
and `Validators/`.
CLAUDE.md's editing rules require the design docs to stay in sync with the code; the doc
is now a partial map.
**Recommendation**
Refresh Component-Commons.md to enumerate the current entities, repository and service
interfaces, and the actual `Types/`, `Messages/`, `Serialization/`, and `Validators/`
folders.
**Resolution**
_Unresolved._
### Commons-010 — Behavior-bearing Commons types have no unit tests
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.Commons.Tests/` |
**Description**
`ScadaLink.Commons.Tests` covers `Result`, `RetryPolicy`, `ScriptParameters`,
`StaleTagMonitor`, the OPC UA validator, enums, message conventions, compatibility, and
entity conventions. It does not cover several types that contain exactly the kind of
edge-case logic that warrants tests:
- `ValueFormatter` — scalar vs collection vs null formatting.
- `DynamicJsonElement` — member/index access and conversions; the issues in Commons-002 and
Commons-006 would have been caught by tests.
- `ScriptArgs.Normalize` — dictionary/anonymous-object/primitive-rejection paths.
- `ManagementCommandRegistry` — `Resolve` / `GetCommandName` round-trip (would have
surfaced Commons-004).
- `Result<T>` — `Match`, failure/success accessors, error-on-misuse.
- `OpcUaEndpointConfigSerializer` typed↔flat round-trip and legacy fallback.
- `ConfigurationDiff` / `AlarmContext` / `ScriptScope` — minor, but `HasChanges` /
`HasParent` logic is untested.
**Recommendation**
Add focused unit tests for the helper/utility types above, prioritizing
`DynamicJsonElement`, `ScriptArgs`, `ManagementCommandRegistry`, and the OPC UA serializer
round-trip.
**Resolution**
_Unresolved._
### Commons-011 — `Result<T>.Failure` accepts a null error string
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.Commons/Types/Result.cs:15-20`, `:30-32`, `:36` |
**Description**
`Result<T>.Failure(string error)` and the private failure constructor do not validate
`error`. A caller passing `null` produces a failed `Result` whose `Error` getter returns
`null` via `_error!`, and whose `Match` calls `onFailure(_error!)` with `null`. `Result`
is the system-wide error-handling type ("consistent error handling across component
boundaries"); a failed result with no error message defeats its purpose and pushes a
`NullReferenceException` risk onto every consumer that logs or displays `Error`.
**Recommendation**
Throw `ArgumentNullException` (or `ArgumentException` for empty/whitespace) in
`Failure`/the failure constructor so a failed `Result` always carries a message.
**Resolution**
_Unresolved._
### Commons-012 — `ValueFormatter` uses current-culture formatting without documenting it
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.Commons/Types/ValueFormatter.cs:20-27` |
**Description**
`FormatDisplayValue` formats `IFormattable` values (and collection elements) with the
parameterless `ToString()`, which uses the current thread culture. The XML doc calls this
"the value's natural string representation" without noting the culture dependency. The
same numeric or `DateTime` attribute value will render differently depending on the
server/UI locale — e.g. decimal separators, date order. CLAUDE.md mandates UTC for
timestamps and notes local-time conversion is "a UI display concern only"; if
`ValueFormatter` is used outside a UI rendering context (e.g. logging, event-log entries,
diff display) the culture-dependent output is inconsistent and a latent bug.
**Recommendation**
Decide whether `ValueFormatter` is a UI-only helper. If it can be used outside the UI,
format with `CultureInfo.InvariantCulture` (using the `IFormattable.ToString(null, IFormatProvider)`
overload). Either way, document the culture behavior on the method.
**Resolution**
_Unresolved._
+490
View File
@@ -0,0 +1,490 @@
# Code Review — Communication
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.Communication` |
| Design doc | `docs/requirements/Component-Communication.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 3 |
## Summary
The Communication module is generally well-structured and matches the design doc's
two-transport model (ClusterClient for command/control, gRPC server-streaming for
real-time data). The actors keep mutable state on the actor thread, use `PipeTo` for
async work, and the gRPC server/client lifecycle is mostly disciplined. However the
review found several High and Medium issues clustered around two themes:
**(a) gRPC subscription bookkeeping races** — `SiteStreamGrpcClient` overwrites and
removes subscription entries by correlation ID without disposal or ownership checks,
so reconnect cycles leak `CancellationTokenSource`es and can cancel the wrong stream;
and **(b) missing supervision strategy** on the coordinator actors, contrary to the
CLAUDE.md "Resume for coordinator actors" decision. Design-doc adherence is otherwise
good. Test coverage is broad for happy paths but has gaps around failover, cache
mutation races, and the snapshot-timeout cleanup path.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | Snapshot-timeout orphan, reconnect not calling `CleanupGrpc`, subscription-map races. |
| 2 | Akka.NET conventions | ✓ | No supervision strategy on coordinators; `Sender` captured in async-launched closure path. |
| 3 | Concurrency & thread safety | ✓ | `SiteStreamGrpcClient._subscriptions` overwrite/remove race; `_siteClients` field reassignment unused but non-readonly. |
| 4 | Error handling & resilience | ✓ | gRPC reconnect leaks server-side relay; `LoadSiteAddressesFromDb` swallows DB failures silently. |
| 5 | Security | ✓ | No findings in module code. DebugStreamHub auth lives outside this module (Central UI). |
| 6 | Performance & resource management | ✓ | Orphaned subscriptions/CTS leaks; `SiteStreamGrpcClientFactory.Dispose` blocks on async. |
| 7 | Design-document adherence | ✓ | `GrpcMaxStreamLifetime` / keepalive options defined but never applied; hard-coded values used instead. |
| 8 | Code organization & conventions | ✓ | Options pattern correct; minor: public records declared in actor files. No structural issues. |
| 9 | Testing coverage | ✓ | No tests for snapshot-timeout cleanup, address-cache refresh races, or gRPC server reconnect-leak. |
| 10 | Documentation & comments | ✓ | XML comment on `DebugStreamBridgeActor` says "Persistent actor" — it is not an Akka.Persistence actor. |
## Findings
### Communication-001 — Early stream termination escapes StartStreamAsync's narrow exception handling
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/DebugStreamService.cs:130-143` |
**Re-triaged 2026-05-16:** originally filed Critical, claiming an orphaned bridge actor
and a multi-minute site-side resource leak on every snapshot timeout. On verification
that impact does **not** occur: `DebugStreamBridgeActor` calls `CleanupGrpc()` and
`Context.Stop(Self)` on every path that invokes `onTerminated` (site disconnect, gRPC
max-retries, `ReceiveTimeout`), so it always self-terminates and releases its gRPC
subscription; and the pure-timeout path does reach `StopStream`, which also stops it.
The genuine defect described below is an error-handling gap, not a leak — severity
corrected to Medium.
**Description**
`StartStreamAsync` awaits the initial snapshot inside a `try` whose only handler is
`catch (OperationCanceledException)`. When the stream terminates before the snapshot
arrives, `onTerminatedWrapper` completes the await via
`snapshotTcs.TrySetException(new InvalidOperationException(...))`. That
`InvalidOperationException` is not an `OperationCanceledException`, so it escapes the
catch entirely: the caller (Blazor debug view / SignalR hub) receives a raw,
untranslated exception, and `StartStreamAsync` performs no teardown of its own on that
path — it relies implicitly on the bridge actor self-terminating. Cleanup from the
service side is therefore not deterministic, and the failure surfaced to the caller is
not a meaningful, documented result.
**Recommendation**
In `StartStreamAsync`, catch any exception from the snapshot await, deterministically
tear down the bridge actor (`Tell(StopDebugStream)` via the local actor reference, since
a racing `onTerminatedWrapper` may already have removed the session entry), and translate
the failure into a meaningful exception for the caller.
**Resolution**
Resolved 2026-05-16. The `catch (OperationCanceledException)`-only block in
`StartStreamAsync` was replaced with `catch (Exception)`: it removes the session entry,
sends `StopDebugStream` to the bridge actor via the local reference (idempotent — the
actor may already be stopping itself), and throws a descriptive exception —
`TimeoutException` for the 30s timeout, otherwise an `InvalidOperationException` that
names the instance/site and wraps the underlying cause. Regression test
`DebugStreamServiceTests.StartStreamAsync_StreamTerminatesBeforeSnapshot_ThrowsMeaningfulException`
fails against the pre-fix code and passes after. Fixed by the commit whose message
references `Communication-001`.
### Communication-002 — gRPC reconnect does not unsubscribe the previous stream, leaking site-side relay actors
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:170`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:143` |
**Description**
On a gRPC stream error, `HandleGrpcError` increments the retry count, flips
`_useNodeA`, and schedules `OpenGrpcStream`. `OpenGrpcStream` cancels and disposes
`_grpcCts` and starts a fresh `SubscribeInstance` call — but it never calls
`client.Unsubscribe(_correlationId)` on the *old* node's client, and the site-side
`SiteStreamGrpcServer` keys active streams by `correlation_id` only. Because the new
subscription goes to the *other* node (`_useNodeA` flipped), the old node's
`SiteStreamGrpcServer` still has an active stream + `StreamRelayActor` +
`SiteStreamManager` subscription for that correlation ID. The old node only learns the
client is gone via TCP RST or keepalive — exactly the failure mode that triggered the
reconnect (network partition / silent node), so detection may take ~25s or never. Each
reconnect can therefore leave a zombie relay actor on the failed node. `CleanupGrpc`
(which *does* call `Unsubscribe`) is only invoked on terminal paths, not between
reconnect attempts.
**Recommendation**
Before reconnecting in `HandleGrpcError` / at the top of `OpenGrpcStream`, call
`Unsubscribe(_correlationId)` on the client for the *previous* endpoint (the one that
just failed) so the local CTS is cancelled and — where the channel is still alive —
the gRPC cancellation reaches the site and stops the relay actor.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source:
`HandleGrpcError` flipped `_useNodeA` and scheduled `OpenGrpcStream` without ever
unsubscribing the failed stream, leaving the old node's `StreamRelayActor` zombie until
TCP/keepalive timeout. Fix: `HandleGrpcError` now resolves the client for the
*previous* endpoint (before flipping `_useNodeA`) and calls `Unsubscribe(_correlationId)`
on it, so the local CTS is cancelled and gRPC cancellation reaches the still-alive site.
Regression test `DebugStreamBridgeActorTests.On_GrpcError_Unsubscribes_Old_Stream_Before_Reconnect`
fails against the pre-fix code and passes after.
### Communication-003 — SiteStreamGrpcClient subscription map overwritten without disposal; reconnect can cancel the wrong stream
| | |
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:77`, `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:106` |
**Description**
`SubscribeAsync` does `_subscriptions[correlationId] = cts;` (line 77),
unconditionally overwriting any existing entry for that correlation ID without
cancelling or disposing the previous `CancellationTokenSource`. The `finally` block
then does `_subscriptions.TryRemove(correlationId, out _)` (line 106) which removes
the entry **by key only, regardless of which CTS is stored**. Because
`DebugStreamBridgeActor` reuses the same `_correlationId` across reconnect attempts
(and `SiteStreamGrpcClientFactory` returns the same `SiteStreamGrpcClient` for a site
even after a node flip), two `SubscribeAsync` calls can briefly share a correlation
ID. The first call's `finally` then removes the *second* call's CTS entry, so a later
`Unsubscribe(correlationId)` finds nothing and the live stream is never cancelled — an
orphan. Conversely the overwritten CTS is leaked (never disposed).
**Recommendation**
When inserting, cancel+dispose any prior CTS for that correlation ID. In the `finally`,
remove only if the stored CTS is the one this call created (use the
`TryRemove(KeyValuePair)` overload, mirroring what `SiteStreamGrpcServer` already does
with `StreamEntry`). Consider keying subscriptions by a per-call GUID rather than the
caller-supplied correlation ID.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against source: the
inline `_subscriptions[correlationId] = cts` overwrote a prior CTS without
cancel/dispose (leak), and the `finally`'s `TryRemove(correlationId, out _)` removed by
key only — a racing reconnect's live CTS could be removed by the prior call's `finally`,
orphaning the live stream. Fix: extracted two internal helpers used by `SubscribeAsync` —
`RegisterSubscription` cancels+disposes any existing CTS for the correlation ID before
inserting, and `RemoveSubscription` uses the `ConcurrentDictionary.TryRemove(KeyValuePair)`
overload so it removes only the CTS that call created (mirroring `SiteStreamGrpcServer`'s
`StreamEntry` pattern). Regression tests
`SiteStreamGrpcClientTests.RegisterSubscription_ReusedCorrelationId_CancelsAndDisposesPriorCts`
and `SiteStreamGrpcClientTests.RemoveSubscription_OnlyRemovesOwnCts_NotAReplacement`
fail against the pre-fix logic and pass after.
### Communication-004 — Coordinator actors declare no SupervisorStrategy (design requires Resume)
| | |
|--|--|
| Severity | Medium |
| Category | Akka.NET conventions |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:42`, `src/ScadaLink.Communication/Actors/SiteCommunicationActor.cs:22` |
**Description**
CLAUDE.md ("Explicit supervision strategies: Resume for coordinator actors, Stop for
short-lived execution actors") requires coordinator actors to use an explicit `Resume`
supervision strategy. `CentralCommunicationActor` and `SiteCommunicationActor` are
long-lived coordinators (they own the per-site ClusterClient map, debug
subscriptions, in-progress deployments) but neither overrides `SupervisorStrategy`.
They fall back to the Akka default (`OneForOneStrategy` with `Restart`). A child fault
— e.g. a `ClusterClient` child of `CentralCommunicationActor` created by
`DefaultSiteClientFactory` — would `Restart` under the default strategy, and any
exception in the coordinator itself would restart it, wiping `_siteClients`,
`_debugSubscriptions`, and `_inProgressDeployments` silently. The design intent is
`Resume` so transient child faults do not discard coordinator state.
**Recommendation**
Override `SupervisorStrategy` on both actors to return an explicit
`OneForOneStrategy` with `Directive.Resume` (or the project's standard coordinator
strategy), matching the documented decision and other coordinator actors.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: neither
`CentralCommunicationActor` nor `SiteCommunicationActor` overrode `SupervisorStrategy`,
so child faults fell back to the Akka default (`Restart`). Note that an actor's own
`SupervisorStrategy` governs its *children* — a transient child fault would `Restart`
the child and discard its in-memory state, contrary to the CLAUDE.md "Resume for
coordinator actors" decision. Fix: both actors now override `SupervisorStrategy()` to
return a `OneForOneStrategy` with an unbounded `Decider` resolving to `Directive.Resume`
(mirroring `DataConnectionManagerActor`). Regression tests
`CoordinatorSupervisionTests.CentralCommunicationActor_SupervisorStrategy_IsResume` and
`CoordinatorSupervisionTests.SiteCommunicationActor_SupervisorStrategy_IsResume` fail
against the pre-fix code (decider yields `Restart`) and pass after.
### Communication-005 — gRPC keepalive and max-stream-lifetime options are defined but never applied
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:25`, `src/ScadaLink.Communication/CommunicationOptions.cs:36` |
**Description**
`CommunicationOptions` exposes `GrpcKeepAlivePingDelay`, `GrpcKeepAlivePingTimeout`,
`GrpcMaxStreamLifetime`, and `GrpcMaxConcurrentStreams`, and the design doc's
"gRPC Connection Keepalive" section explicitly states these are configurable. However
`SiteStreamGrpcClient`'s constructor hard-codes `KeepAlivePingDelay =
TimeSpan.FromSeconds(15)` and `KeepAlivePingTimeout = TimeSpan.FromSeconds(10)`
instead of reading the options. `GrpcMaxStreamLifetime` (the documented "Session
timeout — 4 hours" third layer of dead-client detection) is not referenced anywhere —
`SiteStreamGrpcServer.SubscribeInstance` creates a linked CTS from the call
cancellation token only, with no `CancelAfter`. The 4-hour zombie-stream safety net
described in the design doc does not exist in code. `GrpcMaxConcurrentStreams` is also
not wired to the server (`SiteStreamGrpcServer` takes a `maxConcurrentStreams`
constructor parameter defaulting to 100, but nothing binds the option to it).
**Recommendation**
Flow `CommunicationOptions` into `SiteStreamGrpcClient` and `SiteStreamGrpcServer`
(via the factory / DI). Apply `GrpcKeepAlivePingDelay` / `GrpcKeepAlivePingTimeout` to
the `SocketsHttpHandler`, bind `GrpcMaxConcurrentStreams` to the server's limit, and
implement the `GrpcMaxStreamLifetime` session timeout with `CancelAfter` on the
server-side stream CTS — or, if the 4-hour cap is intentionally dropped, remove the
option and update the design doc.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: `SiteStreamGrpcClient`
hard-coded the keepalive values, `GrpcMaxStreamLifetime` was referenced nowhere, and
`GrpcMaxConcurrentStreams` was never bound to the server. Fix (scoped to
`src/ScadaLink.Communication`): `SiteStreamGrpcClient` gained a constructor taking
`CommunicationOptions` and now applies `GrpcKeepAlivePingDelay`/`GrpcKeepAlivePingTimeout`
to its `SocketsHttpHandler`; `SiteStreamGrpcClientFactory` gained an
`IOptions<CommunicationOptions>` DI constructor and flows the options into every client
it creates; `SiteStreamGrpcServer` gained an `IOptions<CommunicationOptions>` DI
constructor that binds `GrpcMaxConcurrentStreams` and implements the documented 4-hour
session timeout via `CancellationTokenSource.CancelAfter(GrpcMaxStreamLifetime)` on the
per-stream CTS. The Host's existing `AddSingleton<SiteStreamGrpcServer>()` registration
resolves the new DI constructor via greedy resolution — no Host change required.
Regression tests `GrpcOptionsWiringTests.SiteStreamGrpcClient_AppliesKeepAliveFromOptions`,
`GrpcOptionsWiringTests.SiteStreamGrpcClientFactory_FlowsOptionsToCreatedClients`, and
`GrpcOptionsWiringTests.SiteStreamGrpcServer_BindsMaxConcurrentStreamsAndLifetimeFromOptions`
exercise the wiring (they require the new members to even compile).
### Communication-006 — Site address load failures are silently swallowed, leaving a stale cache
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:204` |
**Description**
`LoadSiteAddressesFromDb` runs the repository query inside `Task.Run(...).PipeTo(self)`.
If `GetAllSitesAsync` throws (database unavailable, transient connection error), the
faulted task is piped to `Self` as a `Status.Failure`. `CentralCommunicationActor` has
no `Receive<Status.Failure>` handler, so the failure becomes an unhandled message
(logged at debug, not surfaced) and the periodic refresh silently fails. If the
*first* startup load fails the actor runs with an empty `_siteClients` map — every
`SiteEnvelope` is dropped (line 187) and every Ask times out with no indication of the
root cause.
**Recommendation**
Add a `Receive<Status.Failure>` handler that logs the load failure at Warning/Error
level so operators can distinguish "site has no addresses configured" from "database
is down". Optionally surface a health metric for repeated load failures.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: a faulted
`LoadSiteAddressesFromDb` task is piped to `Self` as a `Status.Failure`, but the actor
had no handler for it — the failure became an unhandled message (debug-level only) and
the periodic refresh failed silently. Fix: added a `Receive<Status.Failure>` handler
that logs the load failure at `Warning` with the underlying exception as the cause, so
operators can distinguish a missing-addresses configuration from a database outage.
Regression test
`CentralCommunicationActorTests.LoadSiteAddressesFailure_IsLoggedNotSilentlySwallowed`
(repository query throws) asserts the Warning is emitted — it fails against the
pre-fix code (which emits no warning) and passes after.
### Communication-007 — `SiteStreamGrpcClientFactory.Dispose` blocks on async work (sync-over-async)
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClientFactory.cs:53` |
**Description**
`Dispose()` calls `DisposeAsync().AsTask().GetAwaiter().GetResult()`. This is the
classic sync-over-async pattern: it blocks the calling thread until all per-site
`SiteStreamGrpcClient.DisposeAsync` calls complete. If `Dispose` is invoked from a
context with a single-threaded synchronization context or from DI container shutdown
on a constrained thread pool, this can deadlock or stall host shutdown. The class
already implements `IAsyncDisposable`.
**Recommendation**
Prefer registering and disposing the factory through `IAsyncDisposable` only (modern
.NET DI honours it for singletons). If a synchronous `Dispose` must remain, dispose
the underlying `GrpcChannel`s directly (synchronous) rather than blocking on the async
path, or document why blocking is safe here.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: `Dispose()` called
`DisposeAsync().AsTask().GetAwaiter().GetResult()`, the classic sync-over-async pattern.
Fix: `SiteStreamGrpcClient` now also implements `IDisposable` with a synchronous
`Dispose()` that releases its CancellationTokenSources and underlying `GrpcChannel`
directly (all of that teardown is inherently synchronous); `SiteStreamGrpcClientFactory.Dispose()`
now disposes each cached client via that synchronous path with no blocking on the async
path. A `CreateClient` seam was extracted so the test can substitute a tracking client
while still exercising the factory's real caching/disposal machinery. Regression test
`SiteStreamGrpcClientFactoryDisposeTests.Dispose_DisposesClientsSynchronously_NotViaAsyncPath`
fails against the pre-fix code (clients disposed via `DisposeAsync`) and passes after;
`Dispose_DoesNotDeadlock_UnderSingleThreadedSynchronizationContext` guards the stall path.
### Communication-008 — Reconnect retry-count reset can mask a flapping stream indefinitely
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:71`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:174` |
**Description**
`_retryCount` is reset to 0 every time a single `AttributeValueChanged` or
`AlarmStateChanged` event is received (lines 72, 77). Combined with `MaxRetries = 3`,
a stream that connects, delivers exactly one event, then fails — repeatedly — will
reconnect forever. The design doc states "max 3 retries, terminate the session if all
retries fail"; the current logic only terminates after 3 *consecutive* failures with
zero intervening events, so a flapping site never trips the limit and the debug
session (and its site-side relay) lives on indefinitely. The `ReceiveTimeout` orphan
safety net is also reset by every received message, so it does not bound this case either.
**Recommendation**
Either reset `_retryCount` only after the stream has been stably connected for some
minimum duration (e.g. a timer armed on stream open, cancelled on the next error), or
keep a separate cumulative reconnect counter / time window that bounds total
reconnects regardless of intervening events.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: `_retryCount` was reset to
0 on every received `AttributeValueChanged`/`AlarmStateChanged`, so a stream that
connected, delivered one event, then failed — repeatedly — never tripped `MaxRetries`.
Fix (recommendation option a): the per-event reset was removed; instead `OpenGrpcStream`
arms a single `StabilityWindow` timer (60s default, internal-settable for tests), and
only when it fires (`GrpcStreamStable`) — i.e. the stream stayed up long enough to be
considered recovered — is `_retryCount` reset. `HandleGrpcError` cancels that timer, so
a stream that fails before the window elapses does not recover its retry budget. A
flapping stream therefore terminates after `MaxRetries` regardless of intervening
events. Regression test
`DebugStreamBridgeActorTests.FlappingStream_DeliveringEventsBetweenFailures_StillTerminatesAfterMaxRetries`
fails against the pre-fix code (actor never terminates) and passes after;
`RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow` verifies the budget
is recovered after a stable interval. The pre-existing test that codified the buggy
per-event reset (`Grpc_Error_Resets_RetryCount_On_Successful_Event`) was replaced.
### Communication-009 — `_siteClients` field is mutable and reassignable; cache update is not atomic on failure
| | |
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:53`, `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:240` |
**Description**
`_siteClients` is a non-`readonly` `Dictionary` field. It is only mutated on the actor
thread (correct), but the field is needlessly reassignable, and
`HandleSiteAddressCacheLoaded` mutates it in place across several loops. If
`ActorPath.Parse` throws on a malformed address mid-loop (e.g. a site row with a
garbage `NodeAAddress`), the method aborts partway through, having already stopped
some ClusterClients and added others — leaving the cache partially updated with no
recovery until the next 60s refresh. The actor's other mutable collections
(`_debugSubscriptions`, `_inProgressDeployments`) are correctly `readonly`.
**Recommendation**
Mark `_siteClients` `readonly`. Validate/parse all addresses up front (or wrap
`ActorPath.Parse` in a try/catch that logs and skips the bad site) so a single
malformed site record cannot abort the whole refresh and leave a half-updated cache.
**Resolution**
_Unresolved._
### Communication-010 — `DebugStreamBridgeActor` XML doc incorrectly describes it as a "Persistent actor"
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:10` |
**Description**
The class summary opens with "Persistent actor (one per active debug session)...".
The actor derives from `ReceiveActor`, not a persistent actor base class, holds no
`PersistenceId`, and writes no journal/snapshot. "Persistent" is misleading — debug
sessions are explicitly "session-based and temporary" per the design doc. A reader
could assume state survives restart, which it does not.
**Recommendation**
Reword the summary to "Long-lived (per active debug session) actor on the central
side..." or similar, removing the word "Persistent".
**Resolution**
_Unresolved._
### Communication-011 — No test coverage for snapshot-timeout cleanup, address-cache failure, or gRPC reconnect leak
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.Communication.Tests/` (module-wide) |
**Description**
The test suite covers happy-path routing, handler-not-registered failures, heartbeat
bumping, cache refresh, and gRPC bridge reconnect/retry. However several critical
paths identified in this review have no coverage:
- The `DebugStreamService.StartStreamAsync` snapshot-timeout path (Communication-001)
— no test verifies bridge actor / site subscription teardown on timeout, nor the
`onTerminated`-before-snapshot race that throws a non-`OperationCanceledException`.
- `CentralCommunicationActor` behaviour when `LoadSiteAddressesFromDb` faults
(Communication-006) — `RefreshSiteAddresses_UpdatesCache` only exercises success.
- `SiteStreamGrpcClient` subscription-map overwrite/removal race (Communication-003)
and gRPC reconnect not unsubscribing the old node (Communication-002).
- A malformed `NodeAAddress` aborting `HandleSiteAddressCacheLoaded` (Communication-009).
**Recommendation**
Add tests for: snapshot timeout / pre-snapshot termination cleanup; address-load
failure logging and empty-cache behaviour; reusing a correlation ID across
`SubscribeAsync` calls; and a malformed site address during cache refresh.
**Resolution**
_Unresolved._
@@ -0,0 +1,485 @@
# Code Review — ConfigurationDatabase
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.ConfigurationDatabase` |
| Design doc | `docs/requirements/Component-ConfigurationDatabase.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 6 |
## Summary
The ConfigurationDatabase module is a focused, conventional EF Core data-access layer:
a single `ScadaLinkDbContext`, Fluent API entity configurations, eight repository
implementations of Commons-defined interfaces, an `IAuditService` implementation, an
`IInstanceLocator`, environment-aware migration handling, and design-time tooling
support. Overall structure adheres well to the design doc and the CLAUDE.md "Code
Organization" decisions — POCO entities and interfaces live in Commons, EF mappings and
implementations live here, Fluent API only, and optimistic concurrency is correctly
applied to `DeploymentRecord` via `rowversion`. The module is generally healthy.
The main themes across findings are: (1) a genuine logic bug in
`GetTemplateWithChildrenAsync`, which loads child templates and then discards them, so
the method does not deliver what its name implies; (2) secret-bearing columns (SMTP
credentials, external-system auth config, database connection strings) persisted in
plaintext with no encryption-at-rest; (3) a hardcoded SQL `sa` connection string with a
password literal embedded in `DesignTimeDbContextFactory`; (4) the no-arg
`AddConfigurationDatabase()` overload, which silently registers nothing, making a
misconfigured central node fail late and opaquely; and (5) audit-trail robustness gaps —
`AuditService` can throw on serializing entities with navigation cycles, rolling back
the whole business operation, and the design doc's claim that audit `Id` is `Long/GUID`
disagrees with the `int` entity. Test coverage is good for the repositories that have
tests (Security, CentralUI, audit, concurrency, seed data, data protection) but several
repositories (`TemplateEngineRepository`, `DeploymentManagerRepository`,
`ExternalSystemRepository`, `InboundApiRepository`, `NotificationRepository`,
`SiteRepository`, `InstanceLocator`) have little or no direct coverage.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | `GetTemplateWithChildrenAsync` discards loaded children (CD-001); `GetApprovedKeysForMethodAsync` CSV parsing is brittle (CD-008). |
| 2 | Akka.NET conventions | ✓ | No actors in this module; data-access layer only. No issues found. |
| 3 | Concurrency & thread safety | ✓ | DbContext correctly scoped; optimistic concurrency on `DeploymentRecord` correct. Repositories hold no shared mutable state. No issues found. |
| 4 | Error handling & resilience | ✓ | `WaitForDatabaseReadyAsync` is sound. No-arg DI overload fails late and silently (CD-003); audit JSON serialization failure handling (CD-007). |
| 5 | Security | ✓ | Hardcoded `sa` credential literal (CD-002); SMTP/DB-connection/auth secrets stored unencrypted (CD-004). |
| 6 | Performance & resource management | ✓ | `GetAllTemplatesAsync` / `GetTemplateTreeAsync` eager-load multiple collections without `AsSplitQuery` (CD-009). No N+1 in audited paths. |
| 7 | Design-document adherence | ✓ | Audit `Id` type mismatch vs design doc (CD-005); seed data uses `HasData` consistent with design. |
| 8 | Code organization & conventions | ✓ | Mostly clean. `Grpc*` address columns unbounded (CD-006); inconsistent null-guard on injected context (CD-011). |
| 9 | Testing coverage | ✓ | Several repositories and `InstanceLocator` lack direct tests (CD-010). |
| 10 | Documentation & comments | ✓ | `DeploymentManagerRepository` "WP-24 stub" XML comment is stale; noted in module context but not raised as a standalone finding. No issues found beyond items above. |
## Findings
### ConfigurationDatabase-001 — `GetTemplateWithChildrenAsync` loads child templates then discards them
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/TemplateEngineRepository.cs:30-41` |
**Description**
`GetTemplateWithChildrenAsync` queries for all templates whose `ParentTemplateId`
equals the requested id, assigns the result to the local variable `children`, and
then returns `template` — the `children` list is never used, attached to the returned
object, or otherwise exposed. The method is therefore behaviourally identical to
`GetTemplateByIdAsync` but issues an extra database round-trip. Any caller relying on
the method name to obtain a template with its derived/child templates populated will
silently receive a template with no children, leading to incorrect template-resolution
or UI behaviour with no error.
**Recommendation**
Either populate the children onto the returned aggregate (e.g. project into a result
type that carries the children, or load them into a navigation collection that is
actually returned), or remove the dead query and the misleading method if children are
not in fact needed. If the navigation does not exist on the `Template` entity, add an
explicit result tuple/DTO so the loaded data reaches the caller.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: the
method ran a `Where(t => t.ParentTemplateId == id)` query, assigned the result to a
local `children` variable, and never used it — a misleading no-op that also issued an
extra database round-trip per call.
Triage of the three callers (`FlatteningPipeline.BuildTemplateChainAsync`,
`ManagementActor.HandleGetTemplate`, `ManagementActor.HandleValidateTemplate`) showed
none consume derived/sub-templates; they all need the template's *member* collections
(Attributes/Alarms/Scripts/Compositions), which `GetTemplateByIdAsync` already
eager-loads. The `Template` entity has no child-templates navigation collection, and
adding one (plus changing the interface signature) would require editing
`ScadaLink.Commons`, which is outside this module's scope.
Fix applied the recommendation's secondary option: removed the dead query so the
method no longer misleads or wastes a round-trip, and added an XML doc comment
clarifying that "children" means the template's member collections. The method now
honestly delegates to `GetTemplateByIdAsync`. Regression tests added in
`TemplateEngineRepositoryTests.cs`:
`GetTemplateWithChildrenAsync_ReturnsTemplateWithAllMemberCollectionsPopulated`,
`GetTemplateWithChildrenAsync_PreservesParentTemplateId_ForInheritanceChainWalk`, and
`GetTemplateWithChildrenAsync_ReturnsNull_WhenTemplateDoesNotExist` — pinning the
template-aggregate contract the callers depend on.
### ConfigurationDatabase-002 — Hardcoded `sa` connection string with embedded password literal
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/DesignTimeDbContextFactory.cs:21-22` |
**Description**
`DesignTimeDbContextFactory` falls back to a literal connection string
`"Server=localhost,1433;Database=ScadaLink_Config;User Id=sa;Password=YourPassword;TrustServerCertificate=True"`
when no configured connection string is found. Embedding a credential literal (even a
placeholder) in source code is a poor pattern: it is committed to version control,
encourages copy-paste of `sa`/`TrustServerCertificate=True` into real environments, and
the fallback can mask a genuine misconfiguration during `dotnet ef` operations by
silently pointing tooling at an unintended database.
**Recommendation**
Remove the hardcoded fallback. If no connection string is resolved from configuration
or environment, throw a clear `InvalidOperationException` instructing the developer to
set `ScadaLink:Database:ConfigurationDb` (or an environment variable). At minimum, read
the design-time connection string from an environment variable rather than a literal,
and never use `sa`.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: the factory
fell back to a literal `User Id=sa;Password=YourPassword;...` connection string when no
configured value was found. Removed the hardcoded fallback entirely. The factory now
resolves the connection string from the Host's appsettings files or, when those are not
present, from the `SCADALINK_DESIGNTIME_CONNECTIONSTRING` environment variable, and
throws a clear `InvalidOperationException` (naming both the config key and the env var)
when neither yields a value. Also hardened `SetBasePath` to be applied only when the
`ScadaLink.Host` directory exists, so the factory degrades cleanly instead of throwing
`DirectoryNotFoundException` when run from a context without a sibling Host folder.
Regression tests added in `DesignTimeDbContextFactoryTests.cs`:
`CreateDbContext_NoConnectionStringConfigured_ThrowsClearException`,
`CreateDbContext_ConnectionStringFromEnvironmentVariable_IsUsed`, and
`DesignTimeDbContextFactory_SourceContainsNoHardcodedSaCredential`.
### ConfigurationDatabase-003 — No-arg `AddConfigurationDatabase()` silently registers nothing
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs:44-49` |
**Description**
The parameterless `AddConfigurationDatabase()` overload is a deliberate no-op "retained
for backward compatibility during migration." If a central node is wired up with this
overload by mistake, no `ScadaLinkDbContext`, repositories, `IAuditService`, or
`IInstanceLocator` are registered. The failure does not surface at startup; it surfaces
much later as opaque DI resolution exceptions the first time any consumer requests a
repository — far from the actual misconfiguration. The XML comment also refers to
"Phase 0 stubs," which is stale relative to the current state of the module.
**Recommendation**
Either delete the no-op overload now that the connection-string overload exists, or
mark it `[Obsolete]` with an error-level message so misuse is a compile-time failure.
If a true "site node" no-op is genuinely required, give it an explicit, self-documenting
name (e.g. `AddConfigurationDatabaseNoOp()`), and remove the stale "Phase 0" wording.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: the
parameterless `AddConfigurationDatabase()` overload returned `services` unchanged,
registering no `DbContext`, repositories, `IAuditService`, or `IInstanceLocator`.
Applied the recommendation's second option (`[Obsolete]` as an error): the overload is now marked
`[Obsolete(..., error: true)]` so any source reference is a compile-time failure, and
its body throws `InvalidOperationException` with an actionable message as
defence-in-depth (covering reflection-based invocation or suppressed warnings). The
stale "Phase 0 stubs / backward compatibility" XML comment was replaced with one
explaining the obsoletion. The pre-existing
`ServiceRegistrationTests.AddConfigurationDatabase_NoArgs_DoesNotThrow` test in
`UnitTest1.cs`, which encoded the old buggy no-op contract, was updated to
`AddConfigurationDatabase_NoArgs_FailsFast` to assert the corrected behaviour.
New regression tests added in `ServiceCollectionExtensionsTests.cs`:
`AddConfigurationDatabase_NoArgOverload_FailsFastWithClearMessage` and
`AddConfigurationDatabase_NoArgOverload_IsMarkedObsoleteAsError`.
### ConfigurationDatabase-004 — Secret-bearing columns stored in plaintext with no protection
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/NotificationConfiguration.cs:56-57`, `src/ScadaLink.ConfigurationDatabase/Configurations/ExternalSystemConfiguration.cs:25-26,75-77` |
**Description**
`SmtpConfiguration.Credentials`, `ExternalSystemDefinition.AuthConfiguration`, and
`DatabaseConnectionDefinition.ConnectionString` all hold authentication secrets (SMTP
OAuth2 client secrets / passwords, external-system API keys or Basic Auth credentials,
and database passwords respectively). They are mapped as ordinary string columns and
persisted verbatim. Anyone with read access to the configuration database — including
audit-log JSON if these entities are serialized into `AfterStateJson` — obtains the
plaintext secrets. The design doc does not call out encryption-at-rest for these
fields, so the design is also silent on a real risk.
**Recommendation**
Apply encryption to these fields, e.g. an EF Core value converter backed by ASP.NET
Data Protection (the module already configures `IDataProtectionKeyContext`), or rely on
SQL Server Always Encrypted / column encryption. Separately, ensure `IAuditService`
callers never pass these secret-bearing entities (or that the serializer redacts the
fields) so secrets do not leak into `AuditLogEntry.AfterStateJson`. Update the design
doc to state the chosen at-rest protection.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source:
`SmtpConfiguration.Credentials`, `ExternalSystemDefinition.AuthConfiguration`, and
`DatabaseConnectionDefinition.ConnectionString` were mapped as ordinary `nvarchar(4000)`
columns and persisted verbatim.
Implemented the recommendation's first option — an in-module EF Core value converter
backed by ASP.NET Data Protection, which the module already uses
(`IDataProtectionKeyContext`, `AddDataProtection().PersistKeysToDbContext`). Added
`EncryptedStringConverter` (purpose-scoped `IDataProtector`; `Protect` on write,
`Unprotect` on read; null-safe; surfaces a clear message on a `CryptographicException`).
`ScadaLinkDbContext` gained an `(options, IDataProtectionProvider)` constructor and
applies the converter to the three secret columns in `OnModelCreating`; the DI
registration in `ServiceCollectionExtensions` now constructs the context with the
registered provider. The secret columns were widened to `HasMaxLength(8000)` (EF maps
this to `nvarchar(max)` on SQL Server) so ciphertext expansion cannot truncate the
value; migration `20260517010521_EncryptSecretColumns` carries the column-type change.
Regression tests added in `SecretEncryptionTests.cs` verify the raw column value is
never the plaintext secret and that EF transparently decrypts on read, for all three
columns plus a null round-trip.
The encryption scheme itself is fully in-module; the only remaining cross-cutting item
is a documentation gap — the design doc does not yet state encryption-at-rest for these
fields. That doc update is outside this module's editable scope (constraint: edit only
`src/ScadaLink.ConfigurationDatabase`, the tests, and this file) and is surfaced here
for a follow-up to `docs/requirements/Component-ConfigurationDatabase.md`. The audit
secret-leak concern is mitigated separately by CD-007's serializer hardening; whether
callers should additionally redact secret-bearing entities before passing them to
`IAuditService` is a caller-side concern in other modules and is also surfaced for
follow-up. The code fix in this module is complete.
### ConfigurationDatabase-005 — Audit `Id` type disagrees with the design doc
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/AuditConfiguration.cs:11` (entity `src/ScadaLink.Commons/Entities/Audit/AuditLogEntry.cs`) |
**Description**
The design doc's Audit Entry Schema table specifies `Id` as `Long / GUID`, and notes
the audit table is append-only and retained indefinitely. The actual `AuditLogEntry`
entity uses an `int` identity key. For a never-purged, append-only table that
accumulates one row per save operation across the system lifetime, a 32-bit identity
risks overflow over a long deployment horizon, and the code drifts from the documented
schema.
**Recommendation**
Change `AuditLogEntry.Id` to `long` (and the corresponding migration column to
`bigint`) to match the design doc and remove the overflow risk, or — if `int` is
intentional — update the design doc's schema table to say `int` and justify it.
Resolve the discrepancy in one direction.
**Resolution**
_Unresolved._
### ConfigurationDatabase-006 — `Site.GrpcNodeAAddress` / `GrpcNodeBAddress` columns are unbounded
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/SiteConfiguration.cs:24-25` |
**Description**
`SiteConfiguration` explicitly sets `HasMaxLength(500)` for `NodeAAddress` and
`NodeBAddress`, but the entity also has `GrpcNodeAAddress` and `GrpcNodeBAddress`
(added per the gRPC streaming design decision) which are not configured at all. With no
length set, EF Core maps them to `nvarchar(max)`. This is inconsistent with the sibling
address columns, wastes the opportunity to constrain input, and `nvarchar(max)` columns
cannot be used as index key columns and have different storage/performance characteristics.
**Recommendation**
Add `builder.Property(s => s.GrpcNodeAAddress).HasMaxLength(500);` and the same for
`GrpcNodeBAddress`, matching the existing `NodeAAddress`/`NodeBAddress` mapping, and
generate a migration to alter the column types.
**Resolution**
_Unresolved._
### ConfigurationDatabase-007 — `AuditService` does not handle JSON-serialization failure of arbitrary `afterState`
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/Services/AuditService.cs:28-30` |
**Description**
`LogAsync` serializes the caller-supplied `afterState` object with
`JsonSerializer.Serialize(afterState)` using default options. EF entity POCOs commonly
have navigation properties; serializing an entity that has loaded navigations (e.g. a
`Template` with `Attributes`/`Scripts`, or any entity with a cycle) will throw
`JsonException` for a reference cycle or produce a very large payload. Because audit
writes are designed to commit in the same transaction as the change, a serialization
exception thrown here will roll back the *entire* business operation — a template
update fails because its audit entry could not be serialized. This couples audit
robustness to the shape of every entity passed in.
**Recommendation**
Configure `JsonSerializerOptions` with `ReferenceHandler.IgnoreCycles` (or
`Preserve`) and a sensible `MaxDepth`, and consider serializing a projected
DTO/snapshot rather than the live tracked entity. Decide explicitly whether an audit
serialization failure should fail the operation or be logged and degraded gracefully,
and document that decision against the design doc's transactional-guarantee section.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: `LogAsync`
called `JsonSerializer.Serialize(afterState)` with default options, so any `afterState`
graph containing a reference cycle threw `JsonException` — and because the audit entry
commits in the same transaction as the change it records, that exception rolled back
the entire business operation.
Fix applied per the recommendation: `AuditService` now serializes via a static
`JsonSerializerOptions` configured with `ReferenceHandler.IgnoreCycles` and
`MaxDepth = 32`. The serialization is additionally wrapped in a `SerializeAfterState`
helper that catches a residual `JsonException`/`NotSupportedException` and substitutes a
small diagnostic placeholder JSON (`AuditSerializationError` + `StateType`) — an explicit
decision that an audit-serialization failure must **degrade gracefully** and never roll
back the audited operation. The audit entry is always recorded; the design doc's
transactional-guarantee section ("if the change succeeds, the audit entry is always
recorded") is thereby honoured even for pathological state objects. Regression test
added in `AuditServiceTests.cs`:
`LogAsync_AfterStateWithReferenceCycle_DoesNotThrow_AndDoesNotRollBackOperation`.
### ConfigurationDatabase-008 — `GetApprovedKeysForMethodAsync` CSV parsing silently drops malformed ids
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/InboundApiRepository.cs:46-58` |
**Description**
`ApiMethod.ApprovedApiKeyIds` is stored as a comma-separated string of integer ids.
`GetApprovedKeysForMethodAsync` splits it, maps each token with
`int.TryParse(...) ? id : -1`, then filters with `id > 0`. Any token that fails to
parse, or a legitimately negative/zero id, is silently discarded. If `ApprovedApiKeyIds`
becomes corrupt (e.g. a stray name instead of an id), the method quietly returns fewer
approved keys than expected, which for an API-key authorization path means a method may
unexpectedly reject a key that should be approved. Storing a relational many-to-many as
a CSV string in a column is itself fragile (no FK integrity, no cascade on key delete).
**Recommendation**
Short term: log a warning when a token fails to parse instead of silently dropping it,
so corruption is observable. Longer term: replace the CSV column with a proper join
table (`ApiMethodApprovedKey`) with foreign keys to `ApiMethod` and `ApiKey`, which
gives referential integrity and correct cascade behaviour when an API key is deleted.
**Resolution**
_Unresolved._
### ConfigurationDatabase-009 — Multi-collection eager loads issue cartesian-product queries
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/TemplateEngineRepository.cs:43-51,53-61`, `src/ScadaLink.ConfigurationDatabase/Repositories/CentralUiRepository.cs:45-55` |
**Description**
`GetAllTemplatesAsync`, `GetTemplatesComposingAsync`, and `GetTemplateTreeAsync` each
`Include` three-to-four sibling collections (`Attributes`, `Alarms`, `Scripts`,
`Compositions`) in a single query. EF Core's default single-query strategy produces a
cartesian-product join across those collections, so a template with N attributes, M
alarms, and K scripts yields N×M×K rows that EF must then de-duplicate. For templates
with many members this materially inflates the result set and query time.
`GetInstanceByIdAsync`/`GetAllInstancesAsync` have the same shape with three
collections.
**Recommendation**
Add `.AsSplitQuery()` to these multi-collection-include queries (or set
`UseQuerySplittingBehavior(QuerySplittingBehavior.SplitQuery)` globally in
`AddConfigurationDatabase`) so each collection is loaded with a separate query and the
cartesian explosion is avoided.
**Resolution**
_Unresolved._
### ConfigurationDatabase-010 — Several repositories and `InstanceLocator` lack direct test coverage
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/TemplateEngineRepository.cs`, `Repositories/DeploymentManagerRepository.cs`, `Repositories/ExternalSystemRepository.cs`, `Repositories/InboundApiRepository.cs`, `Repositories/NotificationRepository.cs`, `Repositories/SiteRepository.cs`, `Services/InstanceLocator.cs` |
**Description**
The test project covers `SecurityRepository`, `CentralUiRepository`, `AuditService`,
optimistic concurrency, seed data, and Data Protection persistence. There are no direct
tests for `TemplateEngineRepository` (the largest repository, and the one with the
CD-001 bug, which a test would have caught), `DeploymentManagerRepository` (including
its `Local`-then-stub delete fallback and the `DeleteInstanceAsync`
restrict-FK-cleanup logic), `ExternalSystemRepository`, `InboundApiRepository` (notably
`GetApprovedKeysForMethodAsync` CSV parsing — CD-008), `NotificationRepository`,
`SiteRepository` (including its stub-attach delete path), or `InstanceLocator`.
**Recommendation**
Add repository-level tests using the existing `SqliteTestHelper` pattern, covering at
minimum: CRUD round-trips, the stub-attach delete fallbacks in
`DeploymentManagerRepository`/`SiteRepository`, `DeleteInstanceAsync`'s explicit
deployment-record cleanup, `GetApprovedKeysForMethodAsync` with valid/malformed CSV,
and `InstanceLocator.GetSiteIdForInstanceAsync` for found/not-found cases.
**Resolution**
_Unresolved._
### ConfigurationDatabase-011 — Inconsistent constructor null-guarding across repositories/services
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/ExternalSystemRepository.cs:11-14`, `Repositories/InboundApiRepository.cs:11-14`, `Repositories/NotificationRepository.cs:11-14`, `Services/InstanceLocator.cs:13-16` |
**Description**
`SecurityRepository`, `CentralUiRepository`, `TemplateEngineRepository`,
`DeploymentManagerRepository`, `SiteRepository`, and `AuditService` all guard their
injected `ScadaLinkDbContext` with `?? throw new ArgumentNullException(...)`.
`ExternalSystemRepository`, `InboundApiRepository`, `NotificationRepository`, and
`InstanceLocator` assign the constructor argument directly with no guard. This is a
minor consistency/maintainability issue: although the DI container will not normally
supply null, the divergence makes the codebase look unfinished and means a future
hand-constructed instance fails with a less informative `NullReferenceException` later.
**Recommendation**
Apply the same `?? throw new ArgumentNullException(nameof(context))` guard in the four
inconsistent constructors so all data-access types behave uniformly.
**Resolution**
_Unresolved._
@@ -0,0 +1,632 @@
# Code Review — DataConnectionLayer
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.DataConnectionLayer` |
| Design doc | `docs/requirements/Component-DataConnectionLayer.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 2 |
## Summary
The DataConnectionLayer is a reasonably well-structured module: the Become/Stash
lifecycle state machine, the captured-`Self` marshalling of background-thread
disconnect events, and the protocol-factory abstraction all follow the design doc
and Akka.NET conventions. However, the review found one **critical** actor-model
violation — `HandleSubscribe` spawns a `Task.Run` that mutates the actor's private
dictionaries and counters from a thread-pool thread, racing with the actor's own
message loop. Several **high**-severity issues cluster around concurrency and error
handling: the subscription-failure path leaves the connection with degraded subtrees
but no real recovery, the `DataConnectionManagerActor`'s `Restart` supervision drops
all subscription state on a connection-actor crash, and `RealOpcUaClient`'s monitored-
item callback dictionary is mutated without synchronization while OPC UA notification
threads read it. The remaining findings concern stale health counters after failover,
an unused `WriteTimeout` option (writes are unbounded despite the design promising a
30 s timeout), `ReadBatchAsync` aborting mid-batch, and documentation drift between
the design doc's failover state machine and the implemented unstable-disconnect
heuristic. Test coverage is adequate for the happy paths and failover but absent for
tag-resolution retry, disconnect/re-subscribe, and concurrency around `HandleSubscribe`.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | x | `_resolvedTags` double-counting and stale counters after failover; `ReadBatchAsync` aborts mid-batch. |
| 2 | Akka.NET conventions | x | `Task.Run` mutating actor state (critical); `Restart` supervision loses state; closures capturing `_subscriptionsByInstance`. |
| 3 | Concurrency & thread safety | x | Actor state mutated off the actor thread; `RealOpcUaClient` callback dictionary unsynchronized. |
| 4 | Error handling & resilience | x | Subscription failures not surfaced; unbounded write with no timeout; reconnect after subscribe-time failure not handled. |
| 5 | Security | x | `AutoAcceptUntrustedCerts` defaults to `true`; OPC UA password handling acceptable. See finding 012. |
| 6 | Performance & resource management | x | `HandleUnsubscribe` O(n^2) over instances; initial-read loop serial per tag. |
| 7 | Design-document adherence | x | Failover heuristic (unstable-disconnect count) differs from documented state machine; `WriteTimeout` documented but unused. |
| 8 | Code organization & conventions | x | No issues found — POCOs in Commons, options class owned by component, factory pattern consistent. |
| 9 | Testing coverage | x | No tests for tag-resolution retry, disconnect/re-subscribe, bad-quality push, or `HandleSubscribe` concurrency. |
| 10 | Documentation & comments | x | XML comment on `RaiseDisconnected` claims thread safety it does not have; design doc round-robin description stale. |
## Findings
### DataConnectionLayer-001 — `Task.Run` in `HandleSubscribe` mutates actor state off the actor thread
| | |
|--|--|
| Severity | Critical |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:473-538` |
**Description**
`HandleSubscribe` launches a `Task.Run(async () => ...)` that runs on a thread-pool
thread and directly mutates the actor's private mutable state: `instanceTags` (a
reference into `_subscriptionsByInstance`), `_subscriptionIds`, `_totalSubscribed`,
`_resolvedTags`, and `_unresolvedTags`. All of these are simultaneously read and
written by the actor's own message loop (`HandleTagValueReceived`, `HandleUnsubscribe`,
`ReSubscribeAll`, `HandleRetryTagResolution`, `ReplyWithHealthReport`). This is a
direct violation of the Akka.NET actor model, which guarantees single-threaded access
to actor state only when state is touched on the actor thread. Two concurrent
subscribe requests, or a subscribe overlapping a `TagValueReceived` / `GetHealthReport`,
produce data races on `Dictionary`/`HashSet`/`int` — `Dictionary` is not thread-safe
and concurrent mutation can corrupt internal buckets, throw, or lose entries. It can
also produce torn reads of the health counters.
**Recommendation**
Do not mutate actor state from the background task. Perform only the `await
_adapter.SubscribeAsync(...)` / `ReadAsync(...)` I/O in the task, collect the results
into a local immutable result object, and `PipeTo(Self)` an internal message (e.g.
`SubscribeCompleted`) whose handler — running on the actor thread — applies all state
mutations and counter updates. The response to `Sender` should be sent from that
handler too.
**Resolution**
Resolved 2026-05-16. `HandleSubscribe` was restructured to follow the actor's own
`PipeTo(Self)` pattern (the one already used by `HandleRetryTagResolution`): the
background `Task.Run` now performs only adapter I/O (`SubscribeAsync`/`ReadAsync`),
collects per-tag outcomes into an immutable `SubscribeCompleted` message, and pipes
that to `Self`. All mutation of `_subscriptionIds`, `_subscriptionsByInstance`,
`_totalSubscribed`, `_resolvedTags` and `_unresolvedTags` now happens in the new
`HandleSubscribeCompleted` handler on the actor thread; it is wired into the
Connected, Connecting and Reconnecting states so an in-flight subscribe is applied
regardless of state transitions. Regression test
`DCL001_ConcurrentSubscribes_DoNotCorruptSubscriptionCounters` (30×30 concurrent
subscribes) fails against the pre-fix code and passes after. Fixed by the commit
whose message references `DataConnectionLayer-001`.
### DataConnectionLayer-002 — `Restart` supervision discards all subscription state on connection-actor crash
| | |
|--|--|
| Severity | High |
| Category | Akka.NET conventions |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionManagerActor.cs:131-141` |
**Description**
`DataConnectionManagerActor.SupervisorStrategy` returns a `OneForOneStrategy` with
`Directive.Restart` for `DataConnectionActor` failures. On restart, Akka.NET creates a
fresh actor instance, so all in-memory fields — `_subscriptionsByInstance`,
`_subscriptionIds`, `_subscribers`, `_unresolvedTags`, the quality counters — are
silently discarded. The actor re-enters `Connecting` with zero subscriptions, and the
design doc's "transparent re-subscribe" guarantee (WP-10) is broken: Instance Actors
that had subscribed before the crash never get their tags re-subscribed and will sit
at uncertain/stale quality indefinitely with no error returned. There is no durable
subscription store from which a restarted actor could rebuild state.
**Recommendation**
Either (a) make the subscription registry durable/recoverable so a restarted actor
can rebuild it (persist to local SQLite as the design doc says connection definitions
are, and have `PreStart` reload subscriptions), or (b) treat a connection-actor crash
as a lifecycle event the `DataConnectionManagerActor` notices, so it can re-issue the
subscription registrations. At minimum document that subscribers must re-register
after a crash and surface the lost-state condition rather than failing silently.
**Resolution**
Resolved 2026-05-16. The `DataConnectionManagerActor.SupervisorStrategy` was changed
from `Directive.Restart` to `Directive.Resume` for `DataConnectionActor` failures.
`Resume` keeps the existing actor instance and all its in-memory subscription state
(`_subscriptionsByInstance`, `_subscriptionIds`, `_subscribers`, quality counters)
intact across a transient handler exception, so the design doc's "transparent
re-subscribe" guarantee (WP-10) is preserved. The actor is a long-lived stateful
coordinator and its own Become/Stash reconnect state machine already recovers
connection-level faults — it does not need a restart. This also aligns with the
ScadaLink convention of `Resume` for coordinator actors. Regression test
`DCL002_ConnectionActorCrash_PreservesSubscriptionState` crashes the connection actor
via a synchronously-throwing write and asserts the subscription survives (health
report still shows 1 subscribed/resolved tag); it fails against the pre-fix `Restart`
code and passes after. Fixed by the commit whose message references
`DataConnectionLayer-002` (commit `<pending>`).
### DataConnectionLayer-003 — `RealOpcUaClient` callback/monitored-item dictionaries mutated without synchronization
| | |
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:16-17,130-131,153,163,173,183-184` |
**Description**
`_monitoredItems` and `_callbacks` are plain `Dictionary<,>` instances. They are
written from `CreateSubscriptionAsync` / `RemoveSubscriptionAsync` (invoked from the
`DataConnectionActor`'s `Task.Run` / `ContinueWith` continuations, i.e. thread-pool
threads) and from `DisconnectAsync` (`.Clear()`), while being read concurrently from
the OPC Foundation SDK's `MonitoredItem.Notification` event handler, which fires on
the SDK's internal publish threads (`_callbacks.TryGetValue(handle, ...)` at line
163). Concurrent reads during a `Dictionary` resize or `Clear()` are undefined
behaviour — they can throw `InvalidOperationException`, return wrong entries, or
corrupt the dictionary. The `DataConnectionActor`'s subscribe path already runs off
the actor thread (finding 001), so multiple subscribe calls can also race each other
here.
**Recommendation**
Use `ConcurrentDictionary<,>` for `_monitoredItems` and `_callbacks`, or guard all
access with a lock. Note that fixing finding 001 (serialising subscribe through the
actor thread) reduces but does not eliminate the race, because the SDK notification
threads still read `_callbacks` concurrently with `RemoveSubscriptionAsync` /
`DisconnectAsync`.
**Resolution**
Resolved 2026-05-16. `_monitoredItems` and `_callbacks` in `RealOpcUaClient` were
changed from plain `Dictionary<,>` to `ConcurrentDictionary<,>`, and the two
`Remove(key)` call sites switched to `TryRemove`. This makes the maps safe to read
from the OPC Foundation SDK's publish threads (`MonitoredItem.Notification` reading
`_callbacks`) concurrently with subscribe/disconnect mutations on other threads.
`RealOpcUaClient` wraps concrete OPC Foundation SDK types (`ISession`,
`Subscription`, `MonitoredItem`) and cannot be exercised without a live OPC UA
server, so the regression is guarded structurally by
`DCL003_SharedDictionaryFields_AreConcurrentCollections` (a reflection test asserting
both fields are `ConcurrentDictionary<,>`); it fails against the pre-fix `Dictionary`
code and passes after. Fixed by the commit whose message references
`DataConnectionLayer-003` (commit `<pending>`).
### DataConnectionLayer-004 — Subscribe-time tag-resolution failure leaves the connection healthy but never recovers correctly
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:495-503,529-537` |
**Description**
When `_adapter.SubscribeAsync` throws inside the `HandleSubscribe` background task,
the catch block adds the tag to `_unresolvedTags` and increments `_totalSubscribed`,
treating every subscribe exception as a tag-resolution failure. But `SubscribeAsync`
also throws `InvalidOperationException` from `EnsureConnected()` when the OPC UA
client is not connected, and throws on transport faults — these are connection
problems, not bad tag paths. They get misclassified as unresolved tags and retried on
the 10 s tag-resolution timer instead of triggering the reconnection state machine.
Worse, the design doc (Tag Path Resolution, step 2) says the failed tag's attribute
must be marked quality `bad`; the code never pushes a bad-quality update to the
subscriber for a tag that fails to resolve at subscribe time, so the Instance Actor
stays at uncertain quality with no signal. The `TagResolutionFailed` message it sends
to `Self` only logs and re-arms the timer (`HandleTagResolutionFailed`).
**Recommendation**
Distinguish connection-level exceptions (raise `AdapterDisconnected` / let the
reconnect machine handle them) from genuine node-not-found errors. For genuine
resolution failures, push a `TagValueUpdate` with `QualityCode.Bad` to the subscribing
Instance Actor so it reflects the documented behaviour.
**Resolution**
Resolved 2026-05-16. The subscribe background task now classifies each subscribe
exception via the new `IsConnectionLevelFailure` helper (`InvalidOperationException`
— thrown by `EnsureConnected()` — plus `SocketException`/`TimeoutException`/
`IOException` count as connection-level; anything else is a genuine resolution
failure). The classification is carried on `SubscribeTagResult.ConnectionLevelFailure`
and applied on the actor thread in `HandleSubscribeCompleted`: connection-level
failures no longer become unresolved tags and instead drive the reconnection state
machine (`HandleSubscribeCompleted` returns a flag and the Connected-state handler
calls `BecomeReconnecting`); genuine resolution failures still go to `_unresolvedTags`
and the retry timer, and now also push a `TagValueUpdate` with `QualityCode.Bad` to
the subscribing Instance Actor, matching the design doc's Tag Path Resolution step 2.
Regression tests `DCL004_GenuineTagResolutionFailure_PushesBadQualityToSubscriber`
and `DCL004_ConnectionLevelSubscribeFailure_TriggersReconnect_NotTagRetry` both fail
against the pre-fix code and pass after. Fixed by the commit whose message references
`DataConnectionLayer-004` (commit `<pending>`).
### DataConnectionLayer-005 — `WriteTimeout` option is documented and configured but never applied
| | |
|--|--|
| Severity | High |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/DataConnectionOptions.cs:15`, `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:573-590` |
**Description**
`DataConnectionOptions.WriteTimeout` (default 30 s) and the design doc's "Shared
Settings" table both promise a bounded timeout for synchronous device writes. The
value is never read anywhere in the module (`grep` confirms only the declaration).
`HandleWrite` calls `_adapter.WriteAsync(request.TagPath, request.Value)` with no
`CancellationToken` and no timeout. If the OPC UA server hangs (TCP black-hole, no
RST), the write `Task` never completes, `PipeTo(sender)` never fires, and the calling
script's Ask blocks until its own ask-timeout — and the script gets no DCL-level
error. The design states write failures (including timeout) must be returned
synchronously to the script; an unbounded write violates that.
**Recommendation**
Create a `CancellationTokenSource(_options.WriteTimeout)`, pass its token to
`WriteAsync`, and in the continuation translate cancellation into a failed
`WriteTagResponse` with a timeout error message. Apply the same to the read used by
the initial-value seed and to `WriteBatchAndWaitAsync` paths if they are reachable.
**Resolution**
Resolved 2026-05-16. `HandleWrite` now creates a `CancellationTokenSource(_options.WriteTimeout)`,
passes its token to `_adapter.WriteAsync(...)`, and disposes the source in the
continuation. A cancelled/timed-out write (`Task.IsCanceled` or a base
`OperationCanceledException`) is translated into a failed `WriteTagResponse` with a
`"Write timeout after Ns"` message, so a hung device write is bounded and the failure
is returned synchronously to the calling script (WP-11) instead of blocking until the
script's own Ask-timeout. (The `WriteBatchAndWaitAsync` adapter path already accepts
an explicit `timeout`/`CancellationToken` and is not invoked by `HandleWrite`, so no
change was needed there.) Regression test
`DCL005_Write_ThatHangs_TimesOutAndReturnsFailureSynchronously` uses an adapter whose
`WriteAsync` only completes when its token fires; it fails against the pre-fix
unbounded code and passes after. Fixed by the commit whose message references
`DataConnectionLayer-005` (commit `<pending>`).
### DataConnectionLayer-006 — Health quality counters not reset/recomputed after failover or re-subscribe
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:645-673,721-756` |
**Description**
`ReSubscribeAll` resets `_subscriptionIds`, `_unresolvedTags` and `_resolvedTags` to a
clean slate, but leaves `_lastTagQuality`, `_tagsGoodQuality`, `_tagsBadQuality` and
`_tagsUncertainQuality` untouched. `PushBadQualityForAllTags` (called on disconnect)
sets `_tagsBadQuality = _lastTagQuality.Count` and zeroes the others. After a
reconnect, `HandleTagValueReceived` decrements the *old* bucket using
`_lastTagQuality`'s value and increments the new one — but tags resolved for the first
time after reconnect were never in `_lastTagQuality`, so they only increment, never
decrement, and the totals can drift above `_totalSubscribed`. Over repeated
disconnect/reconnect cycles the health report's good/bad/uncertain counts become
unreliable.
**Verification note**: Confirmed against source. The root cause is broader than the
reconnect path the finding describes: `HandleUnsubscribe` also never removes a tag
from `_lastTagQuality` nor decrements its quality bucket, so an unsubscribed tag
lingers and `PushBadQualityForAllTags` (which sets `_tagsBadQuality =
_lastTagQuality.Count`) over-counts it — driving the bad-quality count above
`_totalSubscribed` even without a re-subscribe. Both the unsubscribe leak and the
re-subscribe drift are real.
**Recommendation**
On `BecomeConnected` after a re-subscribe (or in `ReSubscribeAll`), clear
`_lastTagQuality` and the three quality counters and let them be repopulated from
fresh `TagValueReceived` messages. Alternatively recompute the buckets from
`_lastTagQuality` whenever it changes rather than maintaining incremental counters.
**Resolution**
Resolved 2026-05-16 (commit pending). `HandleUnsubscribe` now removes each
unsubscribed tag from `_lastTagQuality` and decrements the corresponding quality
bucket, then reports the corrected counters via `UpdateTagQuality`/`UpdateTagResolution`;
`ReSubscribeAll` clears `_lastTagQuality` and zeroes the three quality counters so
post-reconnect tags are repopulated from fresh `TagValueReceived` messages instead of
only incrementing. Regression test
`DCL006_DisconnectAfterUnsubscribe_BadQualityCountMatchesRemainingTags` subscribes two
tags, pushes Good values, unsubscribes one, then disconnects and asserts
`PushBadQualityForAllTags` reports exactly 1 bad tag (the reconnect is gated open so
`ReSubscribeAll` does not run before the assertion); it reports 2 against the pre-fix
code and 1 after.
### DataConnectionLayer-007 — `ReadBatchAsync` aborts the whole batch on the first failing tag
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:187-195` |
**Description**
`ReadBatchAsync` loops calling `ReadAsync` per tag. `ReadAsync` re-throws any
non-cancellation exception (line 184). So if any single tag in the batch throws (bad
node, transient fault), the entire `ReadBatchAsync` throws and the caller gets no
results for the tags that *did* read successfully — even though `ReadResult` already
has a `Success`/`ErrorMessage` shape designed to carry per-tag failures. The batch is
also fully serial (one round-trip per tag), defeating the point of a batch API; the
design doc lists `ReadBatch`/`WriteBatch` as first-class operations.
**Verification note**: Confirmed against source — `ReadAsync` re-throws on any
non-`OperationCanceledException`, aborting the whole batch.
**Recommendation**
Catch per-tag exceptions inside the loop and store a failed `ReadResult` for that tag
so the batch returns a complete map. Ideally issue a single OPC UA `Read` service call
for all node IDs (`RealOpcUaClient.ReadValueAsync` already builds a
`ReadValueIdCollection` — extend it to accept multiple nodes).
**Resolution**
Resolved 2026-05-16 (commit pending). `ReadBatchAsync` now wraps each per-tag
`ReadAsync` in a try/catch: a per-tag exception is recorded as a failed `ReadResult`
(`Success: false`, message = the exception message) so the batch returns a complete
result map for every requested tag; `OperationCanceledException` is still propagated
so a cancelled batch aborts as a whole. The per-tag-serial loop and single-service-call
optimisation were deliberately left for a follow-up — they are a performance concern,
not the correctness bug this finding raised. Regression test
`DCL007_ReadBatch_ReturnsPerTagResults_WhenOneTagFails` reads three tags where the
middle one throws and asserts all three appear in the result map with the failing one
marked unsuccessful; it threw (no map returned) against the pre-fix code and passes
after.
### DataConnectionLayer-008 — `HandleUnsubscribe` is O(n^2) over instances and rechecks `_unresolvedTags` redundantly
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:540-569` |
**Description**
For each tag of the instance being removed, `HandleUnsubscribe` scans every other
instance's tag set (`_subscriptionsByInstance.Where(...).Any()`), making the operation
O(tags × instances). On a site with many instances sharing a connection this is
needlessly expensive on every instance stop/redeploy. Separately, line 562
re-evaluates `!_unresolvedTags.Contains(tagPath)` immediately after line 561 already
removed `tagPath` from `_unresolvedTags`, so the condition is always true — dead
logic that obscures intent (the decrement of `_resolvedTags` is unconditional in
practice).
**Recommendation**
Maintain a reference count per tag path (or a `tagPath -> set<instance>` reverse index)
so the "any other subscriber" check is O(1). Remove the redundant `_unresolvedTags`
re-check or restructure so the resolved/unresolved decrement reflects the tag's actual
prior state captured before removal.
**Resolution**
_Unresolved._
### DataConnectionLayer-009 — Implemented failover heuristic diverges from the documented state machine
| | |
|--|--|
| Severity | Medium — partially design-doc work outside this module's editable scope |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:189,242-297,379-449`, `docs/requirements/Component-DataConnectionLayer.md:73-85` |
**Description**
The design doc's failover state machine reads "retry active endpoint (5s) -> N failures
(>= FailoverRetryCount) -> switch to other endpoint". The code implements two *separate*
failover triggers: (a) `HandleReconnectResult` counts `_consecutiveFailures` on
connect-attempt failures (matches the doc), and (b) `BecomeReconnecting` additionally
counts `_consecutiveUnstableDisconnects` — connections that succeeded but dropped
within a hard-coded 60 s `StableConnectionThreshold` — and fails over on that count
too. The unstable-disconnect path, the 60 s threshold, and the fact that failover can
happen on *successful-but-flaky* connections are not described in the component doc at
all. A reviewer or operator reading `Component-DataConnectionLayer.md` would not
predict this behaviour, and the 60 s threshold is a magic constant not exposed via
`DataConnectionOptions`.
**Verification note**: Confirmed against source. The hard-coded
`StableConnectionThreshold = TimeSpan.FromSeconds(60)` `static readonly` field and the
`_consecutiveUnstableDisconnects` failover path both exist as described.
**Recommendation**
Update `Component-DataConnectionLayer.md` to document the unstable-disconnect failover
path and the stability threshold, and move the 60 s threshold into
`DataConnectionOptions` so it is configurable and consistent with the other tunables.
**Resolution**
Resolved 2026-05-16 (commit pending). The configurability half of the recommendation
is done: the hard-coded `StableConnectionThreshold` constant was removed from
`DataConnectionActor` and replaced with a new `DataConnectionOptions.StableConnectionThreshold`
property (60 s default), bindable from the `DataConnectionLayer` `appsettings.json`
section like `ReconnectInterval`/`TagResolutionRetryInterval`/`WriteTimeout`. Regression
test `DCL009_StableConnectionThreshold_IsConfigurable_WithSixtySecondDefault` guards
the default and the setter. **The documentation half is out of this module's editable
scope** — `docs/requirements/Component-DataConnectionLayer.md` (lines 73-85) still
describes only the connect-failure failover path and does not mention the
unstable-disconnect trigger. **Action required (surfaced):** the DCL design doc should
be updated to document the unstable-disconnect failover path and the configurable
stability threshold; that edit was deliberately not made here because this task is
scoped to `src/ScadaLink.DataConnectionLayer`, tests, and this findings file only.
### DataConnectionLayer-010 — Tag-resolution retry can issue duplicate concurrent subscribe attempts
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:594-619,689-703` |
**Description**
`HandleRetryTagResolution` fires `SubscribeAsync` for every tag in `_unresolvedTags`
via `ContinueWith(...).PipeTo(self)`, but does **not** remove the tags from
`_unresolvedTags` while the attempts are in flight. Because the tags stay in the set,
a slow `SubscribeAsync` that is still outstanding when the next 10 s tick fires gets a
second, concurrent subscribe attempt for the same tag, which can create duplicate
monitored items / leaked subscription IDs (the second success overwrites
`_subscriptionIds[tag]` in `HandleTagResolutionSucceeded`, orphaning the first handle
with no `UnsubscribeAsync` call). The timer-cancel condition in
`HandleTagResolutionSucceeded` is also non-deterministic for the same reason.
**Verification note**: Confirmed against source — `HandleRetryTagResolution` dispatched
`SubscribeAsync` for every tag in `_unresolvedTags` on every tick with no in-flight
guard.
**Recommendation**
Remove tags from `_unresolvedTags` (into an "in-flight" set) when a retry is
dispatched, and only put them back on failure. This prevents overlapping duplicate
subscribe attempts and makes the timer-cancel condition deterministic.
**Resolution**
Resolved 2026-05-16 (commit pending). A new `_resolutionInFlight` `HashSet<string>`
tracks tags whose retry `SubscribeAsync` is currently outstanding.
`HandleRetryTagResolution` now dispatches only for unresolved tags **not** already in
flight (and skips entirely if all are in flight), adding each dispatched tag to the
set; `HandleTagResolutionSucceeded` and `HandleTagResolutionFailed` remove the tag
from the set when its attempt completes, and `HandleUnsubscribe`/`ReSubscribeAll`
clear stale entries. This prevents overlapping duplicate subscribe attempts and the
resulting orphaned monitored items. Regression test
`DCL010_TagResolutionRetry_DoesNotIssueDuplicateConcurrentSubscribes` gives a tag a
genuine initial failure then a retry `SubscribeAsync` that never completes, lets six
100 ms retry ticks elapse, and asserts exactly one retry was dispatched (2 total
subscribe calls); the pre-fix code dispatched on every tick (6 total).
### DataConnectionLayer-011 — Stale subscription callbacks from disposed adapters can still reach the actor
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:486-489,278-285,416-425`, `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:252-262` |
**Description**
On failover the actor disposes the old adapter (`_adapter.DisposeAsync()`,
fire-and-forget) and creates a fresh one. The old adapter's subscription callbacks
captured `self` and `tagPath` and `Tell` `TagValueReceived` to the actor. While the
`Reconnecting` handler ignores `TagValueReceived` (line 334), once the actor reaches
`Connected` again it processes them — and a disposed adapter whose OPC UA SDK threads
have not yet fully torn down could still deliver a value, mixing pre-failover device
data with the new endpoint's data and briefly reporting a value the active endpoint
never produced. There is no per-adapter generation/epoch tag on `TagValueReceived` to
distinguish current from stale callbacks.
**Verification note**: Confirmed against source — `TagValueReceived` carried no
adapter identity, and `HandleTagValueReceived` (reachable in `Connected`) processed
any such message regardless of which adapter produced it.
**Recommendation**
Add an adapter-generation counter incremented on every adapter swap; stamp it onto
`TagValueReceived` (captured in the callback closure) and drop messages whose
generation does not match the current adapter in `HandleTagValueReceived`.
**Resolution**
Resolved 2026-05-16 (commit pending). Implemented exactly as recommended: a new
`_adapterGeneration` `int` field is incremented at both adapter-swap sites (the
unstable-disconnect failover in `BecomeReconnecting` and the connect-failure failover
in `HandleReconnectResult`). The `TagValueReceived` record gained an
`AdapterGeneration` field; every subscription callback closure (`HandleSubscribe`, the
initial-read seed, `HandleRetryTagResolution`, `ReSubscribeAll`) captures the
generation in effect at subscribe time and stamps it onto each `TagValueReceived`.
`HandleTagValueReceived` drops any message whose generation no longer matches the
current adapter, so a callback fired by a disposed adapter after failover cannot reach
an Instance Actor. Regression test
`DCL011_StaleTagValueFromOldAdapter_IsNotForwardedAfterFailover` subscribes on the
primary, fails over to the backup, then invokes the captured primary callback with a
stale value and asserts the subscriber receives nothing; against the pre-fix code the
stale value reached the subscriber, and with the fix it is dropped.
### DataConnectionLayer-012 — `AutoAcceptUntrustedCerts` defaults to `true`, accepting any server certificate
| | |
|--|--|
| Severity | Medium — full secure default also requires a Commons + design-doc change outside this module |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/IOpcUaClient.cs:17`, `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:49,60-61`, `docs/requirements/Component-DataConnectionLayer.md:116` |
**Description**
`OpcUaConnectionOptions.AutoAcceptUntrustedCerts` defaults to `true`, and
`RealOpcUaClient.ConnectAsync` wires `CertificateValidator.CertificateValidation += (_, e) => e.Accept = true`
when it is set. With the default, every server certificate is accepted unconditionally
— there is no certificate-pinning or trust-store enforcement — which defeats the
`Sign`/`SignAndEncrypt` security modes against an active man-in-the-middle on the OPC
UA link. The design doc explicitly lists `true` as the default. For an industrial
control link this is a meaningful exposure; a secure-by-default posture would reject
untrusted certs unless an operator opts in per connection.
**Verification note**: Confirmed against source. Note the *authoritative* runtime
default does not actually live on `OpcUaConnectionOptions` — for a real connection
`OpcUaDataConnection.ConnectAsync` builds `OpcUaConnectionOptions` from
`OpcUaEndpointConfig` (in `ScadaLink.Commons`), whose `AutoAcceptUntrustedCerts`
property also defaults to `true`. `OpcUaConnectionOptions`' own default is only the
fallback used when an `OpcUaConnectionOptions` is constructed directly.
**Recommendation**
Default `AutoAcceptUntrustedCerts` to `false` and require explicit per-connection
opt-in, or at minimum log a prominent warning whenever the auto-accept validator is
installed. Update the design doc to reflect the secure default.
**Resolution**
Resolved 2026-05-16 (commit pending). The two in-scope parts of the recommendation
are done: (1) `OpcUaConnectionOptions.AutoAcceptUntrustedCerts` now defaults to
`false`; (2) `RealOpcUaClient.ConnectAsync` logs a prominent `ILogger` warning
whenever the auto-accept certificate validator is installed (an `ILogger<RealOpcUaClient>`
was added as an optional constructor parameter, defaulting to `NullLogger`, so
existing callers are unaffected). Regression test
`DCL012_OpcUaConnectionOptions_AutoAcceptUntrustedCerts_DefaultsToFalse` guards the
new secure default. **Two parts remain outside this module's editable scope and are
surfaced as action required:** (a) `ScadaLink.Commons.Types.DataConnections.OpcUaEndpointConfig.AutoAcceptUntrustedCerts`
still defaults to `true` — since that is the value actually used for a real connection
(see verification note above), the Commons default must also be flipped to `false`
for the system to be secure-by-default; (b) `docs/requirements/Component-DataConnectionLayer.md`
line 116 still documents `true` as the default and must be updated. Both edits were
deliberately not made here because this task is scoped to
`src/ScadaLink.DataConnectionLayer`, tests, and this findings file only.
### DataConnectionLayer-013 — Misleading XML comment: `RaiseDisconnected` claims thread safety it does not provide
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:270-281` |
**Description**
The XML doc on `RaiseDisconnected` states "Thread-safe: only the first caller triggers
the event." The implementation is a non-atomic check-then-set on a `volatile bool`
(`if (_disconnectFired) return; _disconnectFired = true;`). `volatile` guarantees
visibility, not atomicity — two threads (e.g. the OPC UA keep-alive thread via
`OnClientConnectionLost` and a `ReadAsync` failure path) can both observe
`_disconnectFired == false` and both invoke `Disconnected`. In practice the
`DataConnectionActor` tolerates a duplicate `AdapterDisconnected` message, so impact
is low, but the comment overstates the guarantee. The same pattern exists in
`RealOpcUaClient.OnSessionKeepAlive` (`_connectionLostFired`).
**Recommendation**
Either make the guard atomic (`Interlocked.Exchange` with an `int` flag, or a lock),
or correct the comment to say "best-effort once-only; a duplicate event is possible
under a race and is tolerated downstream."
**Resolution**
_Unresolved._
+617
View File
@@ -0,0 +1,617 @@
# Code Review — DeploymentManager
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.DeploymentManager` |
| Design doc | `docs/requirements/Component-DeploymentManager.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 5 |
## Summary
The DeploymentManager module is small, well-structured, and clearly maps work
packages (WP-N) onto code. The happy paths for instance deployment, lifecycle
commands, artifact broadcast, and staleness comparison are implemented
sensibly, and the operation lock correctly serializes mutating operations per
instance while allowing cross-instance parallelism. However, the review found a
significant cluster of error-handling and resilience gaps: the deployment
record can be left permanently stuck in `InProgress` when an exception other
than timeout/cancellation is thrown, the catch block writes its failure status
using a cancellation token that may already be cancelled, and the
`OperationLockManager` leaks one `SemaphoreSlim` per instance name forever.
There are also two notable design-document adherence gaps: the
"query-the-site-before-redeploy" idempotency requirement is not implemented
(`GetDeploymentStatusAsync` only reads the local DB), and the "Diff View"
feature is reduced to a bare hash comparison with no added/removed/changed
detail. Configuration is not bound to `appsettings.json`, and one option is
never read at all. Test coverage stops at the communication boundary and never
exercises a successful deployment or the lifecycle success paths.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | Stuck `InProgress` record on unexpected exception; cancelled-token failure write. |
| 2 | Akka.NET conventions | ✓ | Module is a plain service layer; it calls `CommunicationService` which wraps Ask. No actors here. No issues. |
| 3 | Concurrency & thread safety | ✓ | `OperationLockManager` is sound but leaks semaphores; `DeployToAllSitesAsync` correctly builds commands sequentially before parallel send. |
| 4 | Error handling & resilience | ✓ | Several gaps — see DeploymentManager-001/002/003/004. |
| 5 | Security | ✓ | SMTP credentials are serialized and broadcast to sites — see DeploymentManager-013. No injection vectors; no authz here (enforced upstream). |
| 6 | Performance & resource management | ✓ | Semaphore leak (DeploymentManager-005); artifact rebuild does N+1 method queries per external system. |
| 7 | Design-document adherence | ✓ | Missing query-before-redeploy (DeploymentManager-006); Diff View not implemented (DeploymentManager-007). |
| 8 | Code organization & conventions | ✓ | Options class not bound to configuration — DeploymentManager-008. POCO/repo placement correct. |
| 9 | Testing coverage | ✓ | No successful-deploy test, no lifecycle success test — DeploymentManager-011; dead `CreateCommand` helper — DeploymentManager-014. |
| 10 | Documentation & comments | ✓ | Misleading timeout comment — DeploymentManager-009; stale option XML doc — DeploymentManager-012. |
## Findings
### DeploymentManager-001 — Unexpected exceptions leave the deployment record stuck in `InProgress`
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:141-199` |
**Description**
`DeployInstanceAsync` sets the record to `InProgress` (lines 137-139), then the
`try` block calls into `CommunicationService` and the repository. The only
`catch` filter is `when (ex is TimeoutException or OperationCanceledException)`.
Any other exception — `InvalidOperationException` (thrown by
`CommunicationService.GetCommunicationActor()` when the actor is not set), a
JSON serialization error, a deserialization failure of the response, a DB
exception on `UpdateDeploymentRecordAsync`, or any transport error — escapes the
method. The deployment record remains in `DeploymentStatus.InProgress`
permanently. Because staleness and the UI both read current status, the
instance is then misreported as "deploying" forever and a re-deploy may be
blocked or misinterpreted. The design explicitly states an interrupted
deployment must be "treated as failed".
**Recommendation**
Broaden the catch to a general `catch (Exception ex)` that records
`DeploymentStatus.Failed` with the error message, audit-logs the failure, and
re-throws or returns a failed `Result`. Keep the timeout-specific branch only
if a distinct message is desired. Ensure the failure-status write happens for
every exit path out of the `try`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): broadened the `catch` in
`DeployInstanceAsync` to `catch (Exception ex)` so any exception (transport,
serialization, DB, `InvalidOperationException` from an uninitialized
`CommunicationService`) marks the deployment record `Failed` with the error
message and audit-logs the failure, instead of escaping and leaving the record
stuck in `InProgress`. Regression test:
`DeployInstanceAsync_CommunicationThrowsUnexpectedException_RecordMarkedFailed`.
### DeploymentManager-002 — Failure-status write uses a possibly-cancelled cancellation token
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:186-196` |
**Description**
The `catch (Exception ex) when (ex is TimeoutException or
OperationCanceledException)` block updates the record to `Failed` and calls
`UpdateDeploymentRecordAsync`/`SaveChangesAsync`/`LogAsync` passing the same
`cancellationToken` that was just cancelled (an `OperationCanceledException`
caught here means the token is already in the cancelled state). Those
repository and audit calls will themselves throw `OperationCanceledException`
before the failure status is persisted, so the record stays `InProgress` — the
exact bug DeploymentManager-001 describes, reached via the supposedly-handled
path.
**Recommendation**
Perform the cleanup writes with a fresh, non-cancellable token (e.g.
`CancellationToken.None`, optionally with an independent short timeout) so the
failure status is durably recorded even when the original operation was
cancelled or timed out.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): the broadened `catch` block now
performs the failure-status write (`UpdateDeploymentRecordAsync`,
`SaveChangesAsync`) and the audit `LogAsync` with `CancellationToken.None`
instead of the operation's (possibly-cancelled) token, so the `Failed` status
is durably recorded even after a timeout/cancellation. The cleanup writes are
themselves wrapped in a `try`/`catch` that logs (without masking the original
error) if persistence still fails. Regression test:
`DeployInstanceAsync_FailureWrite_UsesNonCancellableToken`.
### DeploymentManager-003 — Successful-deployment cleanup is not atomic with the status write
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:155-170` |
**Description**
After a successful site response the code calls `UpdateDeploymentRecordAsync`
(no `SaveChanges` yet), then `UpdateInstanceAsync`, then
`StoreDeployedSnapshotAsync` (which itself issues `Add`/`Update` calls), then a
single `SaveChangesAsync` at line 170. If `StoreDeployedSnapshotAsync` throws,
the exception is not caught (see DeploymentManager-001) and the
`SaveChangesAsync` never runs — the instance state, deployment status, and
snapshot are all left unpersisted even though the site has actually applied the
deployment. Central and site are now divergent: the site is running the new
config but central still shows the old state and a non-`Success` deployment
record.
**Verification:** Confirmed against source. The DeploymentManager-001 fix made
this strictly worse, not better — after that fix a snapshot-store failure is
caught and the record is flipped from `Success` back to `Failed`, so central
reports a *failed* deployment while the site is running the new config.
**Recommendation**
Wrap the post-success persistence so that, at minimum, the deployment record's
`Success` status is committed. Consider committing the status first, then the
instance state and snapshot, so a later failure does not lose the fact that the
site succeeded. Log loudly if the snapshot write fails after a confirmed site
apply.
**Resolution**
Resolved 2026-05-16 (commit pending): `DeployInstanceAsync` now commits the
deployment record's terminal status (`UpdateDeploymentRecordAsync` +
`SaveChangesAsync`) immediately after the site confirms the apply, *before*
touching instance state or the deployed-config snapshot. The post-success
instance-state update and `StoreDeployedSnapshotAsync` are wrapped in a
best-effort `try`/`catch` that logs loudly for operator reconciliation but no
longer flips the already-committed `Success` record back to `Failed`.
Regression test:
`DeployInstanceAsync_SiteSucceeds_SnapshotWriteFails_RecordStillCommittedSuccess`.
### DeploymentManager-004 — Site-success but central-delete-failure leaves orphaned site config
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:312-319` |
**Description**
In `DeleteInstanceAsync`, when the site responds `Success` the code calls
`_repository.DeleteInstanceAsync` then `SaveChangesAsync`. If `SaveChangesAsync`
throws (DB error, concurrency), the exception propagates uncaught: the site has
already destroyed the Instance Actor and removed its config, but the central
instance record still exists. The instance is now un-deletable through the
normal path (the site no longer has it, so a re-issued delete may fail) and is
permanently orphaned. The design states central must not mark the instance
deleted until the site confirms — but it does not address the inverse failure.
**Verification:** Confirmed against source. `DeleteInstanceAsync` has no
`try`/`catch` around the post-success block, so any exception from
`_repository.DeleteInstanceAsync`/`SaveChangesAsync` escapes uncaught to the caller.
**Recommendation**
Catch persistence failures in the post-success block and surface a distinct
error indicating the site succeeded but the central record could not be
removed, so an operator/retry can reconcile. Consider making the central delete
idempotent and retryable independently of the site command.
**Resolution**
Resolved 2026-05-16 (commit pending): the post-success removal in
`DeleteInstanceAsync` (`_repository.DeleteInstanceAsync` + `SaveChangesAsync`) is now
wrapped in a `try`/`catch`. A persistence failure no longer escapes uncaught —
it is logged, recorded with a `DeleteOrphaned` audit entry, and surfaced as a
distinct `Result` failure stating the site deleted the instance but the central
record is orphaned and must be reconciled. Regression test:
`DeleteInstanceAsync_SiteSucceeds_CentralDeleteFails_ReturnsDistinctFailure`.
### DeploymentManager-005 — `OperationLockManager` leaks a `SemaphoreSlim` per instance name
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/OperationLockManager.cs:15-33` |
**Description**
`AcquireAsync` does `_locks.GetOrAdd(instanceUniqueName, _ => new
SemaphoreSlim(1, 1))` and entries are never removed. Every distinct instance
unique name that is ever deployed/disabled/enabled/deleted permanently adds a
`SemaphoreSlim` (an `IDisposable` holding a kernel wait handle) to the
dictionary. Over the lifetime of a long-running central process — especially
with the bulk "deploy all out-of-date instances" workflow and instances that
are created and deleted over time — this is an unbounded leak of both managed
memory and OS handles. Deleted instances' semaphores are never reclaimed.
**Verification:** Confirmed against source. `_locks` is a `ConcurrentDictionary`
with no removal path anywhere in the type.
**Recommendation**
Either accept the leak explicitly and document the expected bounded cardinality
of instance names, or implement reclamation: e.g. ref-count handles and remove
and `Dispose()` the semaphore when the count reaches zero and the lock is free.
At minimum, remove the semaphore entry when an instance is deleted
(`DeleteInstanceAsync`).
**Resolution**
Resolved 2026-05-16 (commit pending): `OperationLockManager` now ref-counts each
lock entry. A reference is reserved (creating the entry if needed) before the
`SemaphoreSlim.WaitAsync`, so concurrent waiters for the same instance share one
semaphore and the entry survives until every waiter/holder has released. When
the reference count reaches zero — on release, timeout, or cancellation — the
entry is removed from the dictionary and the semaphore is `Dispose()`d, so the
process no longer accumulates one kernel wait handle per distinct instance name.
A `TrackedLockCount` diagnostic property was added to make reclamation testable.
Regression tests: `AcquireAsync_ReleasedLock_RemovesSemaphoreEntry`,
`AcquireAsync_ManyDistinctInstances_DoesNotAccumulateSemaphores`,
`AcquireAsync_ContendedLock_KeepsSemaphoreUntilLastReleaseThenReclaims`.
### DeploymentManager-006 — Query-the-site-before-redeploy idempotency requirement not implemented
| | |
|--|--|
| Severity | High |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:84-200,363-368` |
**Description**
The design ("Deployment Identity & Idempotency") requires: "After a central
failover or timeout, the Deployment Manager queries the site for current
deployment state before allowing a re-deploy. This prevents duplicate
application and out-of-order config changes." The code never does this.
`GetDeploymentStatusAsync` only reads the local `DeploymentRecord` from the DB
(`GetDeploymentByDeploymentIdAsync`) — it does not contact the site.
`DeployInstanceAsync` unconditionally generates a new deployment ID and sends a
new `DeployInstanceCommand` regardless of any prior in-flight or timed-out
deployment. After a timeout where the site actually applied the config, a
re-deploy produces a second deployment with no reconciliation against the
site's current revision hash. Site-side stale-rejection is the only safety
net, and that is not verified here.
**Recommendation**
Add a site query (a new `CommunicationService` pattern returning the site's
currently-applied deployment ID / revision hash) and call it before re-deploy
when a prior record for the instance is in `InProgress`/`Failed` due to
timeout. Reconcile: if the site already has the target revision, mark the prior
record `Success` instead of re-sending. Either implement this or update the
design doc to reflect that reconciliation is delegated entirely to site-side
stale-rejection.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): implemented the cross-module
query-the-site-before-redeploy idempotency feature across Commons, SiteRuntime,
Communication, and DeploymentManager — new `DeploymentStateQueryRequest` /
`DeploymentStateQueryResponse` contracts, a `DeploymentManagerActor` handler
answering from the site's deployed-config store, a
`CommunicationService.QueryDeploymentStateAsync` method routed over the
ClusterClient command/control transport, and reconciliation in
`DeployInstanceAsync` (`TryReconcileWithSiteAsync`) that queries the site only
when a prior record is `InProgress` or `Failed` due to a timeout, marks the
prior record `Success` without re-sending if the site already has the target
revision hash, and falls through to a normal deploy (relying on site-side
stale-rejection) when the query fails. Regression tests:
`RoundTrip_DeploymentStateQueryRequest_Succeeds`,
`RoundTrip_DeploymentStateQueryResponse_Deployed_Succeeds`,
`RoundTrip_DeploymentStateQueryResponse_NotDeployed_NullApplied`,
`DeploymentStateQuery_DeployedInstance_ReturnsAppliedIdentity`,
`DeploymentStateQuery_UnknownInstance_ReturnsNotDeployed`,
`DeploymentStateQuery_ForwardedToDeploymentManager`,
`QueryDeploymentStateAsync_BeforeInitialization_Throws`,
`QueryDeploymentStateAsync_SendsEnvelopeAndReturnsResponse`,
`DeployInstanceAsync_PriorInProgressRecord_SiteHasTargetHash_MarksSuccessWithoutRedeploy`,
`DeployInstanceAsync_PriorInProgressRecord_SiteHasDifferentHash_ProceedsWithDeploy`,
`DeployInstanceAsync_PriorFailedTimeoutRecord_QueriesSite`,
`DeployInstanceAsync_PriorSuccessRecord_SkipsSiteQuery`,
`DeployInstanceAsync_FreshFirstTimeDeploy_SkipsSiteQuery`,
`DeployInstanceAsync_PriorInProgressRecord_QueryFails_FallsThroughToDeploy`.
### DeploymentManager-007 — "Diff View" reduced to a hash comparison with no diff detail
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:334-358,401-406` |
**Description**
The design ("Diff View" and "Dependencies" sections) states the Deployment
Manager can request a diff from the Template Engine showing added/removed
members, changed values, and connection-binding changes.
`GetDeploymentComparisonAsync` and `DeploymentComparisonResult` only compare two
revision hashes and return a boolean `IsStale` plus the two hashes. No
added/removed/changed detail is produced, and the Template Engine's diff
capability is not invoked. The UI cannot render a meaningful diff from this
result.
**Verification:** Confirmed against source. The Template Engine already provides
`DiffService` + `ConfigurationDiff` (structured Added/Removed/Changed entries
for attributes, alarms, and scripts, including data connection binding fields),
and `DiffService` is DI-registered — it was simply never wired into the
Deployment Manager's comparison path.
**Recommendation**
Either implement a real diff (deserialize the stored
`DeployedConfigSnapshot.ConfigurationJson` and the freshly flattened config and
invoke the Template Engine's diff service, surfacing structured
added/removed/changed entries), or revise the design doc to scope the feature
down to staleness detection only.
**Resolution**
Resolved 2026-05-16 (commit pending): `GetDeploymentComparisonAsync` now
deserializes the stored `DeployedConfigSnapshot.ConfigurationJson` and runs the
Template Engine `DiffService` against the freshly flattened current
configuration, attaching the resulting `ConfigurationDiff` (added/removed/changed
attributes, alarms, scripts) to a new optional `Diff` property on
`DeploymentComparisonResult`. `DiffService` is injected into `DeploymentService`.
A snapshot that cannot be deserialized (corrupt / older schema) still yields the
hash-based staleness result with a null diff, logged at warning level.
Regression test: `GetDeploymentComparisonAsync_ProducesStructuredDiff`.
### DeploymentManager-008 — `DeploymentManagerOptions` is never bound to configuration
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Resolved |
| Location | `src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs:7-14` |
**Description**
`AddDeploymentManager` registers the services but never calls
`services.Configure<DeploymentManagerOptions>(configuration.GetSection(...))`.
`IOptions<DeploymentManagerOptions>` therefore always resolves to a
default-constructed instance — the operation-lock and artifact-deployment
timeouts cannot be tuned via `appsettings.json`, contrary to the CLAUDE.md
convention "Per-component configuration via `appsettings.json` sections bound
to options classes (Options pattern)." `Host/Program.cs` binds
`SecurityOptions` and `InboundApiOptions` from configuration sections but has
no equivalent for `DeploymentManagerOptions`.
**Verification:** Confirmed against source. Neither `AddDeploymentManager` nor
`Host/Program.cs` binds `DeploymentManagerOptions`.
**Recommendation**
Add an `IConfiguration` parameter (or a configure callback) to
`AddDeploymentManager` and bind `DeploymentManagerOptions` to a section such as
`ScadaLink:DeploymentManager`, consistent with the other components.
**Resolution**
Resolved 2026-05-16 (commit pending): `AddDeploymentManager()` now calls
`services.AddOptions<DeploymentManagerOptions>()` so `IOptions<DeploymentManagerOptions>`
is always resolvable, and `Host/Program.cs` binds the
`ScadaLink:DeploymentManager` section (exposed as
`ServiceCollectionExtensions.OptionsSection`) via
`services.Configure<DeploymentManagerOptions>(...)` — the same pattern the Host
uses for `SecurityOptions`/`InboundApiOptions`. An earlier attempt added an
`AddDeploymentManager(IConfiguration)` overload; that was reverted because the
project convention (enforced by `Host.Tests.OptionsTests`) forbids component
`Add*` methods from depending on `IConfiguration` — the Host owns
configuration binding. Regression tests:
`AddDeploymentManager_RegistersResolvableOptions_WithDefaults`,
`AddDeploymentManager_OptionsBindToConfigurationSection_AsTheHostWires`,
`OptionsSection_MatchesTheConventionalComponentSectionPath`.
### DeploymentManager-009 — Misleading timeout comment on `DeleteInstanceAsync`
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:288` |
**Description**
The XML doc says "Delete fails if site unreachable (30s timeout via
CommunicationOptions)." The actual delete timeout is whatever
`CommunicationOptions.LifecycleTimeout` is configured to (passed inside
`CommunicationService.DeleteInstanceAsync`); the "30s" figure is hard-coded
into the comment and not derived from any constant in this module. If
`LifecycleTimeout` is reconfigured, the comment becomes wrong. It also wrongly
implies the value lives in this module.
**Recommendation**
Reword to "Delete fails if the site is unreachable within
`CommunicationOptions.LifecycleTimeout`" without quoting a specific number.
**Resolution**
_Unresolved._
### DeploymentManager-010 — `SystemArtifactDeploymentRecord` does not persist the deployment ID
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs:136,194-211` |
**Description**
`DeployToAllSitesAsync` generates a `deploymentId` (line 136) and returns it in
the `ArtifactDeploymentSummary` and audit log, but the persisted
`SystemArtifactDeploymentRecord` has no field for it (the entity only has `Id`,
`ArtifactType`, `DeployedBy`, `DeployedAt`, `PerSiteStatus`). The deployment ID
that appears in the UI summary and audit log cannot be correlated back to the
stored record. Additionally each per-site `DeployArtifactsCommand` carries its
own separate GUID (`BuildDeployArtifactsCommandAsync` line 114), so there are in
fact N+1 unrelated IDs for one logical artifact deployment.
**Recommendation**
Add a `DeploymentId` column to `SystemArtifactDeploymentRecord` and store the
single logical `deploymentId`; reuse that ID (or a derived per-site ID) for the
per-site commands so the audit log, UI summary, and persisted record agree.
**Resolution**
_Unresolved._
### DeploymentManager-011 — Tests never exercise a successful deployment or lifecycle success path
| | |
|--|--|
| Severity | Medium |
| Category | Testing coverage |
| Status | Resolved |
| Location | `tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs:100-151,155-199` |
**Description**
`DeploymentServiceTests` never sets the `CommunicationService` actor, so every
deploy/lifecycle test deliberately stops at the `InvalidOperationException`
thrown by `GetCommunicationActor()` (see lines 118-125, 147). As a result there
is no test covering: a successful deployment (`DeploymentStatus.Success`
response → instance state set to `Enabled`, snapshot stored, audit logged); a
failed-but-handled site response; the `InProgress`-stuck bug
(DeploymentManager-001); successful Disable/Enable/Delete; or the operation
lock actually serializing two concurrent deploys of the same instance. The
critical post-response branch (`DeploymentService.cs:154-184`) and the entire
delete/disable/enable success path are untested. The `AuditLogs` test
(lines 277-289) asserts nothing.
**Verification:** Partially confirmed. By the time this finding was being
resolved, the DeploymentManager-006 fix had already introduced a TestKit-actor
seam (`CreateServiceWithCommActor` + `ReconcileProbeActor`) and successful-deploy
tests. The genuinely-still-missing coverage was: successful Disable/Enable/Delete
paths, per-instance lock serialization during deploy, and the assertionless
`AuditLogs` test — those gaps were addressed.
**Recommendation**
Introduce a seam to inject a fake/substitute communication path (e.g. an
interface over `CommunicationService`, or wire a TestKit actor) so success and
handled-failure paths can be unit tested. Add tests for the stuck-`InProgress`
scenario and for per-instance lock contention during deploy. Make the audit
test assert on `IAuditService.LogAsync`.
**Resolution**
Resolved 2026-05-16 (commit pending): extended the TestKit-actor seam
(`ReconcileProbeActor` now also answers lifecycle commands) and added the
missing coverage — successful Disable/Enable/Delete (state transition + audit
assertions), a successful-deploy audit assertion, and per-instance lock
serialization via a new deferred-reply `SerializationProbeActor` that asserts a
single instance's concurrent deploys never overlap. The assertionless `AuditLogs`
test was replaced with `DeployInstanceAsync_FlatteningFails_DoesNotReachAudit`,
which asserts on `IAuditService.LogAsync`. Regression tests:
`DisableInstanceAsync_SiteSucceeds_SetsDisabledStateAndAudits`,
`EnableInstanceAsync_SiteSucceeds_SetsEnabledStateAndAudits`,
`DeleteInstanceAsync_SiteSucceeds_RemovesRecordAndAudits`,
`DeployInstanceAsync_SiteSucceeds_WritesDeployAuditEntry`,
`DeployInstanceAsync_FlatteningFails_DoesNotReachAudit`,
`DeployInstanceAsync_SameInstance_OperationLockSerializesConcurrentDeploys`.
### DeploymentManager-012 — `LifecycleCommandTimeout` option is dead code
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.DeploymentManager/DeploymentManagerOptions.cs:8-9` |
**Description**
`DeploymentManagerOptions.LifecycleCommandTimeout` is declared with a 30s
default and an XML doc, but it is never read anywhere in the codebase
(lifecycle commands rely on `CommunicationOptions.LifecycleTimeout` inside
`CommunicationService`). The option misleads readers into thinking it controls
disable/enable/delete timeouts, when setting it has no effect.
**Recommendation**
Remove `LifecycleCommandTimeout`, or actually thread it through to the
lifecycle command calls (e.g. by creating a linked CTS with this timeout in
`DisableInstanceAsync`/`EnableInstanceAsync`/`DeleteInstanceAsync`, the way
`ArtifactDeploymentTimeoutPerSite` is used).
**Resolution**
_Unresolved._
### DeploymentManager-013 — SMTP credentials serialized and broadcast to all sites
| | |
|--|--|
| Severity | Low |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.DeploymentManager/ArtifactDeploymentService.cs:108-111` |
**Description**
`BuildDeployArtifactsCommandAsync` maps `smtp.Credentials` directly into
`SmtpConfigurationArtifact` and that command is sent to every site. Distributing
SMTP credentials to sites is consistent with the design (SMTP configuration is
a deployable artifact), but the credentials travel inside a serialized command
across the inter-cluster transport and are stored on each site's SQLite. There
is no indication the value is encrypted at rest on the site or scrubbed from
logs. Worth confirming the transport is TLS-protected and the site stores the
credential securely; at minimum this should be a conscious, documented decision.
**Recommendation**
Confirm inter-cluster transport encryption covers artifact commands, ensure
`Credentials` is never written to logs, and document the at-rest protection of
SMTP credentials on site SQLite. Consider encrypting the credential field
within the artifact payload.
**Resolution**
_Unresolved._
### DeploymentManager-014 — Dead `CreateCommand` helper in artifact tests
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs:86-90` |
**Description**
The private static `CreateCommand()` helper is never referenced by any test in
the file. It is dead code that suggests an intended test (e.g. a successful
multi-site artifact deployment) was never written — coverage of
`DeployToAllSitesAsync` is limited to the no-sites failure case, and
`RetryForSiteAsync` and `BuildDeployArtifactsCommandAsync` have no tests at all.
**Recommendation**
Either remove the unused helper or, preferably, write the missing tests for
`DeployToAllSitesAsync` (per-site success/failure matrix, partial failure) and
`RetryForSiteAsync` using it.
**Resolution**
_Unresolved._
@@ -0,0 +1,650 @@
# Code Review — ExternalSystemGateway
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.ExternalSystemGateway` |
| Design doc | `docs/requirements/Component-ExternalSystemGateway.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 4 |
## Summary
The External System Gateway is a small module (five source files plus options) that
implements the HTTP/REST client (`ExternalSystemClient`), the database access surface
(`DatabaseGateway`), and error classification (`ErrorClassifier`). The structure is
clean and the dual call-mode semantics broadly match the design doc. However, the
review surfaced several substantive problems that prevent the module from behaving as
designed. The most serious is that **no store-and-forward delivery handler is ever
registered** for the `ExternalSystem` or `CachedDbWrite` categories, so cached calls
and cached writes are buffered but can never actually be delivered on retry — a silent
data-loss path. Two further high-impact issues are that the **per-system call timeout
is never applied** to the HTTP client (the design's central error-handling guarantee
is absent), and that **`CachedCall` double-dispatches the HTTP request** because
`StoreAndForwardService.EnqueueAsync` itself re-attempts immediate delivery, breaking
the idempotency expectations. A cluster of medium issues concern resource leaks,
classification gaps (cancellation conflation), and the dropped `StoreAndForwardResult`.
Test coverage is thin — `CachedCall` transient/buffering paths and `DatabaseGateway`
are entirely untested. Themes: incomplete wiring against the S&F engine, and design-doc
requirements (timeout, retry settings) that are declared but not implemented.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | URL building edge cases, dropped S&F result, classification gaps — findings 003, 006, 009. |
| 2 | Akka.NET conventions | ☑ | No actors in this module; `AddExternalSystemGatewayActors` is a no-op. Blocking-I/O isolation is delegated to Site Runtime. No issues found in this module. |
| 3 | Concurrency & thread safety | ☑ | Services are stateless and DI-scoped; `ExternalCallResult.Response` lazy-parse is not thread-safe but instances are single-use. No findings raised. |
| 4 | Error handling & resilience | ☑ | S&F handler never registered, double-dispatch, timeout not applied, cancellation conflation — findings 001, 002, 003, 008. |
| 5 | Security | ☑ | Auth secrets are not logged, but external error bodies are echoed verbatim — finding 007. |
| 6 | Performance & resource management | ☑ | `HttpRequestMessage`/`HttpResponseMessage` and failed `SqlConnection` not disposed; full repository scan per call — findings 005, 010, 011. |
| 7 | Design-document adherence | ☑ | Timeout, retry settings, audit logging gaps — findings 002, 004, 012. |
| 8 | Code organization & conventions | ☑ | Options class correctly owned by module; `MaxConcurrentConnectionsPerSystem` unused — finding 013. |
| 9 | Testing coverage | ☑ | CachedCall buffering and DatabaseGateway untested — finding 014. |
| 10 | Documentation & comments | ☑ | XML docs reference WP numbers; permanent-failure logging requirement unverified — folded into finding 012. |
## Findings
### ExternalSystemGateway-001 — No S&F delivery handler registered; cached calls and writes can never be delivered
| | |
|--|--|
| Severity | Critical |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:109`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:81` |
**Description**
`CachedCallAsync` and `CachedWriteAsync` enqueue messages under
`StoreAndForwardCategory.ExternalSystem` and `StoreAndForwardCategory.CachedDbWrite`.
`StoreAndForwardService.RegisterDeliveryHandler` is the only mechanism that lets the
S&F engine actually deliver a buffered message, and a repository-wide search shows it
is **never called for either category** anywhere in the codebase. Consequences:
1. On a transient failure, `EnqueueAsync` falls through to the "No handler registered
— buffer for later" branch (`StoreAndForwardService.cs:163`) and the message is
persisted.
2. During the retry sweep, `AttemptDeliveryAsync` (`StoreAndForwardService.cs:201`)
logs `"No delivery handler for category {Category}"` and returns without ever
removing or delivering the message.
The result is that every cached external call and cached DB write is silently
buffered forever and never delivered — a data-loss path for the exact "deferred
delivery is acceptable" use cases the design doc calls out (posting production data,
quality reports). The script also receives `WasBuffered: true` / a successful
`CachedWriteAsync` completion, so the failure is completely invisible.
**Recommendation**
Register delivery handlers for `StoreAndForwardCategory.ExternalSystem` and
`StoreAndForwardCategory.CachedDbWrite` during host/site startup. The `ExternalSystem`
handler should deserialize the payload, re-resolve the system/method, and re-invoke
`InvokeHttpAsync`, returning `true`/`false`/throwing per the transient-vs-permanent
contract `EnqueueAsync` expects. The `CachedDbWrite` handler should execute the SQL
against the named connection. Add an integration test that buffers a message and
verifies it is delivered by a retry sweep.
**Resolution**
Resolved 2026-05-16. Delivery handlers for `StoreAndForwardCategory.ExternalSystem` and
`CachedDbWrite` are now registered at site startup in `AkkaHostedService`, after
`StoreAndForwardService.StartAsync()`. Each handler resolves its consumer in a fresh DI
scope and calls a new `DeliverBufferedAsync`: `ExternalSystemClient.DeliverBufferedAsync`
re-resolves the system/method and re-invokes `InvokeHttpAsync`, and
`DatabaseGateway.DeliverBufferedAsync` executes the buffered SQL — each returning `true`
on success, `false` (park) when the target no longer exists or fails permanently, and
throwing on transient failure so the engine retries. `EnqueueAsync` gained an
`attemptImmediateDelivery` parameter; `CachedCallAsync` passes `false` so registering the
handler does not dispatch the request twice (the double-dispatch noted in
`ExternalSystemGateway-003`). Regression tests cover the success, target-removed and
transient-retry paths. Fixed by the commit whose message references
`ExternalSystemGateway-001`.
### ExternalSystemGateway-002 — Per-system call timeout is never applied to HTTP requests
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:130`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` |
**Description**
The design doc states each external system definition specifies a timeout that
"applies to all method calls on that system" and "applies to the HTTP request
round-trip", and `ExternalSystemGatewayOptions.DefaultHttpTimeout` exists as a
fallback. In practice no timeout is ever configured. `ServiceCollectionExtensions`
calls `services.AddHttpClient()` with no per-named-client configuration, and
`InvokeHttpAsync` calls `_httpClientFactory.CreateClient($"ExternalSystem_{system.Name}")`
without setting `client.Timeout` or passing a `CancellationToken` derived from a
timeout. `SendAsync` is therefore subject only to `HttpClient`'s default 100-second
timeout, regardless of the system definition or the configured `DefaultHttpTimeout`.
A slow or hung external system will block the calling Script Execution Actor far
longer than the operator configured, and the design's core error-handling guarantee
(timeout → transient classification) does not hold within the intended window.
There is also no `Timeout` field on `ExternalSystemDefinition` at all, so even a
correct implementation has nowhere to read the per-system value from — the entity is
missing the field the design requires.
**Recommendation**
Add a `Timeout` (TimeSpan) field to `ExternalSystemDefinition` and have
`InvokeHttpAsync` enforce it — either by setting `client.Timeout` via a typed/named
`HttpClient` registration, or by linking a `CancellationTokenSource` with the
per-system (or `DefaultHttpTimeout`) timeout to the supplied `cancellationToken`
before `SendAsync`. Ensure the resulting `TaskCanceledException`/`TimeoutException`
is classified as transient.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `InvokeHttpAsync` now enforces a call
timeout: `ExternalSystemClient` takes an `IOptions<ExternalSystemGatewayOptions>` and
links a `CancellationTokenSource(DefaultHttpTimeout)` with the caller's token before
`SendAsync` and the response-body read, so the design's "timeout applies to the HTTP
request round-trip" guarantee now holds within the configured window (default 30s)
instead of `HttpClient`'s default 100s. A timeout is reclassified as a
`TransientExternalSystemException`; a caller-initiated cancellation is distinguished
from a timeout and propagated as `OperationCanceledException` rather than being
swallowed as transient. Regression tests:
`Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow` and
`Call_CallerCancellation_IsNotMisreportedAsTimeout`.
Note (partial scope): the per-*system* `Timeout` field on `ExternalSystemDefinition`
remains unimplemented — adding it requires a change to `ScadaLink.Commons`, which is
outside this module's edit scope. Until that entity field exists, the configured
`DefaultHttpTimeout` is the effective per-call limit for every system. A follow-up
against the Commons module should add the `Timeout` field and have `InvokeHttpAsync`
prefer it over the default. This is a tracked follow-up, not a regression.
### ExternalSystemGateway-003 — `CachedCall` double-dispatches the HTTP request
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:84-117` |
**Description**
`CachedCallAsync` first calls `InvokeHttpAsync` directly (line 86). On a
`TransientExternalSystemException` it then calls `_storeAndForward.EnqueueAsync(...)`
(line 109). `StoreAndForwardService.EnqueueAsync` is **not** a pure enqueue — it
"Attempts immediate delivery" by invoking the registered delivery handler
(`StoreAndForwardService.cs:128-159`). If a delivery handler for the `ExternalSystem`
category is registered (as finding 001 recommends), the HTTP request will be executed
a **second time** synchronously inside `EnqueueAsync`, immediately after the first
attempt failed. For a transient failure that is actually a slow/overloaded system,
this doubles the load and — critically — if the original request did reach the
external system, the immediate retry produces a duplicate delivery before the script
even returns, worsening the idempotency hazard the design doc explicitly warns about.
**Recommendation**
Decide on one dispatch path. Either (a) have `CachedCall` not pre-invoke
`InvokeHttpAsync` and instead let `EnqueueAsync`'s immediate-delivery attempt be the
single first attempt (requires the handler to exist and to surface permanent vs
transient correctly); or (b) add an enqueue-only entry point to
`StoreAndForwardService` that skips the immediate-delivery attempt, and have
`CachedCall` use it after its own first attempt. Approach (a) is cleaner and removes
the duplicated logic.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Re-triage: this finding was already fixed in
the codebase as a side effect of the `ExternalSystemGateway-001` fix and is no longer
reproducible against the current source. `StoreAndForwardService.EnqueueAsync` gained an
`attemptImmediateDelivery` parameter (recommendation approach (b)), and
`CachedCallAsync` passes `attemptImmediateDelivery: false` after its own first HTTP
attempt — so `EnqueueAsync` buffers the message for the background retry sweep without
re-invoking the registered delivery handler, eliminating the duplicate dispatch. A
dedicated regression test, `CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler`,
was added in this module's test suite: it registers a counting delivery handler, drives
a `CachedCall` whose HTTP attempt fails transiently, and asserts the handler is invoked
zero times during enqueue. The test was verified to fail if `attemptImmediateDelivery`
is flipped back to `true`.
### ExternalSystemGateway-004 — System retry settings are not honoured for cached calls/writes
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:114-115`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:86-87` |
**Description**
`CachedCallAsync` and `CachedWriteAsync` pass the definition's `MaxRetries` /
`RetryDelay` to `EnqueueAsync` only when they are non-default
(`MaxRetries > 0 ? ... : null`, `RetryDelay > TimeSpan.Zero ? ... : null`), otherwise
falling back to the S&F defaults. The site-side repository that supplies these
definitions, `SiteExternalSystemRepository.MapExternalSystem`
(`src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:194`), never
reads `MaxRetries`/`RetryDelay` from SQLite at all — the constructed entities always
have `MaxRetries == 0` and `RetryDelay == TimeSpan.Zero`. As a result, at sites the
per-system retry settings the design doc requires are *always* discarded and the
global S&F defaults are silently used instead. The `> 0` guard in the ESG also makes
a legitimately-configured `MaxRetries` of 0 ("never retry") indistinguishable from
"unset", so an operator cannot express "do not retry".
**Recommendation**
Within this module, drop the `> 0` / `> Zero` guards and pass the definition values
through directly (or use nullable fields on the entity to distinguish "unset"). The
companion fix in `SiteExternalSystemRepository` to actually map the retry columns
should be tracked against the SiteRuntime module.
**Resolution**
Resolved 2026-05-16 (commit pending). `CachedCallAsync` and `CachedWriteAsync` now pass
the definition's `MaxRetries` to `EnqueueAsync` verbatim — the `> 0` guard is dropped, so
a legitimately-configured `MaxRetries` of 0 ("never retry") is honoured instead of being
collapsed to the S&F default. The `RetryDelay > TimeSpan.Zero` guard is deliberately
**kept**: `TimeSpan.Zero` is the entity default for an unconfigured field and a literal
zero-delay retry loop is not a valid configuration, so falling back to the S&F default
interval for an unset delay is correct (only `MaxRetries == 0` is a meaningful operator
choice). Regression test `CachedCall_TransientFailure_ZeroMaxRetriesIsHonouredNotTreatedAsUnset`
buffers a transient failure and asserts the buffered message carries `MaxRetries == 0`
rather than the S&F default; `CachedCall_TransientFailure_BuffersWithSystemRetrySettings`
additionally covers a non-default settings pass-through. The companion fix in
`SiteExternalSystemRepository.MapExternalSystem` to actually read the `MaxRetries` /
`RetryDelay` columns from SQLite remains a tracked follow-up against the SiteRuntime
module (outside this module's edit scope) — until then, sites still supply
`MaxRetries == 0`, which this fix now correctly honours as "never retry".
### ExternalSystemGateway-005 — `HttpRequestMessage` and `HttpResponseMessage` are not disposed
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:133-167` |
**Description**
`InvokeHttpAsync` creates an `HttpRequestMessage` (line 133) and receives an
`HttpResponseMessage` from `SendAsync` (line 155); neither is wrapped in a `using` nor
explicitly disposed. Both are `IDisposable` and own resources (the request's
`StringContent`, the response's content stream). Under the per-invocation call volume
of a busy site this produces avoidable pressure on the finalizer queue and can hold
socket/stream resources longer than necessary. The success path reads the content but
never disposes the response; the error path likewise reads `errorBody` and then throws
without disposing.
**Recommendation**
Wrap the request in `using var request = ...` and the response in
`using var response = ...` (or call `Dispose()` in a `finally`). Ensure disposal still
occurs on the exception paths.
**Resolution**
Resolved 2026-05-16 (commit pending). `InvokeHttpAsync` now declares the request as
`using var request` and wraps all response handling in a `using (response)` block, so
both `IDisposable` instances (and the request's `StringContent` / the response content
stream) are released on the success path **and** on the permanent/transient
exception paths. Regression tests `Call_SuccessfulHttp_DisposesRequestAndResponse` and
`Call_PermanentFailure_StillDisposesRequestAndResponse` use a disposal-tracking
`HttpMessageHandler`/`HttpContent` and assert both the request and the response content
are disposed; both were verified to fail before the `using` wrappers were added.
### ExternalSystemGateway-006 — `BuildUrl` ignores path templates and appends a trailing slash for empty paths
| | |
|--|--|
| Severity | Medium — partially re-triaged: trailing-slash bug fixed; path-templating sub-issue is a design decision (see Resolution) |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:180-196` |
**Description**
`BuildUrl` does `baseUrl.TrimEnd('/') + "/" + path.TrimStart('/')`. When `method.Path`
is empty (a method that targets the base URL itself), this still appends a `/`,
producing `https://host/api/` which some servers treat as a different resource than
`https://host/api`. More importantly, the design doc shows method paths as templates
like `/recipes/{id}`, but `BuildUrl` performs no placeholder substitution — a `{id}`
token is sent literally in the URL and the corresponding parameter is instead appended
as a query-string entry (for GET/DELETE) or placed in the JSON body (POST/PUT). Either
the design's path-template feature is unimplemented, or the doc is stale; in the
current code a method defined as `/recipes/{id}` will never produce a correct URL.
**Recommendation**
Decide whether path templating is in scope. If yes, implement `{name}` substitution
from `parameters` in `BuildUrl` and exclude substituted parameters from the query
string/body. If no, update the component design doc to remove the `/recipes/{id}`
example and state that paths are literal. Also avoid appending a trailing `/` when
`path` is empty.
**Resolution**
Resolved 2026-05-16 (commit pending). The **trailing-slash bug** is fixed: `BuildUrl`
now appends a `/`-joined path segment only when the method's path is non-empty after
trimming, so a method targeting the base URL itself produces `https://host/api` rather
than `https://host/api/`. Regression tests `Call_MethodWithEmptyPath_DoesNotAppendTrailingSlash`
and `Call_MethodWithPath_BuildsExpectedUrl` (asserting on the captured request URI)
cover the empty-path and normal-path cases; the empty-path test was verified to fail
before the fix.
Re-triage of the **path-templating sub-issue** (`{id}` placeholder substitution): this
is a genuine design decision rather than a code bug, and resolving it requires editing
the component design doc. Both the decision and the doc edit are outside this module's
edit scope (`src/`, `tests/`, this file only).
The current code treats method paths as literal strings and routes parameters to the
query string (GET/DELETE) or JSON body (POST/PUT); a method authored as `/recipes/{id}`
sends the `{id}` token verbatim. **Tracked follow-up / surfaced design question:** the
design owner must decide whether path templating is in scope — if yes, implement
`{name}` substitution in `BuildUrl` and exclude substituted params from the
query/body; if no, the `Component-ExternalSystemGateway.md` `/recipes/{id}` example must
be changed to a literal path. The trailing-slash defect (the concrete correctness bug
in this finding) is fully resolved.
### ExternalSystemGateway-007 — External error response bodies are echoed verbatim into script-visible error messages
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:167-177` |
**Description**
On a non-success HTTP response, the full response body is read into `errorBody` and
embedded verbatim into the exception message (`$"HTTP {code} from {name}: {errorBody}"`),
which then flows into `ExternalCallResult.ErrorMessage` and back to the calling script,
and into Site Event Logging. An external system error page can be arbitrarily large
(an HTML stack trace, a multi-megabyte body) and may contain sensitive detail. There
is no size cap, so a hostile or misbehaving endpoint can inflate every error log entry
and error string returned to scripts. There is also no content-type check before
treating the body as text.
**Recommendation**
Truncate `errorBody` to a bounded length (e.g. 12 KB) before embedding it, and
consider logging the full body separately at debug level rather than returning it to
the script. Optionally only include the body when the content type is textual.
**Resolution**
Resolved 2026-05-16 (commit pending). `InvokeHttpAsync` now truncates the external error
response body to `MaxErrorBodyChars` (2048) via a `Truncate` helper before embedding it
into the transient/permanent exception message — so a misbehaving or hostile endpoint
can no longer inflate every script-visible `ErrorMessage` and Site Event Logging entry
with a multi-megabyte body. When truncation occurs the message is suffixed with
`… [truncated, N chars total]` so the original size is still visible. Regression test
`Call_PermanentFailureWithHugeErrorBody_TruncatesErrorMessage` drives a 400 with a
500 KB body and asserts the resulting `ErrorMessage` is bounded (< 4096 chars); it was
verified to fail (500 040-char message) before the cap was added. Content-type
filtering was considered optional in the recommendation and was not implemented — the
size cap alone closes the inflation/disclosure vector.
### ExternalSystemGateway-008 — Cancellation is conflated with transient timeout failure
| | |
|--|--|
| Severity | Medium — re-triaged: root cause already fixed in current source (see Resolution) |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ErrorClassifier.cs:24-30`, `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:157-159` |
**Description**
`ErrorClassifier.IsTransient(Exception)` returns `true` for `TaskCanceledException`
and `OperationCanceledException`. `HttpClient.SendAsync` throws `TaskCanceledException`
both when its internal timeout elapses *and* when the supplied `CancellationToken` is
cancelled (e.g. the Script Execution Actor is stopped, or the actor system is shutting
down). Because `InvokeHttpAsync`'s `catch` filter treats all of these as transient, a
caller-initiated cancellation during a `CachedCall` will be misclassified as a
transient failure and the message will be buffered for retry — work the caller
explicitly asked to abandon. For a `Call`, a shutdown-time cancellation is reported to
the script as a "Transient error" rather than an `OperationCanceledException`.
**Recommendation**
In `InvokeHttpAsync`, check `cancellationToken.IsCancellationRequested` first and
rethrow `OperationCanceledException` (or let it propagate) before applying transient
classification. Only treat a cancellation as a timeout when the supplied token is
*not* the one that was cancelled.
**Resolution**
Resolved 2026-05-16 (commit pending). **Re-triage:** the root cause described — a
caller-initiated cancellation being misclassified as a transient failure — is **no
longer present in the current source** and is not reproducible. `InvokeHttpAsync`
already wraps both `SendAsync` and the response-body `ReadAsStringAsync` in ordered
`catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)`
filters that rethrow the caller's cancellation *before* the
`catch (Exception ex) when (ErrorClassifier.IsTransient(ex))` branch is ever reached
(this was added alongside the `ExternalSystemGateway-002` timeout fix). A caller-cancel
therefore propagates as `OperationCanceledException` and is never buffered; only the
gateway's own timeout token reclassifies as transient.
`ErrorClassifier.IsTransient(Exception)` does still return `true` for
`TaskCanceledException`/`OperationCanceledException`, but that is **correct and
intentional**: a `TaskCanceledException` raised by an HTTP timeout *is* a genuine
transient failure, and the only caller (`InvokeHttpAsync:238`) is unreachable for a
caller-cancellation because the two preceding `when`-filtered catches intercept it
first. The transient-vs-cancel decision is contextual (which token fired) and cannot
be made from the exception type alone, which is exactly why the call site does it.
No source change was required. A regression guard,
`CachedCall_CallerCancellation_IsNotBufferedAsTransient`, was added: it cancels the
caller token mid-`CachedCall` and asserts an `OperationCanceledException` is thrown and
the S&F buffer remains empty (the cancelled work is not retried). The existing
`Call_CallerCancellation_IsNotMisreportedAsTimeout` covers the synchronous `Call` path.
### ExternalSystemGateway-009 — `StoreAndForwardResult` from `EnqueueAsync` is discarded; permanent failures during buffering are swallowed
| | |
|--|--|
| Severity | Medium — re-triaged: root cause subsumed by the ExternalSystemGateway-003 dispatch redesign (see Resolution) |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:109-117` |
**Description**
`CachedCallAsync` assigns the result of `_storeAndForward.EnqueueAsync(...)` to
`sfResult` and then never reads it — it unconditionally returns
`new ExternalCallResult(true, null, null, WasBuffered: true)`. `EnqueueAsync` can
return `Success == false` (a permanent failure encountered during its
immediate-delivery attempt — `StoreAndForwardService.cs:142`) or `Buffered == false`
(delivered immediately). In both cases the ESG still reports the call as buffered and
successful to the script. A permanent failure surfaced by the S&F immediate attempt is
therefore silently lost instead of being returned to the script as the design requires
("On permanent failure (HTTP 4xx), the error is returned synchronously").
**Recommendation**
Inspect `sfResult`: if `Success == false` return an error `ExternalCallResult`; set
`WasBuffered` from `sfResult.Buffered` rather than hard-coding `true`. (This finding is
partly subsumed by the dispatch redesign in finding 003.)
**Resolution**
Resolved 2026-05-16 (commit pending). **Re-triage:** the stated root cause — "a
permanent failure surfaced by `EnqueueAsync`'s immediate-delivery attempt is silently
lost" — **can no longer occur** in the current source, and the dead `sfResult` variable
the finding cites has already been removed. The `ExternalSystemGateway-003` fix changed
`CachedCallAsync` to call `EnqueueAsync` with `attemptImmediateDelivery: false`. With
that flag, `EnqueueAsync` never invokes the registered delivery handler: it skips the
immediate-delivery block entirely (so the `StoreAndForwardResult(false, …, …)`
permanent-failure return at `StoreAndForwardService.cs:147` is unreachable from this
caller) and unconditionally buffers, returning `Accepted: true, WasBuffered: true`
(`StoreAndForwardService.cs:180`). The `ExternalCallResult(true, null, null,
WasBuffered: true)` that `CachedCallAsync` returns is therefore now factually correct
in every reachable case — the message *is* buffered and there is no swallowed permanent
failure. Permanent HTTP 4xx failures are still surfaced synchronously, because
`CachedCallAsync` makes its own first HTTP attempt and catches
`PermanentExternalSystemException` *before* it ever reaches `EnqueueAsync`. No source
change was required beyond the `ExternalSystemGateway-003` redesign that already landed.
Coverage: `CachedCall_TransientFailure_BuffersWithSystemRetrySettings` asserts both
`result.WasBuffered == true` and that the message is genuinely present in the S&F buffer
(depth == 1), confirming the `WasBuffered: true` claim is not a lie; the existing
`CachedCall` permanent-failure path is exercised indirectly by
`Call_Permanent400_ReturnsPermanentError`, whose semantics it shares via `InvokeHttpAsync`.
### ExternalSystemGateway-010 — `GetConnectionAsync` leaks the `SqlConnection` when `OpenAsync` fails
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:48-50` |
**Description**
`GetConnectionAsync` constructs `new SqlConnection(...)` and calls `await
connection.OpenAsync(...)`. If `OpenAsync` throws (unreachable server, bad
credentials, cancellation) the just-created `SqlConnection` instance is never disposed
— the exception propagates and the local reference is lost. While an unopened
`SqlConnection` is lightweight, over many failing calls this is an avoidable leak. The
design doc says `Database.Connection()` failures return an error to the script; the
current code lets a raw `SqlException` escape, which is acceptable, but the leak is
not.
**Recommendation**
Wrap the open in a try/catch that disposes the connection before rethrowing:
`try { await connection.OpenAsync(ct); } catch { connection.Dispose(); throw; }`.
**Resolution**
Resolved 2026-05-16 (commit pending). `GetConnectionAsync` now wraps `OpenAsync` in a
`try/catch` that calls `await connection.DisposeAsync()` before rethrowing, so a failed
open (unreachable server, bad credentials, cancellation) no longer leaks the
`SqlConnection`. Connection creation was extracted into an `internal virtual
CreateConnection(string)` factory so the failure path is unit-testable. Regression test
`GetConnection_OpenFails_DisposesConnectionBeforeRethrowing` substitutes a `DbConnection`
whose `OpenAsync` always throws and asserts the connection is disposed when the
exception propagates; it was verified to fail before the `try/catch` was added.
### ExternalSystemGateway-011 — Every call performs a full repository scan of all systems and methods
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:231-245`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:90-97` |
**Description**
`ResolveSystemAndMethodAsync` calls `GetAllExternalSystemsAsync()` and then
`GetMethodsByExternalSystemIdAsync()` and filters in memory on every single call;
`ResolveConnectionAsync` calls `GetAllDatabaseConnectionsAsync()` and filters in memory
on every cached write / connection request. At sites this hits the SQLite repository,
and `SiteExternalSystemRepository` re-reads and re-parses the methods JSON each time.
For a hot script path this is unnecessary repeated I/O and allocation. Definitions only
change on deployment, so they are eminently cacheable.
**Recommendation**
Add an in-memory cache of system/method/connection definitions keyed by name,
invalidated on artifact deployment. Alternatively use a name-keyed repository lookup
rather than fetch-all-then-filter.
**Resolution**
_Unresolved._
### ExternalSystemGateway-012 — Permanent-failure logging requirement is not met; `_logger` is injected but unused
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:24,169-177`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:22` |
**Description**
The design doc states permanent failures are "Logged to Site Event Logging", but
`InvokeHttpAsync` performs no logging on the permanent-failure path. In fact the
injected `ILogger<ExternalSystemClient>` and `ILogger<DatabaseGateway>` fields are
never used at all in either class. Either the logging is expected to happen in the
caller (Script Execution Actor) — in which case the design doc is imprecise about
where — or it is missing. Separately, `IsTransient(HttpStatusCode)` treats any
non-success, non-(5xx/408/429) status as permanent without an explicit comment, which
is a reasonable default but undocumented.
**Recommendation**
Add a `_logger.LogWarning` on the permanent-failure path (and a debug log on
transient), or clarify in the design doc that Site Event Logging capture is the
caller's responsibility and remove the unused `_logger` fields. Add a comment in
`ErrorClassifier` documenting the "default to permanent" behaviour.
**Resolution**
_Unresolved._
### ExternalSystemGateway-013 — `MaxConcurrentConnectionsPerSystem` and `DefaultHttpTimeout` options are defined but never used
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemGatewayOptions.cs:9,12`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` |
**Description**
`ExternalSystemGatewayOptions.MaxConcurrentConnectionsPerSystem` (default 10) and
`DefaultHttpTimeout` (default 30s) are bound from configuration but neither is read
anywhere. `AddHttpClient()` registers the default factory with no
`ConfigurePrimaryHttpMessageHandler`/`SocketsHttpHandler` `MaxConnectionsPerServer` and
no `Timeout`, so both options have no effect. An operator setting these values gets
them silently ignored — a misleading configuration surface (`DefaultHttpTimeout` is
also referenced by finding 002).
**Recommendation**
Either wire the options into a named/typed `HttpClient` registration (set
`MaxConnectionsPerServer` on the primary handler, set `Timeout`), or remove the unused
options to avoid implying behaviour that does not exist.
**Resolution**
_Unresolved._
### ExternalSystemGateway-014 — Cached-call buffering path and `DatabaseGateway` are untested
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs:1`, (no `DatabaseGatewayTests.cs`) |
**Description**
`ExternalSystemClientTests` covers system/method not-found, success, transient 500 and
permanent 400 for `CallAsync`, plus `CachedCall` not-found and success. It does **not**
cover: the `CachedCall` transient-failure → S&F buffering branch (the most
behaviour-rich path, including the `_storeAndForward == null` fallback and `WasBuffered`
semantics), the `CachedCall` permanent-failure branch, connection-exception
classification (`HttpRequestException` thrown by the handler), `BuildUrl` query-string
construction, and `ApplyAuth` for the apikey/basic variants. There is **no test file
for `DatabaseGateway`** at all — `GetConnectionAsync` not-found, `CachedWriteAsync`
not-found, and the `_storeAndForward == null` guard are entirely uncovered. The
`MockHttpMessageHandler` also does not assert request URL/headers/body, so auth and
URL construction are unverified.
**Recommendation**
Add tests for the `CachedCall` transient/buffering paths (with a substituted S&F
service), `DatabaseGateway` not-found and null-S&F guards, and `BuildUrl`/`ApplyAuth`
by asserting on the captured `HttpRequestMessage` in the mock handler.
**Resolution**
_Unresolved._
+509
View File
@@ -0,0 +1,509 @@
# Code Review — HealthMonitoring
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.HealthMonitoring` |
| Design doc | `docs/requirements/Component-HealthMonitoring.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 5 |
## Summary
The HealthMonitoring module is small, readable, and broadly faithful to the design
intent: per-interval error counters with atomic read-and-reset, monotonic sequence
numbers with Unix-ms seeding to survive failover, sequence-guarded staleness
rejection, and a 60s offline timeout. However, the review surfaced two recurring
themes. First, **a documented metric is silently unimplemented** — store-and-forward
buffer depths are never populated (`SetStoreAndForwardDepths` has zero callers and a
test asserts the field is always empty), so the dashboard cannot show the buffer
depth metric the design doc requires. Second, **the central aggregator's in-memory
state model has unguarded shared mutable state**: `SiteHealthState` is a mutable
class whose fields are written by a background timer thread, by `ProcessReport`, and
by `MarkHeartbeat` with no synchronization, and the same live mutable objects are
handed straight to UI callers via `GetAllSiteStates`. The `ProcessReport` logic also
mutates shared state inside a `ConcurrentDictionary.AddOrUpdate` update delegate,
which the runtime may invoke more than once under contention. Additionally there are
gaps around central self-report offline detection, heartbeats for not-yet-registered
sites being dropped, and missing test coverage for the central report loop,
heartbeat path, and most collector setters. None of the findings are crash-class,
but the concurrency issues are Medium/High and the missing S&F metric is a real
design-adherence gap.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | x | `MarkHeartbeat` drops heartbeats for unregistered sites (HealthMonitoring-007); central self-report has no heartbeat grace (HealthMonitoring-005). |
| 2 | Akka.NET conventions | x | Module itself contains no actors (transport abstracted via `IHealthReportTransport`); `AddHealthMonitoringActors` is a dead placeholder (HealthMonitoring-011). Actor-side wiring lives in Communication and is out of scope. |
| 3 | Concurrency & thread safety | x | Unguarded mutable `SiteHealthState` (HealthMonitoring-002); mutation inside `AddOrUpdate` delegate (HealthMonitoring-003); `GetAllSiteStates` leaks live mutable references (HealthMonitoring-008). Collector counters correctly use `Interlocked`. |
| 4 | Error handling & resilience | x | `HealthReportSender` silently swallows inner failures with bare `catch {}` (HealthMonitoring-010); top-level loop error handling is sound. |
| 5 | Security | x | No issues found. Module handles only numeric/string operational metrics, no secrets, no external input parsing, no auth surface. |
| 6 | Performance & resource management | x | `PeriodicTimer` instances correctly disposed via `using`. Dictionary snapshots per report are acceptable at the documented scale. No issues found. |
| 7 | Design-document adherence | x | Store-and-forward buffer depth metric unimplemented (HealthMonitoring-001); sequence seeding deviates from doc's "starting at 1" wording (HealthMonitoring-006). |
| 8 | Code organization & conventions | x | Options class correctly owned by the component; POCO/messages in Commons. Dead placeholder method noted (HealthMonitoring-011). |
| 9 | Testing coverage | x | No tests for `CentralHealthReportLoop`, `MarkHeartbeat`, offline-via-heartbeat, replica idempotency, or most collector setters (HealthMonitoring-009). |
| 10 | Documentation & comments | x | Heartbeat interval is described inconsistently (~2s vs ~5s) across XML docs (HealthMonitoring-004); `LatestReport = null!` misrepresents the contract (HealthMonitoring-012). |
## Findings
### HealthMonitoring-001 — Store-and-forward buffer depth metric is never populated
| | |
|--|--|
| Severity | High |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs:104`, `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:79` |
**Description**
`Component-HealthMonitoring.md` lists "Store-and-forward buffer depth" (pending
messages by category) as a required monitored metric. `SiteHealthCollector` exposes
`SetStoreAndForwardDepths(...)` to receive it, but a codebase-wide search shows the
method has **no callers** — `_sfBufferDepths` always remains the empty dictionary it
is initialized to. `HealthReportSender` queries `GetParkedMessageCountAsync()` and
sets `ParkedMessageCount`, but parked count is a distinct metric from per-category
buffer depth. The test `SiteHealthCollectorTests.StoreAndForwardBufferDepths_IsEmptyPlaceholder`
even codifies the unimplemented state as expected behaviour. The result is that the
central dashboard cannot display buffer depth, a documented triage metric.
**Recommendation**
Wire `SetStoreAndForwardDepths` into `HealthReportSender.ExecuteAsync` (alongside the
existing parked-count call) using the S&F engine's per-category depth API, or, if the
metric is intentionally deferred, record that decision in the design doc and remove
the dead setter. Update the placeholder test accordingly once implemented.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `HealthReportSender.ExecuteAsync` now
queries the existing public `StoreAndForwardStorage.GetBufferDepthByCategoryAsync()`
API alongside the parked-count call and feeds the per-category depths into
`SiteHealthCollector.SetStoreAndForwardDepths` (category enum names as keys), so the
documented store-and-forward buffer depth metric is populated in every emitted
report. Regression test `HealthReportSenderTests.ReportsIncludeStoreAndForwardBufferDepthsFromStorage`
verifies populated per-category depths. The obsolete placeholder test
`SiteHealthCollectorTests.StoreAndForwardBufferDepths_IsEmptyPlaceholder` continues
to pass — it only exercises the collector with no setter call and still correctly
asserts the empty default; it was left in place as the collector-level default-state
test. No StoreAndForward source was modified (existing public API only).
### HealthMonitoring-002 — `SiteHealthState` mutable fields written from multiple threads without synchronization
| | |
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthState.cs:11`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:86`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:137` |
**Description**
`SiteHealthState` is a plain mutable class. Its fields (`LatestReport`,
`LastReportReceivedAt`, `LastHeartbeatAt`, `LastSequenceNumber`, `IsOnline`) are
mutated from at least three concurrent contexts: `ProcessReport` (caller thread —
ClusterClient/PubSub message handlers), `MarkHeartbeat` (caller thread — heartbeat
handler), and `CheckForOfflineSites` (the `BackgroundService` timer thread). The
`ConcurrentDictionary` only protects the dictionary structure, not the objects it
stores. A heartbeat update and the offline-check can interleave on the same
`SiteHealthState` instance, and reads/writes of `DateTimeOffset` (a 16-byte struct)
and `long` fields are not guaranteed atomic on all platforms — producing torn reads
and lost updates of `IsOnline`/`LastHeartbeatAt`.
**Recommendation**
Make state transitions atomic: either guard all reads/writes of a `SiteHealthState`
with a per-site lock, or replace `SiteHealthState` with an immutable record updated
via `ConcurrentDictionary` compare-and-swap (`TryUpdate`) so every transition is
a single atomic reference swap.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `SiteHealthState` is now a `sealed record`
with `init`-only properties. `CentralHealthAggregator.ProcessReport`,
`MarkHeartbeat`, and `CheckForOfflineSites` were rewritten to perform every state
transition as an atomic compare-and-swap (`TryAdd`/`TryUpdate`) producing a new
record instance — no field of a stored state is ever mutated in place. `ProcessReport`
uses an explicit CAS retry loop instead of the `AddOrUpdate` update delegate so the
sequence-number guard and the field writes are evaluated against the value actually
installed (this also closes the root cause behind HealthMonitoring-003). Reads via
`GetAllSiteStates`/`GetSiteState` now hand out immutable snapshots, so a concurrent
reader can never observe a torn or half-applied state. `LatestReport` was changed
from `SiteHealthReport` (`null!`) to `SiteHealthReport?`, making the contract honest;
all existing consumers (CentralUI, integration/perf tests) already null-checked it
and continue to build clean. Regression test
`CentralHealthAggregatorTests.ProcessReport_ConcurrentUpdates_NeverLoseSequenceOrTearState`
exercises concurrent report/heartbeat/read threads and asserts snapshot consistency
and no lost updates.
### HealthMonitoring-003 — Shared state mutated inside `ConcurrentDictionary.AddOrUpdate` update delegate
| | |
|--|--|
| Severity | Medium — re-triaged: already resolved as a side-effect of HealthMonitoring-002. |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:45-103` |
**Description**
The update delegate passed to `AddOrUpdate` mutates the `existing` object in place
(`existing.LatestReport = report; existing.IsOnline = true; ...`). `AddOrUpdate`'s
contract explicitly allows the update delegate to be invoked **more than once** under
contention (when the CAS that installs the result loses a race and is retried). Each
invocation mutates the shared object, so a concurrent report for the same site can
observe a half-applied update, and the multi-field assignment is not atomic with
respect to readers in `GetAllSiteStates`/`CheckForOfflineSites`. The intended
"only replace if sequence is higher" guard can also be subverted because the
sequence comparison and the field writes are not a single atomic step.
**Recommendation**
Have the update delegate return a **new** `SiteHealthState` (record `with` copy)
rather than mutating `existing`, and treat the dictionary value as immutable.
Combined with HealthMonitoring-002, this makes every state transition an atomic
reference swap with no observable intermediate state.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Re-triaged: verified against the current
source — the root cause was already eliminated by the HealthMonitoring-002 fix.
`ProcessReport` no longer uses `AddOrUpdate` at all; it is now an explicit
compare-and-swap retry loop (`TryGetValue` → guard → `TryAdd`/`TryUpdate`) that
produces a brand-new immutable `SiteHealthState` record per transition and never
mutates a stored value in place. The sequence-number guard and the field writes are
evaluated against the value actually installed by the CAS, so the "only replace if
sequence is higher" invariant holds. The concurrency stress test
`CentralHealthAggregatorTests.ProcessReport_ConcurrentUpdates_NeverLoseSequenceOrTearState`
(added under HealthMonitoring-002) already exercises this path and asserts no lost
updates and no torn snapshots. No further code change was required for this finding.
### HealthMonitoring-004 — Inconsistent heartbeat interval described across XML docs
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:146-148`, `src/ScadaLink.HealthMonitoring/SiteHealthState.cs:21`, `src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs:16` |
**Description**
The heartbeat cadence that offline detection relies on is documented inconsistently.
`CheckForOfflineSites` says "heartbeats arrive every ~5s"; `SiteHealthState.LastHeartbeatAt`
says "~5s heartbeat"; but `ICentralHealthAggregator.MarkHeartbeat` says "~2s
heartbeats are arriving". The actual cadence is set elsewhere (Cluster Infrastructure /
`SiteCommunicationActor`). Readers cannot reason about whether a 60s offline timeout
gives the intended grace without a single authoritative number.
**Recommendation**
Pick the correct interval (verify against the heartbeat scheduler in
`SiteCommunicationActor`/Cluster Infrastructure) and use it consistently in all three
comments, ideally referencing the owning component rather than restating a magic number.
**Resolution**
_Unresolved._
### HealthMonitoring-005 — Central self-report site can flap offline; no heartbeat grace like real sites
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:48-81`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:149` |
**Description**
`CheckForOfflineSites` decides offline status purely from `LastHeartbeatAt`, and for
real sites that field is kept fresh by frequent (~2-5s) heartbeats so the 60s timeout
only fires on genuine total loss. The synthetic `central` site, however, has no
heartbeat source — `LastHeartbeatAt` is only bumped by `ProcessReport` from the
30s `CentralHealthReportLoop`. The loop also only runs on the cluster leader and
silently skips a cycle on any exception. Consequently, a single skipped/late central
self-report (leader GC pause, brief stall, mid-failover before the new leader's loop
spins up) leaves `central` with no signal for >60s and it is marked offline even
though the central cluster is healthy. The central card thus has no equivalent of
the "one missed report grace" the design doc grants real sites.
**Recommendation**
Either feed `central` a heartbeat equivalent (e.g. have `MarkHeartbeat` called for
`CentralSiteId` on a fast timer independent of the leader-only report loop), or apply
a longer/distinct offline timeout to the `central` keyspace entry, and ensure the new
leader starts the report loop promptly on failover.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Applied the distinct-timeout option. A new
`HealthMonitoringOptions.CentralOfflineTimeout` (default 3x the report interval =
3 minutes) is applied by `CentralHealthAggregator.CheckForOfflineSites` to the
`central` keyspace entry only — real sites keep the existing `OfflineTimeout`. This
gives the synthetic `central` site (which has no heartbeat source and is fed solely
by the 30s leader-only `CentralHealthReportLoop`) enough grace to survive a single
skipped or late self-report — the equivalent of the "one missed report" grace the
design doc grants real sites — while still going offline on genuine total loss.
Regression tests `CentralHealthAggregatorTests.OfflineDetection_CentralSite_HasLongerGraceThanRealSites`
(central survives 75s of silence while a real site goes offline) and
`OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss` (central still detected
offline after 10 minutes) verify the behaviour.
### HealthMonitoring-006 — Sequence seeding contradicts the doc's "starting at 1" wording and is untestable
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:28`, `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:32` |
**Description**
The `HealthReportSender` class XML summary states "Sequence numbers are monotonic,
starting at 1, and reset on service restart." The implementation instead seeds
`_sequenceNumber` with `DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()` so the first
emitted sequence is a large epoch value, specifically to keep ordering correct across
failover. The summary is therefore stale and contradicts the code. Separately, the
seed reads `DateTimeOffset.UtcNow` directly at field initialization rather than
through an injected `TimeProvider` (which `CentralHealthAggregator` already uses),
making the seeding logic impossible to unit-test deterministically and dependent on
node wall-clock agreement — if one node's clock lags, its post-failover reports can
be silently rejected as stale by the aggregator.
**Recommendation**
Fix the `HealthReportSender` XML summary to describe the actual Unix-ms seeding
strategy, and inject `TimeProvider` for the seed so the behaviour is testable and the
clock dependency is explicit.
**Resolution**
_Unresolved._
### HealthMonitoring-007 — Heartbeats for not-yet-registered sites are silently dropped
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:86-99` |
**Description**
`MarkHeartbeat` returns immediately if the site is not already in `_siteStates`
("registration only happens on report"). Central health state is in-memory only and
not persisted. After a central restart or failover the aggregator starts empty, so
for up to one full report interval (default 30s) every site emits only heartbeats
that are all discarded — the site is reported as *unknown* (absent from
`GetAllSiteStates`) rather than *online*, even though heartbeats prove it is
reachable. This is a visible dashboard regression precisely during the failover
window, which is when operators most need accurate status.
**Recommendation**
Allow `MarkHeartbeat` to register a minimal `SiteHealthState` (online, no
`LatestReport` yet, with a UI-visible "awaiting first report" indication) when a
heartbeat arrives for an unknown site, so reachable sites show online immediately
after a central restart.
**Resolution**
Resolved 2026-05-16 (commit `pending`). `CentralHealthAggregator.MarkHeartbeat` no
longer returns early for an unknown site. When a heartbeat arrives for a site with no
aggregator state, it now atomically registers (`TryAdd`, with CAS-loss retry) a
minimal `SiteHealthState` that is `IsOnline = true`, `LatestReport = null`,
`LastSequenceNumber = 0` and `LastHeartbeatAt = receivedAt` — an "online, awaiting
first report" state. This relies on the HealthMonitoring-002 change that made
`LatestReport` properly nullable, so UI consumers already handle the null case.
Reachable sites therefore show online immediately after a central restart/failover
instead of being absent ("unknown") for up to a full report interval. The
`ICentralHealthAggregator.MarkHeartbeat` XML doc was corrected to describe the new
behaviour. Regression test
`CentralHealthAggregatorTests.MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport`
verifies the registration; `MarkHeartbeat_KeepsSiteOnline_BetweenReports` and
`MarkHeartbeat_BringsOfflineSiteBackOnline` cover the already-registered paths.
### HealthMonitoring-008 — `GetAllSiteStates` / `GetSiteState` leak live mutable state objects to callers
| | |
|--|--|
| Severity | Medium — re-triaged: already resolved as a side-effect of HealthMonitoring-002. |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:146-158` |
**Description**
`GetAllSiteStates` copies the dictionary but the copy still holds references to the
same live mutable `SiteHealthState` instances; `GetSiteState` returns the live
instance directly. UI consumers (Blazor Server / SignalR circuits) read these objects
on their own threads while the aggregator's background timer and report handlers
concurrently mutate the very same instances (see HealthMonitoring-002). A UI render
can observe a `SiteHealthState` with, e.g., `IsOnline == true` but a `LatestReport`
from a different update, or a torn `DateTimeOffset`. Callers could also mutate the
shared state, corrupting aggregator state.
**Recommendation**
Return immutable snapshots: convert `SiteHealthState` to a record (per
HealthMonitoring-002/003) so handing out the reference is safe, or deep-copy each
state into an immutable DTO before returning.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Re-triaged: verified against the current
source — the root cause was already eliminated by the HealthMonitoring-002 fix.
`SiteHealthState` is now a `sealed record` with `init`-only properties (fully
immutable). Every aggregator transition installs a brand-new record instance via an
atomic compare-and-swap, so the references `GetAllSiteStates` and `GetSiteState` hand
out are immutable snapshots — a UI consumer reading one on its own thread can never
observe a torn or half-applied state, and cannot mutate aggregator state through the
returned reference. The recommended fix (make `SiteHealthState` a record) is exactly
what the HealthMonitoring-002 change did, so no further code change was required.
### HealthMonitoring-009 — Missing test coverage for central report loop, heartbeat path, replication, and collector setters
| | |
|--|--|
| Severity | Medium |
| Category | Testing coverage |
| Status | Resolved |
| Location | `tests/ScadaLink.HealthMonitoring.Tests/` |
**Description**
Several behaviours have no automated coverage:
- `CentralHealthReportLoop` — leader-only gating (`SelfIsPrimary`), self-report
generation, sequence assignment: no test file at all.
- `CentralHealthAggregator.MarkHeartbeat` — keeping a site online between reports,
online recovery via heartbeat, and the unknown-site drop behaviour
(HealthMonitoring-007): untested.
- Offline detection driven by `LastHeartbeatAt` vs `LastReportReceivedAt` — the
existing offline tests only advance time after a report, never exercising the
heartbeat-keeps-alive path the design depends on.
- `SiteHealthCollector` — `SetClusterNodes`, `SetInstanceCounts`, `SetParkedMessageCount`,
`SetNodeHostname`, `SetActiveNode`/`NodeRole`, `UpdateTagQuality`,
`UpdateConnectionEndpoint`: not reflected-in-report tested.
- `SiteHealthReportReplica` idempotency under double delivery: untested.
**Recommendation**
Add tests for the central report loop (with a fake `IClusterNodeProvider`), the
heartbeat-keeps-online and unknown-site heartbeat paths, and the remaining collector
setters' presence in `CollectReport` output.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Added the missing coverage:
- **`CentralHealthReportLoopTests`** (new file) — `GeneratesCentralReports_WhenSelfIsPrimary`,
`GeneratesNoReports_WhenNotPrimary` (leader-only `SelfIsPrimary` gating with a fake
`IClusterNodeProvider`), `AssignsMonotonicSequenceNumbers`, and
`SetsActiveNodeFlag_EvenWhenNotPrimary`.
- **`CentralHealthAggregatorTests`** — `MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport`,
`MarkHeartbeat_KeepsSiteOnline_BetweenReports` (heartbeat keeps a site online past
the offline timeout — the path the design depends on), and
`MarkHeartbeat_BringsOfflineSiteBackOnline`.
- **`SiteHealthCollectorTests`** — reflected-in-report tests for `SetClusterNodes`,
`SetInstanceCounts`, `SetParkedMessageCount`, `SetNodeHostname`,
`SetActiveNode`/`NodeRole`, `UpdateTagQuality`, `UpdateConnectionEndpoint`, and
`SetStoreAndForwardDepths`.
The `SiteHealthReportReplica` idempotency item is **out of scope** for this module:
`SiteHealthReportReplica` is declared in `ScadaLink.Commons` and published/consumed by
`CentralCommunicationActor` in the `ScadaLink.Communication` module — the
HealthMonitoring module itself has no replication code. Replica double-delivery
idempotency is already covered by `ProcessReport`'s sequence-number guard
(`ProcessReport_RejectsEqualSequence`, `ProcessReport_RejectsStaleReport_WhenSequenceNotGreater`);
testing the actor-side double-publish belongs in the Communication module's review.
The HealthMonitoring test suite now stands at 47 passing tests (was 30).
### HealthMonitoring-010 — `HealthReportSender` silently swallows inner failures with bare `catch {}`
| | |
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.HealthMonitoring/HealthReportSender.cs:70-87` |
**Description**
The cluster-nodes update and parked-message-count query are each wrapped in
`try { ... } catch { /* Non-fatal */ }` with no logging. A persistent failure (e.g.
the S&F SQLite store is permanently broken, or `GetClusterNodes()` always throws)
is then completely invisible — every report silently ships with stale cluster nodes
and a parked count of 0, with nothing in the logs to explain the wrong dashboard
values. Bare `catch` with no exception variable also catches `OperationCanceledException`
and would mask shutdown signalling if the awaited call observed the token.
**Recommendation**
Catch a specific exception type (or at least `Exception ex`) and `LogWarning`/`LogDebug`
the failure so persistent degradation is diagnosable; avoid swallowing
`OperationCanceledException`.
**Resolution**
_Unresolved._
### HealthMonitoring-011 — `AddHealthMonitoringActors` is a dead no-op placeholder
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.HealthMonitoring/ServiceCollectionExtensions.cs:42-46` |
**Description**
`AddHealthMonitoringActors` does nothing but `return services` with a "Placeholder for
Phase 4+" comment. A public extension method that silently no-ops is a trap: a caller
who registers it will believe actor wiring is in place. No caller currently invokes it.
**Recommendation**
Remove the method until it has real behaviour, or throw `NotImplementedException` so
accidental use fails loudly. If the actor model for this component is genuinely
planned, track it in the design doc instead of a half-method.
**Resolution**
_Unresolved._
### HealthMonitoring-012 — `SiteHealthState.LatestReport` initialized to `null!`, misrepresenting the contract
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.HealthMonitoring/SiteHealthState.cs:11` |
**Description**
`LatestReport` is declared `SiteHealthReport LatestReport { get; set; } = null!;`,
suppressing nullability. Today every code path that creates a `SiteHealthState` (only
`ProcessReport`) assigns `LatestReport`, so it is never actually null — but the
`null!` declaration tells readers and the compiler the opposite of the real
invariant. If HealthMonitoring-007 is addressed by registering state from a heartbeat
(no report yet), this becomes a live `NullReferenceException` risk for UI code that
dereferences `LatestReport`.
**Recommendation**
Either make `LatestReport` `required` (matching how it is genuinely always set today)
or make it properly nullable `SiteHealthReport?` and have consumers handle the
"registered, no report yet" case explicitly — consistent with whatever is decided
for HealthMonitoring-007.
**Resolution**
_Unresolved._
+459
View File
@@ -0,0 +1,459 @@
# Code Review — Host
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.Host` |
| Design doc | `docs/requirements/Component-Host.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 8 |
## Summary
The Host module is the composition root for the entire ScadaLink system: a single
binary whose behaviour (`Central` vs `Site`) is driven entirely by configuration. The
implementation is generally faithful to `Component-Host.md` — startup validation,
role-based registration, Serilog enrichment, Windows Service support, dead-letter
monitoring, CoordinatedShutdown, and gRPC hosting on site nodes are all present and
backed by a solid test suite (`tests/ScadaLink.Host.Tests`).
The most significant problem is the readiness endpoint: `/health/ready` runs **all**
registered health checks, including the leader-only `active-node` check, so a fully
operational *standby* central node permanently reports `503` on `/health/ready` —
directly contradicting REQ-HOST-4a, which defines readiness as cluster membership +
DB connectivity (not leadership). Several other findings concern configuration that
is validated-but-never-consumed (`MachineDataDb`), design-doc drift (Akka.Persistence
is required by REQ-HOST-6 but the system uses no persistent actors), an incorrect
seed-node entry in the shipped site config, blocking sync-over-async during startup,
and unguarded string interpolation when building HOCON. None are crash/data-loss
class, but the readiness bug is High because it breaks load-balancer behaviour with
no safe workaround.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | `/health/ready` includes the leader-only check (Host-001); site seed-node config points at the gRPC port (Host-004). |
| 2 | Akka.NET conventions | ☑ | CoordinatedShutdown, receptionist registration, singleton scoping all correct. HOCON built by raw string interpolation (Host-006); `StartAsync` returns before actors are confirmed running (Host-009). |
| 3 | Concurrency & thread safety | ☑ | Blocking `GetAwaiter().GetResult()` on a hosted-service startup thread (Host-005). `DeadLetterMonitorActor` state is actor-confined — no issues. |
| 4 | Error handling & resilience | ☑ | Top-level try/catch logs fatal and rethrows. No retry around DB migration / readiness preconditions (Host-010). |
| 5 | Security | ☑ | Plaintext DB password, LDAP service-account password and dev JWT key checked into `appsettings.Central.json` (Host-003). |
| 6 | Performance & resource management | ☑ | No undisposed resources. Inbound API script compilation is a synchronous startup loop — acceptable. |
| 7 | Design-document adherence | ☑ | REQ-HOST-6 mandates Akka.Persistence config but none exists and no persistent actors exist — doc is stale (Host-002). REQ-HOST-4 GrpcPort-≠-RemotingPort rule not enforced (Host-007). |
| 8 | Code organization & conventions | ☑ | `MachineDataDb` validated/declared but never consumed (Host-008). `LoggingOptions.MinimumLevel` is dead (Host-011). |
| 9 | Testing coverage | ☑ | Strong suite; no test asserts `/health/ready` excludes `active-node`, which is why Host-001 slipped through (noted in Host-001). |
| 10 | Documentation & comments | ☑ | Comments are accurate. REQ-HOST-6 in the design doc is the main stale-doc item (Host-002). |
## Findings
### Host-001 — `/health/ready` includes the leader-only `active-node` check
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.Host/Program.cs:135-145` |
**Description**
`/health/ready` is mapped with `MapHealthChecks("/health/ready", ...)` and **no
`Predicate`**, so it executes every registered check: `database`, `akka-cluster`
*and* `active-node`. `ActiveNodeHealthCheck` (`Health/ActiveNodeHealthCheck.cs:38`)
returns `Unhealthy` on any node that is not the cluster leader. As a result a
standby central node that is fully operational (cluster member `Up`, database
reachable) still returns `503` on `/health/ready`. This contradicts REQ-HOST-4a,
which defines readiness as cluster membership + DB connectivity + singletons —
explicitly *not* leadership. `/health/active` is the endpoint intended to report
leadership. A load balancer using `/health/ready` to decide whether a node may
serve traffic will permanently treat the standby as unready, defeating failover
readiness. No test covers this: `HealthCheckTests.HealthReady_Endpoint_ReturnsResponse`
only asserts a response is returned, not the standby semantics.
**Recommendation**
Add a `Predicate` to the `/health/ready` mapping that excludes the `active-node`
check, e.g. `Predicate = check => check.Name != "active-node"` (or tag the readiness
checks and filter by tag). Add a regression test asserting a non-leader node returns
`200` on `/health/ready`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Root cause confirmed against
`Program.cs`: the `/health/ready` mapping had no `Predicate`, so it executed all
three registered checks including the leader-only `active-node` check, while
`ActiveNodeHealthCheck` returns `Unhealthy` on any non-leader node — making a fully
operational standby central node permanently report `503`. Fix: added
`Predicate = check => check.Name != "active-node"` to the `/health/ready`
`HealthCheckOptions`, so readiness now reflects cluster membership + DB connectivity
only (REQ-HOST-4a); leadership remains reported solely by `/health/active`.
Regression test `HealthCheckTests.HealthReady_Endpoint_ExcludesActiveNodeCheck`
asserts the `active-node` check name does not appear in the `/health/ready`
response body; it failed before the fix and passes after. Full Host suite green
(156 passed).
### Host-002 — Akka.Persistence required by REQ-HOST-6 is not configured and not used
| | |
|--|--|
| Severity | Medium — re-triaged: stale design doc, Host code is correct |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:70-108`, `docs/requirements/Component-Host.md` REQ-HOST-6 |
**Description**
REQ-HOST-6 states the Host "must configure the Akka.NET actor system using
Akka.Hosting with ... **Persistence**: Configured with the appropriate journal and
snapshot store (SQL for central, SQLite for site)." The HOCON built in
`AkkaHostedService.StartAsync` contains no `akka.persistence` section, no journal and
no snapshot-store plugin, and `ScadaLink.Host.csproj` references neither
`Akka.Persistence.Hosting` nor any persistence plugin (the design doc Dependencies
list `Akka.Persistence.Hosting`). A repo-wide search finds **no** `PersistentActor` /
`ReceivePersistentActor` subclasses — the system deliberately uses custom SQLite
storage services instead. The code is internally consistent, but the design document
is stale: it mandates a subsystem that does not exist. This is a documented-vs-actual
drift that will mislead future maintainers and any audit against REQ-HOST-6.
**Recommendation**
Update `Component-Host.md` REQ-HOST-6 and the Dependencies list to remove the
Akka.Persistence requirement (or explicitly state persistence is provided by
component-owned SQLite storage, not Akka.Persistence). If persistence *is* intended,
add the plugin packages and HOCON. Either way, code and doc must agree.
**Resolution**
_Verified 2026-05-16, left Open — re-triaged._ The finding is accurate: a repo-wide
search confirms there are **no** `PersistentActor` / `ReceivePersistentActor`
subclasses anywhere in `src/`, no `akka.persistence` section in the HOCON built by
`AkkaHostedService.StartAsync`, and `ScadaLink.Host.csproj` references no persistence
plugin packages. The system deliberately uses component-owned SQLite storage
services instead. The **Host code is therefore correct and internally consistent** —
the only defect is that `docs/requirements/Component-Host.md` REQ-HOST-6 and its
Dependencies list still mandate Akka.Persistence, a subsystem that does not (and is
not intended to) exist. The sole fix is editing that design document, which lies
outside this resolution task's permitted edit scope (`src/ScadaLink.Host`,
`tests/ScadaLink.Host.Tests`, `code-reviews/Host/findings.md`). No code or test
change can resolve a stale-doc finding. Left **Open** and surfaced for the design-doc
owner: REQ-HOST-6 must drop the Akka.Persistence requirement (and the
`Akka.Persistence.Hosting` Dependencies entry), stating that node-local persistence
is provided by component-owned SQLite storage services.
### Host-003 — Secrets committed in plaintext in `appsettings.Central.json`
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.Host/appsettings.Central.json:20-31` |
**Description**
`appsettings.Central.json` contains real-looking secrets in plaintext, checked into
source control: SQL Server passwords in the `ConfigurationDb` / `MachineDataDb`
connection strings (`Password=ScadaLink_Dev1#`), an LDAP service-account password
(`LdapServiceAccountPassword: "password"`), and a JWT signing key
(`JwtSigningKey: "scadalink-dev-jwt-signing-key-..."`). Even though these are
intended as development defaults, shipping them in the default config invites them
being reused verbatim in production, and a committed JWT signing key allows anyone
with repo access to forge session tokens. `TrustServerCertificate=true` additionally
disables TLS validation for the SQL connection.
**Recommendation**
Move all secrets out of committed `appsettings*.json` into environment variables,
user-secrets, or a secret store. Keep only non-sensitive structural defaults in the
file and document the required environment variables. At minimum add a clear comment
that these values are dev-only and must be overridden, and rotate the JWT key per
environment.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Root cause confirmed against
`appsettings.Central.json`: the committed file carried real-looking secrets in
plaintext — SQL Server passwords (`Password=ScadaLink_Dev1#`) in both connection
strings, an LDAP service-account password, and a JWT signing key. Fix: all four
secrets were removed from the committed file and replaced with non-functional
`${...}` placeholder references (`ConfigurationDb` / `MachineDataDb`,
`LdapServiceAccountPassword`, `JwtSigningKey`). A new top-level `_secrets` note
documents that the Host's configuration builder (`AddEnvironmentVariables()`)
overlays the real values supplied via environment variables
(`ScadaLink__Database__ConfigurationDb`, `ScadaLink__Database__MachineDataDb`,
`ScadaLink__Security__LdapServiceAccountPassword`,
`ScadaLink__Security__JwtSigningKey`); the placeholders are intentionally invalid so
a misconfigured deployment fails loudly rather than silently using a committed key.
Regression test class `ConfigSecretsTests` asserts the committed file carries no
plaintext `Password=` value, no committed LDAP service-account password, and no
committed JWT signing key; all three tests failed before the fix and pass after.
Tests that drive the full `Program` startup pipeline against the real SQL provider
(`HealthCheckTests`, `HostStartupTests.CentralRole_StartsWithoutError`) were adapted
to supply the local dev connection strings themselves via the new
`CentralDbTestEnvironment` test helper (environment variables) — they must no longer
depend on committed secrets. Note: the `docker/central-node-*/appsettings.Central.json`
files still contain the same dev secrets but lie outside this task's permitted edit
scope; they should receive the same treatment in a follow-up.
### Host-004 — Site seed-node list points at the gRPC port, not a remoting port
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.Host/appsettings.Site.json:10-19` |
**Description**
The shipped site config sets `Node:RemotingPort = 8082` and `Node:GrpcPort = 8083`,
but `Cluster:SeedNodes` is `["akka.tcp://scadalink@localhost:8082",
"akka.tcp://scadalink@localhost:8083"]`. The second seed node targets `8083`, which
is the Kestrel HTTP/2 gRPC port — not an Akka remoting endpoint. A node attempting to
join via that seed will try to establish an Akka.Remote TCP association against the
gRPC listener and fail. `StartupValidator` only checks that ≥2 seed nodes exist
(`StartupValidator.cs:54-56`), so this misconfiguration passes validation silently.
For the single-node dev site it is harmless (the first seed succeeds), but it is an
incorrect example that will be copied into multi-node site configs.
**Recommendation**
Correct the site seed-node list to reference the two site nodes' *remoting* ports
(e.g. `8082` and `8084`), never the gRPC port. Consider extending `StartupValidator`
to reject a seed node whose port equals this node's `GrpcPort`.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Root cause confirmed against
`appsettings.Site.json`: with `Node:RemotingPort = 8082` and `Node:GrpcPort = 8083`,
the second `Cluster:SeedNodes` entry was `akka.tcp://scadalink@localhost:8083` — the
Kestrel HTTP/2 gRPC port, not an Akka.Remote endpoint. `StartupValidator` only
checked seed-node *count* (≥2), so the misconfiguration passed silently. Fix, two
parts: (1) the shipped `appsettings.Site.json` second seed entry was corrected to a
remoting port (`localhost:8084`); (2) `StartupValidator.Validate` was extended — for
`Site` nodes it now parses each seed node's TCP port (via a new `SeedNodePort`
helper) and rejects any entry whose port equals the node's `GrpcPort`, using the
resolved GrpcPort including the `8083` `NodeOptions` default when the key is absent.
The seed-node-count check was hoisted above the Site block so the new rule can reuse
the parsed list. Regression tests in `StartupValidatorTests`:
`Site_SeedNodeOnGrpcPort_FailsValidation`,
`Site_SeedNodeOnDefaultGrpcPort_FailsValidation` (default-8083 path),
`Site_SeedNodesOnRemotingPort_PassesValidation`, and
`Central_SeedNodeOnPort8083_PassesValidation` (rule is Site-only) — all failed
appropriately before the fix and pass after.
### Host-005 — Blocking sync-over-async (`GetAwaiter().GetResult()`) inside `StartAsync`
| | |
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:345` |
**Description**
`RegisterSiteActors` calls `storeAndForwardService.StartAsync().GetAwaiter().GetResult()`
synchronously, blocking inside the `IHostedService.StartAsync` path. `StartAsync` is
itself declared synchronous (returns `Task.CompletedTask`), so the work cannot be
awaited cleanly. Blocking on async work risks thread-pool starvation during startup
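and, if the awaited operation captures a synchronization context, deadlock. (Unlike
`.Result`/`.Wait()`, `GetAwaiter().GetResult()` rethrows the original exception rather
than wrapping it in an `AggregateException`, so exception wrapping is not the concern
here; the blocking wait and the dropped `CancellationToken` are.)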
**Recommendation**
Make `AkkaHostedService.StartAsync` genuinely `async` and `await
storeAndForwardService.StartAsync(cancellationToken)`. Propagate the
`CancellationToken` and let exceptions surface as the original type.
**Resolution**
_Unresolved._
### Host-006 — HOCON assembled by unescaped string interpolation
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:70-108` |
**Description**
The Akka HOCON is built with an interpolated string that injects
`_nodeOptions.NodeHostname`, `_clusterOptions.SeedNodes`, the computed roles, and
`SplitBrainResolverStrategy` directly into the configuration text. Values are not
escaped. A hostname or seed-node string containing a quote, backslash, brace, or
comment sequence would corrupt the HOCON and produce a confusing parse error far from
the real cause; `SplitBrainResolverStrategy` is interpolated without quoting, so a
value with whitespace breaks the document. Building cluster configuration from raw
string concatenation is also harder to maintain than the typed Akka.Hosting builder
the design doc (REQ-HOST-6) actually calls for ("via Akka.Hosting").
**Recommendation**
Prefer the `Akka.Hosting` `AddAkka(...)` builder with strongly-typed `WithRemoting`,
`WithClustering`, and split-brain-resolver configuration instead of hand-built HOCON.
If HOCON must be retained, validate/escape interpolated values (especially hostname
and seed nodes) before substitution.
**Resolution**
_Unresolved._
### Host-007 — REQ-HOST-4 rule "GrpcPort ≠ RemotingPort" is not enforced
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.Host/StartupValidator.cs:43-47` |
**Description**
REQ-HOST-4 requires: "Site nodes must have `GrpcPort` in valid port range (1–65535)
**and different from `RemotingPort`**." `StartupValidator` validates the GrpcPort
range but never compares it to `RemotingPort`. A site config that sets both ports to
the same value passes validation and then fails opaquely at runtime when Kestrel and
Akka.Remote both try to bind the port. The GrpcPort range check is also skipped
entirely when the key is absent (`grpcPortStr != null`), relying on the
`NodeOptions` default of 8083 — acceptable, but the equality rule is the missing
piece.
**Recommendation**
Add a check in the `role == "Site"` block: if `GrpcPort` (resolved, including the
8083 default) equals `RemotingPort`, add an error
`"ScadaLink:Node:GrpcPort must differ from RemotingPort"`.
**Resolution**
_Unresolved._
### Host-008 — `MachineDataDb` is validated and declared but never consumed
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.Host/StartupValidator.cs:33-34`, `src/ScadaLink.Host/DatabaseOptions.cs:6` |
**Description**
`StartupValidator` requires a non-empty `ScadaLink:Database:MachineDataDb` connection
string for Central nodes, and `DatabaseOptions` exposes a `MachineDataDb` property,
but a repo-wide search shows the value is never read anywhere outside the Host module
— only `ConfigurationDb` is passed to `AddConfigurationDatabase`
(`Program.cs:83-85`). The Host therefore fails startup if `MachineDataDb` is missing
even though nothing uses it. This is either dead configuration that should be removed
or a missing wiring (a machine-data DbContext that was never registered).
**Recommendation**
Determine whether a machine-data store is actually required. If yes, wire it into the
relevant component's DI registration. If no, remove the `MachineDataDb` validation
rule, the `DatabaseOptions` property, and the key from `appsettings.Central.json`.
**Resolution**
_Unresolved._
### Host-009 — `StartAsync` reports success before role actors are confirmed running
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:127-141` |
**Description**
`StartAsync` creates actors with `ActorOf` (a fire-and-forget operation — the actor's
`PreStart` runs asynchronously on its own thread) and then returns
`Task.CompletedTask`. For site nodes, `grpcServer.SetReady(_actorSystem)` is called
synchronously at the end of `RegisterSiteActors`, marking the gRPC server ready even
though `SiteCommunicationActor`, the deployment-manager singleton, and the
`ClusterClient` may not yet have completed their `PreStart`/initial-contact handshake.
REQ-HOST-7 requires "Actor system and SiteStreamManager ... initialized before gRPC
begins accepting connections" — `SiteStreamManager.Initialize` is awaited-equivalent,
but the broader actor graph is not. The window is small and the gRPC server still
rejects streams until `SetReady`, so impact is limited, but readiness is being
asserted optimistically.
**Recommendation**
If strict ordering matters, gate `SetReady` on confirmation that
`SiteCommunicationActor` is fully initialized (e.g. an `Ask` round-trip or a
readiness message), or document explicitly that gRPC readiness only guarantees the
actor system exists, not that the cluster handshake has completed.
**Resolution**
_Unresolved._
### Host-010 — No retry/backoff around startup preconditions (DB migration, readiness)
| | |
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.Host/Program.cs:112-125` |
**Description**
On Central startup the Host opens a DI scope and calls
`MigrationHelper.ApplyOrValidateMigrationsAsync` directly. If the SQL Server is not
yet reachable (common in container orchestration where the DB and app start
together), the call throws, the top-level `catch` logs `Fatal`, and the process
exits. There is no bounded retry/backoff to tolerate a database that is briefly
unavailable at boot. The design intent (REQ-HOST-4a, readiness gating, `503` until
ready) is about *serving traffic*, but the migration step happens before the host
even runs and has no such tolerance.
**Recommendation**
Wrap the migration/validation step in a bounded retry with exponential backoff (e.g.
Polly), or move schema apply behind the readiness gate so the process stays up and
reports `503` until the database becomes reachable.
**Resolution**
_Unresolved._
### Host-011 — `LoggingOptions.MinimumLevel` is dead configuration
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.Host/LoggingOptions.cs:5`, `src/ScadaLink.Host/Program.cs:42-50` |
**Description**
`LoggingOptions` exposes a `MinimumLevel` property bound from `ScadaLink:Logging`
(`SiteServiceRegistration.BindSharedOptions`), and both `appsettings.Central.json`
and `appsettings.Site.json` set `"Logging": { "MinimumLevel": "Information" }`.
However Serilog is configured purely via `ReadFrom.Configuration(configuration)`,
which reads the standard `Serilog` section — not `ScadaLink:Logging`. The
`LoggingOptions.MinimumLevel` value is never read by any code, so changing it has no
effect. This is misleading: an operator editing `ScadaLink:Logging:MinimumLevel`
expecting a log-level change will see nothing happen.
**Recommendation**
Either consume `LoggingOptions.MinimumLevel` when configuring the Serilog
`LoggerConfiguration` (e.g. set `MinimumLevel.Is(...)` from it), or remove the option
class and the `ScadaLink:Logging` sections and rely solely on the `Serilog`
configuration section. Keep one mechanism, not two.
**Resolution**
_Unresolved._
+526
View File
@@ -0,0 +1,526 @@
# Code Review — InboundAPI
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.InboundAPI` |
| Design doc | `docs/requirements/Component-InboundAPI.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 6 |
## Summary
The InboundAPI module is small (8 source files) and the happy-path flow — extract
key, validate, deserialize parameters, execute script, serialize result — is clean
and readable. However the review surfaced several real problems concentrated in two
themes: **concurrency** and **security**. The `InboundScriptExecutor` is a singleton
that mutates a plain `Dictionary` from concurrent ASP.NET request threads with no
synchronization, which can corrupt the handler cache or crash the process under load.
On the security side, API-key comparison is a non-constant-time database string
match (timing oracle), compiled scripts run with no enforcement of the documented
script trust model (forbidden APIs such as `System.IO`/`Process`/`Reflection` are
fully reachable), there is no request-body size limit, and the executor's catch-all
swallows `OperationCanceledException` from genuine client disconnects as a "timeout".
Design-doc adherence is also incomplete: the `Database.Connection()` script API
described in the design doc is entirely absent from `InboundScriptContext`, and the
endpoint never enforces that the API is central-only. Testing covers the validators
well but there is no coverage of the HTTP endpoint, concurrency, or recompilation.
None of the findings are data-loss-class, but the concurrency and trust-model issues
are High severity and should be addressed before production use.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | `CoerceValue` returns `null` for legitimately-null/`String` values indistinguishably; parameter-definition edge cases noted. |
| 2 | Akka.NET conventions | ☑ | Module is ASP.NET-hosted, no actors of its own; routes to actors via `CommunicationService`. No correlation-ID issues — IDs are set in `RouteHelper`. |
| 3 | Concurrency & thread safety | ☑ | Singleton `InboundScriptExecutor` mutates a non-thread-safe `Dictionary` from concurrent request threads — see InboundAPI-001/002. |
| 4 | Error handling & resilience | ☑ | Catch-all conflates client cancellation with timeout (InboundAPI-004); compilation-failure path repeats work on every request (InboundAPI-009). |
| 5 | Security | ☑ | Non-constant-time key comparison, no trust-model enforcement, no body-size limit, missing-method enumeration oracle — see InboundAPI-003/005/006/011. |
| 6 | Performance & resource management | ☑ | Up to 3 separate DB round-trips per request in `ApiKeyValidator`; uncapped lazy recompilation. |
| 7 | Design-document adherence | ☑ | `Database.Connection()` script API missing; central-only hosting not enforced; lazy-compile diverges from "compiled at startup". |
| 8 | Code organization & conventions | ☑ | `ParameterDefinition` is an API-shaped POCO declared in the component project rather than Commons; otherwise conventions followed. |
| 9 | Testing coverage | ☑ | Good unit coverage of the two validators; no endpoint, concurrency, recompilation, or timeout-vs-cancel tests. |
| 10 | Documentation & comments | ☑ | `ApiKeyValidationResult.NotFound` XML/name says "NotFound" but returns HTTP 400 — misleading (InboundAPI-013). |
## Findings
### InboundAPI-001 — Singleton script handler cache mutated without synchronization
| | |
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:17`, `:32`, `:40`, `:89`, `:123-128` |
**Description**
`InboundScriptExecutor` is registered as a singleton (`ServiceCollectionExtensions.cs:11`)
and its handler cache is a plain `Dictionary<string, Func<...>>` (`InboundScriptExecutor.cs:17`).
`RegisterHandler`, `RemoveHandler`, `CompileAndRegister`, and the lazy-compile path in
`ExecuteAsync` all read and write this dictionary with no lock. ASP.NET serves inbound
API requests on concurrent thread-pool threads, so two requests for an as-yet-uncompiled
method (or a request racing a CLI-triggered `CompileAndRegister`) can mutate the
dictionary concurrently. `Dictionary` is explicitly not safe for concurrent
read/write — this can corrupt internal buckets, throw `InvalidOperationException`,
or return a torn/`null` handler, crashing the request or the process.
**Recommendation**
Replace the `Dictionary` with a `ConcurrentDictionary<string, Func<...>>`, or guard all
access with a lock. For the lazy-compile path use `GetOrAdd` so concurrent first-callers
compile at most once.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): replaced the plain `Dictionary` handler
cache with a `ConcurrentDictionary`; `RemoveHandler` now uses `TryRemove`; the
lazy-compile path in `ExecuteAsync` compiles outside the cache and inserts atomically
via `GetOrAdd` so concurrent first-callers share one handler. Regression tests
`ConcurrentLazyCompile_SameMethod_DoesNotCorruptCache` and
`ConcurrentRegisterAndExecute_DoesNotThrow` added.
### InboundAPI-002 — Lazy compilation is a check-then-act race with no atomicity
| | |
|--|--|
| Severity | Medium — re-triaged: already fixed by the InboundAPI-001 fix; verified and closed |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:152-161` |
**Description**
`ExecuteAsync` does `if (!_scriptHandlers.TryGetValue(...)) { CompileAndRegister(method); handler = _scriptHandlers[method.Name]; }`.
Even setting aside the unsynchronized dictionary (InboundAPI-001), this is a
check-then-act sequence: between `TryGetValue` failing and the re-read on line 128,
another thread could `RemoveHandler` the entry, causing the indexer on line 128 to
throw `KeyNotFoundException` — an unhandled-in-context exception that is then caught
only by the broad catch on line 143 and reported to the caller as "Internal script
error". Multiple concurrent first-callers will also each compile the same script
redundantly (wasted Roslyn work).
**Recommendation**
Make compile-and-fetch a single atomic operation (`ConcurrentDictionary.GetOrAdd`
with a lazily-evaluated factory, or a per-method lock), and have `CompileAndRegister`
return the handler it produced rather than requiring a separate dictionary read.
**Resolution**
Resolved 2026-05-16 (commit `pending`): re-triage — verified against the current
source, this finding was **already fixed** by the InboundAPI-001 fix. The
`InboundScriptExecutor.cs:152-161` lazy-compile path no longer does a check-then-act
re-read: `Compile(method)` runs unconditionally (it never reads the cache) and the
result is published via the atomic `_scriptHandlers.GetOrAdd(method.Name, compiled)`.
There is no separate dictionary indexer read, so the `KeyNotFoundException` race the
finding describes cannot occur, and concurrent first-callers all share the single
handler that `GetOrAdd` keeps. Regression test
`LazyCompile_RacingRemoveHandler_NeverThrowsKeyNotFound` added (asserts a concurrent
`RemoveHandler` storm against lazy-compiling callers never yields the catch-all
"Internal script error"); it passes against the current code, confirming the fix.
### InboundAPI-003 — API key compared with non-constant-time string equality
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ConfigurationDatabase/Repositories/InboundApiRepository.cs:22-23`, consumed by `src/ScadaLink.InboundAPI/ApiKeyValidator.cs:33` |
**Description**
API-key authentication resolves the key with
`FirstOrDefaultAsync(k => k.KeyValue == keyValue)` — an ordinary equality match
translated to a SQL `WHERE KeyValue = @p` comparison. The secret is matched with
ordinary (early-exit) string/SQL comparison rather than a constant-time comparison,
which is a classic timing side-channel for secret material. Combined with the design's
explicit "no rate limiting" decision, an attacker with network access to the central
API can mount a timing attack to recover valid keys. The API key is the *sole*
credential for the inbound API, so this is the primary authentication path.
**Recommendation**
Look the key up by a non-secret indexed identifier (e.g. a key prefix/id) or fetch
candidate rows, then verify the secret in-process using
`CryptographicOperations.FixedTimeEquals` over the UTF-8 bytes. Preferably store only
a salted hash of the key value and compare hashes. Avoid leaking secret-length and
match-position timing.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): `ApiKeyValidator` no longer calls the
secret-equality lookup `GetApiKeyByValueAsync` (the SQL `WHERE KeyValue = @secret`
timing oracle). It now fetches all keys via `GetAllApiKeysAsync` and matches the
secret in-process with `CryptographicOperations.FixedTimeEquals` over the UTF-8 bytes,
scanning every candidate so neither match position nor secret length is observable.
Regression tests `ValidateAsync_DoesNotUseSecretEqualityLookup`,
`ValidateAsync_WrongKey_ConstantTimePath_Returns401`, and
`ValidateAsync_KeyOfDifferentLength_Returns401` added. Note: the timing-oracle method
`GetApiKeyByValueAsync` remains on `IInboundApiRepository` (it is outside this module);
removing it from the repository is left as separate follow-up since the validator no
longer depends on it.
### InboundAPI-004 — Client disconnect is misreported as a script timeout
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:117-141` |
**Description**
`ExecuteAsync` creates a linked CTS from `httpContext.RequestAborted` and the method
timeout, then catches `OperationCanceledException` and unconditionally returns
"Script execution timed out". When the *client* aborts the request (`RequestAborted`
fires), the same exception type is thrown, so a normal client disconnect is logged as
a timeout (`_logger.LogWarning("Script execution timed out ...")`) and an attempt is
made to write a 500 timeout body to an already-gone connection. This pollutes the
failure log (which the design says is reserved for genuine script errors) and obscures
real timeout incidents.
**Recommendation**
Distinguish the two cancellation sources: if `cancellationToken` (the request token)
is cancelled, treat it as a client abort — do not log a timeout and do not attempt to
write a response. Only when the timeout CTS fired should the result be "timed out".
Check `cts.Token.IsCancellationRequested && !cancellationToken.IsCancellationRequested`
or use a dedicated timeout `CancellationTokenSource` so the two are separable.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): `ExecuteAsync` now uses a dedicated timeout
`CancellationTokenSource` (`new CancellationTokenSource(timeout)`) linked with the
request-abort token, so the two cancellation sources are separable. The
`OperationCanceledException` handler reports "Script execution timed out" (and logs a
warning) **only** when the timeout CTS fired and the request token did not; a client
abort instead returns "Request cancelled by client" and logs at Debug — the failure
log stays reserved for genuine script-execution timeouts. `HandleInboundApiRequest`
additionally short-circuits with `Results.Empty` (no warning log, no 500 body write)
when `RequestAborted` is cancelled, since the connection is already gone. Regression
tests `ClientDisconnect_IsNotReportedAsTimeout` and `GenuineTimeout_StillReportedAsTimeout`
added.
### InboundAPI-005 — Compiled API scripts run with no script-trust-model enforcement
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:56-93` |
**Description**
CLAUDE.md's Akka.NET conventions state the script trust model forbids `System.IO`,
`Process`, `Threading`, `Reflection`, and raw network access. `CompileAndRegister`
compiles arbitrary C# with `CSharpScript.Create` and only restricts the *default
imports* (`WithImports("System", ...)`). Imports are a convenience, not a sandbox — a
script can still fully-qualify any type (`System.IO.File.Delete(...)`,
`System.Diagnostics.Process.Start(...)`, `System.Reflection`, raw `Socket`) because
the core framework assemblies are referenced and Roslyn scripting performs no API
allow/deny-listing. Inbound API scripts execute on the central node with the host
process's privileges, so a malicious or buggy method definition has full host access.
Note the Design role authors these scripts (less trusted than Admin), making
enforcement material.
**Recommendation**
Add a compile-time analyzer/`SyntaxWalker` (as the Site Runtime does for instance
scripts) that rejects forbidden namespaces/types before registering a handler, and/or
run scripts under a constrained boundary. At minimum, share the Site Runtime's
forbidden-API checker so the trust model is enforced consistently. Reject the method
(and log) when a violation is found instead of registering it.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): added `ForbiddenApiChecker`, a Roslyn
`CSharpSyntaxWalker` that statically rejects scripts referencing forbidden namespaces
(`System.IO`, `System.Diagnostics`, `System.Threading` except `System.Threading.Tasks`,
`System.Reflection`, `System.Net`, `System.Runtime.InteropServices`, `Microsoft.Win32`)
whether reached via a `using` directive or a fully-qualified name. `CompileAndRegister`
now runs the check before Roslyn compilation and refuses to register (and logs) a
violating method; `ExecuteAsync`'s lazy-compile path is gated by the same check.
Regression tests `CompileAndRegister_ForbiddenApi_RejectsScript` (theory),
`ExecuteAsync_ForbiddenApiScript_DoesNotRunAndReturnsFailure`, and
`CompileAndRegister_PermittedScript_StillRegisters` added.
### InboundAPI-006 — No request body size limit on the inbound endpoint
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/EndpointExtensions.cs:54-62` |
**Description**
`HandleInboundApiRequest` calls `JsonDocument.ParseAsync(httpContext.Request.Body, ...)`
with no explicit body-size cap and no `[RequestSizeLimit]`/endpoint metadata. Although
Kestrel has a default max request body size, this endpoint accepts arbitrary JSON from
external systems, fully buffers it into a `JsonDocument`, and then `Clone()`s the
root element (`:61`) which materializes the entire document on the heap. With no rate
limiting (a deliberate design choice) a single caller can drive large allocations.
Deep/wide JSON also makes the `CoerceValue` `object`/`list` deserialization
(`ParameterValidator.cs:113,117`) expensive.
**Recommendation**
Set an explicit, modest body-size limit on the endpoint
(`.WithMetadata(new RequestSizeLimitAttribute(...))` or
`IHttpMaxRequestBodySizeFeature`) and consider a `JsonDocumentOptions` `MaxDepth`.
Reject oversized bodies with 413 before buffering.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): added `InboundApiEndpointFilter`, an
`IEndpointFilter` applied to `POST /api/{methodName}` via `.AddEndpointFilter<>()`.
It rejects requests whose declared `Content-Length` exceeds
`InboundApiOptions.MaxRequestBodyBytes` (default 1 MiB) with HTTP 413 *before* the handler buffers the
body into a `JsonDocument`, and also lowers the per-request `IHttpMaxRequestBodySizeFeature`
cap so a chunked/unknown-length stream is cut off by Kestrel while being read. The
limit is configurable via the bound `ScadaLink:InboundApi` options section. Regression
tests `OversizedBody_ShortCircuitsWith413_AndDoesNotRunHandler`, `BodyAtLimit_RunsHandler`,
and `FilterCapsMaxRequestBodySizeFeature` added.
### InboundAPI-007 — `Database.Connection()` script API from the design doc is not implemented
| | |
|--|--|
| Severity | Medium — verified real drift; left Open pending a design decision (see Resolution) |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:188-203` |
**Description**
`Component-InboundAPI.md` ("Script Runtime API -> Database Access") specifies
`Database.Connection("connectionName")` as an available script capability for
querying the configuration/machine-data databases. `InboundScriptContext` exposes only
`Parameters`, `Route`, and `CancellationToken` — there is no `Database` member. Any
method script that follows the documented API will fail to compile. Either the code
is incomplete or the design doc is stale; the two must be reconciled.
**Recommendation**
If database access is in scope, add a `Database` property to `InboundScriptContext`
backed by a connection-factory service. If it is not, remove the "Database Access"
section from `Component-InboundAPI.md` so the design doc stops advertising an absent
API.
**Resolution**
_Unresolved — left Open; needs a design decision the resolving agent cannot make._
Re-triage 2026-05-16: confirmed against the current source — the drift is **real**.
`InboundScriptContext` (`InboundScriptExecutor.cs:188-203`) exposes only `Parameters`,
`Route`, and `CancellationToken`; there is no `Database` member, so a method script
following the documented `Database.Connection("name")` API fails to compile.
This finding cannot be closed by the InboundAPI module agent for two reasons:
1. **Scope** — the alternative resolution (deleting the "Database Access" section)
edits `docs/requirements/Component-InboundAPI.md`, which is outside the editable
scope (`src/ScadaLink.InboundAPI`, `tests/`, this file only).
2. **It is a genuine design decision.** Implementing `Database.Connection()` is not a
mechanical fix: it hands inbound API scripts a *raw* MS SQL client. The ScadaLink
script trust model (CLAUDE.md, Akka.NET conventions) forbids scripts from using `System.IO`
and raw network access, and `ForbiddenApiChecker` (added for InboundAPI-005) now
statically blocks `System.Net`/`System.IO`. A raw `SqlConnection` is in clear
tension with that trust model, and the set of connection names a script may open,
read-only vs. read-write access, and connection lifetime/pooling all require a
design call. **Surface to the design owner:** decide whether `Database.Connection()`
is in scope — if yes, write a design note covering the trust-model carve-out and
then implement a `Database` member backed by a connection-factory service; if no,
delete the "Database Access" section from `Component-InboundAPI.md`.
### InboundAPI-008 — Inbound API endpoint not restricted to the active central node
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.InboundAPI/EndpointExtensions.cs:19-23`, `src/ScadaLink.Host/Program.cs:149` |
**Description**
The design states the Inbound API is "Central cluster only (active node)" and "fails
over with it". `MapInboundAPI` registers `POST /api/{methodName}` unconditionally, and
`Program.cs` maps it inside the central-role branch but with no active-node gating —
unlike `/health/active` which has an `active-node` predicate. A standby central node
will happily serve inbound API calls, executing scripts and `Route.To()` calls from a
non-leader, which can race the active node or run against stale singleton state.
**Recommendation**
Gate the endpoint on active-node status (reuse the cluster `active-node` health check
or a leader-state check) and return 503 on the standby, so Traefik/clients only reach
the live node — consistent with how the Management API and `/health/active` are
treated.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): introduced `IActiveNodeGate`, an abstraction
the inbound API uses to ask whether this node is the active (cluster-leader) central
node. The new `InboundApiEndpointFilter` (applied to `POST /api/{methodName}`)
consults the gate and short-circuits a standby node with HTTP 503 before any
auth/script work, so Traefik/clients only reach the live node — consistent with
`/health/active`. The gate is resolved optionally: when no implementation is
registered (non-clustered host / tests) the endpoint defaults to "allow", preserving
prior behaviour. Regression tests `StandbyNode_ShortCircuitsWith503_AndDoesNotRunHandler`,
`ActiveNode_PassesGate_RunsHandler`, and `NoGateRegistered_PassesGate_RunsHandler`
added. **Follow-up (outside this module's scope):** `ScadaLink.Host` should register
an `IActiveNodeGate` implementation backed by `ActiveNodeHealthCheck` /
`Cluster.State.Leader` in the central-role branch of `Program.cs` so the gate is
actually enforced in production; until then the endpoint defaults to "allow".
### InboundAPI-009 — Failed compilation is retried on every subsequent request
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/InboundScriptExecutor.cs:123-128` |
**Description**
When a method's script fails to compile, `CompileAndRegister` returns `false` and
nothing is stored in `_scriptHandlers`. Every subsequent call to that method re-enters
the lazy-compile branch and recompiles the broken script via Roslyn from scratch.
Roslyn compilation is expensive; a single broken method definition repeatedly invoked
by an external caller (no rate limiting) becomes a CPU amplification vector.
**Recommendation**
Cache the compilation *failure* (e.g. store a sentinel handler that immediately
returns the compile error, or keep a `HashSet` of known-bad method names with the
diagnostic) so a broken script is compiled at most once until the definition is
updated via `CompileAndRegister`.
**Resolution**
_Unresolved._
### InboundAPI-010 — `ParameterValidator` ignores extra body fields and cannot validate Object/List element types
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ParameterValidator.cs:64-90`, `:112-118` |
**Description**
Two related correctness gaps: (1) The validator iterates only over *defined*
parameters; any extra top-level fields in the request body are silently ignored
rather than reported, so callers get no feedback on typo'd parameter names. (2) For
`Object` and `List` types the validator only checks the JSON *kind* (`Object`/`Array`)
and then blindly `JsonSerializer.Deserialize`s the raw text — the design's extended
type system describes Objects as "named structure with typed fields" and Lists as
collections "of objects or primitive types", but no field-level or element-level type
validation is performed. Invalid nested structures pass validation and surface only
as runtime script errors.
**Recommendation**
Optionally warn/400 on unexpected body fields. For the extended types, either parse a
richer `ParameterDefinition` (with nested field definitions / element type) and
validate recursively, or document explicitly that Object/List are validated only for
shape — and update the design doc to match.
**Resolution**
_Unresolved._
### InboundAPI-011 — Method-existence check leaks to unapproved callers (enumeration oracle)
| | |
|--|--|
| Severity | Low |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ApiKeyValidator.cs:39-52` |
**Description**
`ValidateAsync` returns 400 `Method '{methodName}' not found` when the method does not
exist, but 403 `API key not approved for this method` when it exists but the key is
not approved. A caller holding any valid enabled key can therefore enumerate which
method names exist on the central API by observing 400-vs-403 responses. The error
message also echoes the caller-supplied `methodName` back verbatim into the JSON
response (`EndpointExtensions.cs:47`), a minor reflected-input concern.
**Recommendation**
Return an indistinguishable response (e.g. 403/404) for both "method not found" and
"key not approved" so existence is not observable to unapproved callers. Avoid echoing
raw caller input in error bodies, or sanitize it.
**Resolution**
_Unresolved._
### InboundAPI-012 — `ParameterDefinition` POCO declared in the component project, not Commons
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ParameterValidator.cs:128-133` |
**Description**
`ParameterDefinition` is a persistence-/contract-shaped POCO: it is the deserialized
form of `ApiMethod.ParameterDefinitions` (a column in the configuration database) and
describes the public API contract. CLAUDE.md's code-organization rules place
persistence-ignorant entity/contract types in `ScadaLink.Commons`. Defining it inside
the InboundAPI project means any other component that needs to read or produce method
parameter definitions (e.g. Central UI's method editor, CLI, Management Service)
cannot share the type and will duplicate it.
**Recommendation**
Move `ParameterDefinition` (and a matching return-definition type, if added) to
`ScadaLink.Commons` under the InboundApi entity/types namespace so it is shared by all
components that work with method definitions.
**Resolution**
_Unresolved._
### InboundAPI-013 — `ApiKeyValidationResult.NotFound` factory returns HTTP 400, contradicting its name
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.InboundAPI/ApiKeyValidator.cs:78-79` |
**Description**
The static factory is named `NotFound` and is used for the "method not found" case,
but it builds a result with `StatusCode = 400` (Bad Request), not 404. The name
strongly implies 404 and will mislead future maintainers; `EndpointExtensions`
faithfully propagates whatever status code the factory sets, so the misnaming directly
affects the wire contract.
**Recommendation**
Rename the factory to match its behaviour (e.g. `BadRequest`) or change the status
code to 404 if that is the intended contract — and document the chosen "method not
found" status in `Component-InboundAPI.md`'s Error Handling section, which currently
does not list it.
**Resolution**
_Unresolved._
+508
View File
@@ -0,0 +1,508 @@
# Code Review — ManagementService
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.ManagementService` |
| Design doc | `docs/requirements/Component-ManagementService.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 5 |
## Summary
The ManagementService module is a thin command-dispatch layer: a single `ManagementActor`
fronts every administrative operation, an HTTP `POST /management` endpoint authenticates and
forwards to it, and a SignalR `DebugStreamHub` provides real-time debug streaming. The code
is consistently structured and the role-based authorization gate (`GetRequiredRole`) is
broadly correct and well tested. However, the review surfaced a significant **security
theme**: site-scope enforcement, which the design document requires for instance- and
site-targeted Deployment operations, is applied inconsistently — several query handlers and
all remote-query/debug handlers perform no site-scope check at all, allowing a site-scoped
Deployment user to read or act on sites outside their scope. A second theme is **Akka.NET
convention drift**: the actor offloads all work to `Task.Run` instead of using `PipeTo`,
declares no supervision strategy, and the contract messages carry a loosely-typed `object`
payload. There are also resource-management defects in the HTTP endpoint (`JsonDocument`
instances never disposed) and dead/unused configuration. None of the findings are
crash-class, but the site-scope gaps are High severity because they are a real
authorization bypass with no workaround.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | + | `HandleResolveRoles` builds `RoleMapper` by hand; `ResolveRolesCommand` is a stale dispatch path. See 008, 011. |
| 2 | Akka.NET conventions | + | `Task.Run` instead of `PipeTo`, no supervision strategy, `object`-typed message payload. See 004, 005, 012. |
| 3 | Concurrency & thread safety | + | Actor is stateless so `Task.Run` does not corrupt state, but it defeats actor-thread serialization (004). `Sender` correctly captured to a local before the closure. |
| 4 | Error handling & resilience | + | Exceptions are caught and mapped uniformly; `SiteScopeViolationException` mapped to `Unauthorized`. Audit-logging consistency issue noted in 009. |
| 5 | Security | + | Site-scope enforcement missing on query/remote/debug paths. See 001, 002, 003. |
| 6 | Performance & resource management | + | `JsonDocument` instances never disposed in the HTTP endpoint. See 006. |
| 7 | Design-document adherence | + | Design doc states remote queries enforce site scoping; code does not. `ManagementServiceOptions` reserved-for-future config is unused. See 001, 010. |
| 8 | Code organization & conventions | + | Mixed serializers (Newtonsoft in actor, System.Text.Json in endpoint); inconsistent audit logging across mutations. See 007, 009. |
| 9 | Testing coverage | + | Authorization is well covered; site-scope enforcement, the HTTP endpoint, `DebugStreamHub`, and remote-query handlers have no tests. See 013. |
| 10 | Documentation & comments | + | XML docs are accurate where present; `ManagementServiceOptions` and `ResolveRolesCommand` paths are undocumented dead code (010, 011). |
## Findings
### ManagementService-001 — Remote-query and debug-snapshot handlers bypass site-scope enforcement
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:1465`, `:1481`, `:1493`, `:641`, `:649` |
**Description**
The design document (`Component-ManagementService.md`, Authorization section) states that for
Deployment users "Site scoping is enforced for site-scoped Deployment users" and lists
"debug snapshot, parked message queries, site event log queries" among the Deployment-role
operations. `HandleQueryEventLogs`, `HandleQueryParkedMessages`, `HandleDebugSnapshot`,
`HandleRetryParkedMessage`, and `HandleDiscardParkedMessage` make no call to `EnforceSiteScope`
or `EnforceSiteScopeForInstance`. A Deployment user scoped to site A can therefore query event
logs / parked messages of site B, retry or discard another site's parked messages, and pull a
debug snapshot of any instance simply by supplying a different `SiteIdentifier` or `InstanceId`.
This is an authorization bypass with no workaround.
**Recommendation**
In each of these handlers resolve the target site and call site-scope enforcement before
delegating to `CommunicationService`. For the `SiteIdentifier`-keyed handlers, look up the
`Site` by identifier and enforce against `Site.Id`; for `DebugSnapshotCommand` the instance
is already loaded — call `EnforceSiteScope(user, instance.SiteId)` (which requires threading
`AuthenticatedUser` into these handlers, currently dropped).
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Threaded `AuthenticatedUser` into
`HandleQueryEventLogs`, `HandleQueryParkedMessages`, `HandleRetryParkedMessage`,
`HandleDiscardParkedMessage`, and `HandleDebugSnapshot`; added an
`EnforceSiteScopeForIdentifier` helper that resolves the site by identifier and applies
`EnforceSiteScope`. `HandleDebugSnapshot` enforces against the already-loaded instance's
`SiteId`. Regression tests: `QueryEventLogs_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`QueryParkedMessages_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`RetryParkedMessage_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`DiscardParkedMessage_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`DebugSnapshot_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`.
### ManagementService-002 — Single-entity query handlers leak data across site scope
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:510`, `:673`, `:733`, `:774`, `:631`, `:624` |
**Description**
`HandleListInstances` and `HandleListSites` correctly filter their results by the user's
`PermittedSiteIds`, but the single-entity query handlers do not. `HandleGetInstance`,
`HandleGetSite`, `HandleListAreas`, and `HandleGetDataConnection` fetch by ID with no
site-scope check, so a site-scoped Deployment user can read any instance, site, area tree,
or data connection by ID even though that site is excluded from their scope. The list
endpoints having a filter while the get-by-id endpoints do not is an inconsistency that
undermines the scoping model. (`HandleGetDeploymentDiff` and `HandleListInstanceAlarmOverrides`
do enforce scope, confirming the omission elsewhere is unintentional.)
**Recommendation**
Apply `EnforceSiteScopeForInstance` in `HandleGetInstance`, and `EnforceSiteScope` against
the resolved site ID in `HandleGetSite`, `HandleListAreas`, and `HandleGetDataConnection`
(for data connections, scope by the connection's `SiteId`).
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `HandleGetInstance`, `HandleGetSite`,
`HandleGetDataConnection` now take `AuthenticatedUser` and call `EnforceSiteScope` against
the resolved entity's site ID (instance `SiteId`, site `Id`, data-connection `SiteId`);
`HandleListAreas` enforces against the requested `SiteId` before querying. Regression tests:
`GetInstance_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`GetInstance_InScopeForSiteScopedUser_ReturnsSuccess`,
`GetSite_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`GetSite_OutOfScopeForAdminUser_ReturnsSuccess`,
`ListAreas_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`,
`GetDataConnection_OutOfScopeForSiteScopedUser_ReturnsUnauthorized`.
### ManagementService-003 — DebugStreamHub.SubscribeInstance performs no per-instance authorization
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/DebugStreamHub.cs:104` |
**Description**
`OnConnectedAsync` authenticates the WebSocket connection and verifies the caller holds the
`Deployment` role, but `SubscribeInstance(int instanceId)` accepts any instance ID and starts
a stream without checking that the authenticated user is scoped to that instance's site. A
site-scoped Deployment user can therefore subscribe to the live debug stream (attribute
values, alarm states) of an instance belonging to a site outside their scope. This is the
streaming equivalent of finding 001/002.
**Recommendation**
Resolve the instance's site inside `SubscribeInstance` and reject the subscription if the
authenticated user's permitted-site set does not include it. The authenticated identity
established in `OnConnectedAsync` must be persisted on the connection (e.g. in
`Context.Items`) so it is available to `SubscribeInstance`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `OnConnectedAsync` now persists the resolved
roles and `PermittedSiteIds` in `Context.Items`. `SubscribeInstance` resolves the
instance's site via `ITemplateEngineRepository` and rejects the subscription (sending
`OnStreamTerminated`) when the new pure `DebugStreamHub.IsInstanceAccessAllowed` check
fails. The check grants access for the Admin role or system-wide Deployment (empty
permitted set) and otherwise requires the instance's site in the permitted set. Regression
tests: `IsInstanceAccessAllowed_SiteScopedUser_OutOfScopeInstance_Denied`,
`IsInstanceAccessAllowed_SiteScopedUser_InScopeInstance_Allowed`,
`IsInstanceAccessAllowed_SystemWideDeployment_AnySiteAllowed`,
`IsInstanceAccessAllowed_AdminRole_BypassesSiteScope`,
`IsInstanceAccessAllowed_AdminRoleCheck_IsCaseInsensitive`.
### ManagementService-004 — Actor offloads work to Task.Run instead of using PipeTo
| | |
|--|--|
| Severity | Medium |
| Category | Akka.NET conventions |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:61` |
**Description**
`HandleEnvelope` runs every command on a thread-pool thread via `Task.Run(async () => ...)`
and replies from inside the continuation. This is the anti-pattern the project's Akka.NET
conventions warn against — the canonical approach is to start the async work and `PipeTo`
its result back to `Self`/`Sender`. Although `Sender` is correctly copied to a local before
the closure, the current code: (a) lets multiple commands execute fully concurrently with no
actor-thread serialization, so the actor provides no ordering or back-pressure guarantees
and is an actor in name only; (b) cannot be paused, supervised, or made to honour a mailbox
bound; (c) is shielded from synchronous faults only because every path is inside the
try/catch — any future code path that throws synchronously before the `Task.Run` body would
escape it.
**Recommendation**
Replace `Task.Run` with a method that returns the `Task` and `PipeTo` the mapped result
(`ManagementSuccess`/`ManagementError`/`ManagementUnauthorized`) back to the captured sender,
mapping faults in the `PipeTo` failure continuation. If genuine parallelism is desired, make
that explicit with a router/dispatcher rather than ad-hoc `Task.Run`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Confirmed: `HandleEnvelope` ran every command via
`Task.Run` and replied from inside the continuation, contrary to the project's PipeTo
convention. Replaced it with a `ProcessCommand` method returning a `Task<object>` and
`PipeTo(sender, success, failure)`; faults are now mapped uniformly in a `MapFault` failure
continuation (`SiteScopeViolationException` -> `ManagementUnauthorized`, otherwise
`ManagementError`), which also unwraps `AggregateException`. Regression test:
`UnknownCommandType_FaultMappedToManagementError`. Existing success/error/unauthorized
mapping tests confirm behaviour is preserved.
### ManagementService-005 — ManagementActor declares no supervision strategy
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:33` |
**Description**
The project conventions call for explicit supervision strategies (Resume for coordinator
actors). `ManagementActor` is a long-lived coordinator-style actor but overrides no
`SupervisorStrategy` and defines no `PreRestart`/`PostRestart` behaviour. In practice it
spawns no children so the default strategy is rarely exercised, but an explicit strategy
should still be declared for clarity and to match the documented convention; it also matters
if children are added later (e.g. if finding 004 introduces worker actors).
**Recommendation**
Add an explicit `protected override SupervisorStrategy SupervisorStrategy()` returning a
Resume-based strategy, consistent with other central coordinator actors.
**Resolution**
_Unresolved._
### ManagementService-006 — JsonDocument instances never disposed in the HTTP endpoint
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementEndpoints.cs:83`, `:112` |
**Description**
`JsonDocument` is `IDisposable` (it rents buffers from a pooled `ArrayPool`). `HandleRequest`
parses the request body into `doc` at line 83 and never disposes it, and line 112
(`JsonDocument.Parse("{}")`) allocates a second document inline that is also never disposed.
Every management HTTP call therefore leaks pooled buffers, increasing GC pressure and pool
churn under load.
**Recommendation**
Wrap the parsed document in `using var doc = ...`. For the empty-payload fallback, avoid
allocating a `JsonDocument` entirely — deserialize from the literal string `"{}"`/an empty
object, or restructure so the fallback path does not parse a throwaway document.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Confirmed: the request `JsonDocument` was never
disposed and the empty-payload path allocated a second throwaway `JsonDocument`. Extracted
request parsing into a testable `ManagementEndpoints.ParseCommand` helper that wraps the
document in `using`; the missing-payload case now deserializes from the `"{}"` literal
string rather than parsing a throwaway document. Regression tests:
`ParseCommand_WithExplicitPayload_DeserializesIntoCommandType`,
`ParseCommand_WithMissingPayload_DeserializesParameterlessCommand`,
`ParseCommand_WithInvalidJson_ReturnsFailure`,
`ParseCommand_WithMissingCommandField_ReturnsFailure`,
`ParseCommand_WithUnknownCommand_ReturnsFailure`.
### ManagementService-007 — Inconsistent and cycle-prone serialization of repository entities
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:67`; `src/ScadaLink.ManagementService/ManagementEndpoints.cs:113` |
**Description**
The actor serializes every command result with `Newtonsoft.Json` (`JsonConvert.SerializeObject`)
while the HTTP endpoint deserializes payloads with `System.Text.Json`. Beyond the
inconsistency, `JsonConvert.SerializeObject` is applied directly to EF-backed entities
returned by repositories (e.g. `Site`, `DataConnection`, `NotificationList` with a
`Recipients` collection, `Template` with children). With default Newtonsoft settings any
bidirectional navigation property produces a `JsonSerializationException` for self-referencing
loops, and even without cycles this serializes lazy/navigation state the CLI does not expect.
**Recommendation**
Standardise on one serializer (the rest of the HTTP path uses `System.Text.Json`). Serialize
explicit DTOs / projections rather than EF entities, or configure
`ReferenceLoopHandling.Ignore` and ignore navigation properties. Verify that handlers
returning rich entity graphs (`HandleGetTemplate`, `HandleUpdateNotificationList`) round-trip
correctly.
**Resolution**
Resolved 2026-05-16 (commit pending). Confirmed: the actor serialized results with
`Newtonsoft.Json` (not even a direct package reference) while the HTTP endpoint uses
`System.Text.Json`. Standardised the actor on `System.Text.Json` via a new
`ManagementActor.SerializeResult` helper using a shared `JsonSerializerOptions` with
`ReferenceHandler.IgnoreCycles` (cycle-safe for EF entity graphs) and camelCase naming
(matches the CLI's case-insensitive deserializer). Removed the `Newtonsoft.Json` import.
Regression tests: `SerializeResult_WithCyclicGraph_DoesNotThrow`,
`SerializeResult_UsesCamelCasePropertyNames`.
### ManagementService-008 — HandleResolveRoles constructs RoleMapper manually instead of via DI
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:285` |
**Description**
Every other handler resolves its collaborators from the scoped `IServiceProvider`.
`HandleResolveRoles` instead does `new RoleMapper(sp.GetRequiredService<ISecurityRepository>())`,
bypassing DI. If `RoleMapper` ever gains a dependency, caching, or options, this hand-built
instance silently diverges from the DI-registered one. It is also inconsistent with
`ManagementEndpoints`, which resolves `RoleMapper` from DI.
**Recommendation**
Resolve `RoleMapper` via `sp.GetRequiredService<RoleMapper>()` like every other dependency.
**Resolution**
_Unresolved._
### ManagementService-009 — Audit logging applied inconsistently across mutating handlers
| | |
|--|--|
| Severity | Low — re-triaged from Medium; the claimed audit gap does not exist (see Description), leaving only an undocumented-convention issue. |
| Category | Code organization & conventions |
| Status | Resolved |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:357`, `:1134`, `:1085`, `:526`, `:1275` |
**Description**
The design doc states "All mutating operations are audit logged." Some handlers call
`AuditAsync` explicitly (`HandleCreateInstance`, `HandleCreateSite`, all repository-direct
external-system/notification/security/area mutations), but the handlers that delegate to a
domain service do **not** — `HandleCreateTemplate`/`HandleUpdateTemplate`/`HandleDeleteTemplate`,
all template-member handlers (`HandleAddAttribute` ... `HandleDeleteComposition`), template-folder
handlers, shared-script handlers, `HandleDeployArtifacts`, `HandleDeployInstance`,
`HandleEnableInstance`/`Disable`/`Delete`, and the instance-binding/override handlers.
**Re-triage (2026-05-16):** the original finding claimed this "creates a real risk of silent
audit gaps for template authoring and deployment operations." That claim was verified against
the actual sources and is **false**. Every domain service the delegating handlers call —
`TemplateService`, `SharedScriptService`, `InstanceService`, `AreaService`, `SiteService`,
`TemplateFolderService`, `DeploymentService`, `ArtifactDeploymentService` — injects
`IAuditService` and calls `LogAsync` on every mutation (`grep` confirms an `_auditService.LogAsync`
call after each `Create`/`Update`/`Delete` in `TemplateService.cs`, `DeploymentService.cs`,
`ArtifactDeploymentService.cs`, etc.). There is therefore no audit gap; if anything, adding
explicit `AuditAsync` to a delegating handler would *double-log*. The genuine issue is purely
organizational: the two-layer split (actor audits repo-direct mutations, services audit their
own) was undocumented, which is what made it look risky. This is a Low-severity
code-organization issue, not a Medium error-handling/resilience defect.
**Recommendation**
Document the chosen contract so the split cannot be misread as a gap. (The original
alternative — moving all auditing into the actor — would require un-auditing eight services
and is not warranted given they already audit correctly.)
**Resolution**
Resolved 2026-05-16 (commit pending). Re-triaged to Low / Code organization after verifying
all eight delegated-to services audit internally — no audit gap exists. Documented the
two-layer audit contract in an XML `<remarks>` block on `ManagementActor.AuditAsync`:
repository-direct mutations call `AuditAsync`; service-delegating handlers must not, because
the services own auditing and a duplicate call would double-log. No behavioural change, so
no new regression test; existing `CreateInstanceCommand_WithDeploymentRole_ReturnsSuccess`
covers the explicit-audit path.
### ManagementService-010 — ManagementServiceOptions.CommandTimeout is defined but never used
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementServiceOptions.cs:5`; `src/ScadaLink.ManagementService/ManagementEndpoints.cs:16` |
**Description**
`ManagementServiceOptions.CommandTimeout` is bound from configuration in
`ServiceCollectionExtensions`, but no code reads it. The HTTP endpoint instead hard-codes
`AskTimeout = TimeSpan.FromSeconds(30)`. The design doc describes the options section as
"Reserved for future configuration — e.g., command timeout overrides", yet a concrete
`CommandTimeout` property already exists and is silently ignored, so an operator who sets it
in `appsettings.json` gets no effect.
**Recommendation**
Either consume `ManagementServiceOptions.CommandTimeout` in `ManagementEndpoints.HandleRequest`
(inject `IOptions<ManagementServiceOptions>`), or remove the property until it is wired up so
configuration cannot be set with no effect.
**Resolution**
_Unresolved._
### ManagementService-011 — ResolveRolesCommand dispatch path is stale dead code
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.ManagementService/ManagementActor.cs:273`, `:283` |
**Description**
The design doc states the HTTP endpoint "collapses the CLI's previous two-step flow
(ResolveRoles + actual command) into a single HTTP round-trip", and indeed `ManagementEndpoints`
performs LDAP auth and role resolution itself before dispatching. The `ResolveRolesCommand`
case in `DispatchCommand` is therefore unreachable from the HTTP path. It remains reachable
only via a raw ClusterClient sender, but a caller able to send `ResolveRolesCommand` could
enumerate role mappings for arbitrary LDAP groups with no role requirement
(`GetRequiredRole` returns null for it) — a minor information-disclosure surface for a path
the design says no longer exists.
**Recommendation**
If the two-step flow is genuinely retired, remove `ResolveRolesCommand`, its handler, and the
class. If it must remain for non-HTTP clients, document why and confirm exposing role-mapping
data unauthenticated is intended.
**Resolution**
_Unresolved._
### ManagementService-012 — ManagementEnvelope carries a loosely-typed object payload
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.Commons/Messages/Management/ManagementEnvelope.cs:7`; `src/ScadaLink.ManagementService/ManagementActor.cs:132` |
**Description**
`ManagementEnvelope.Command` is typed `object`, so the actor relies on a large open-ended
`switch` with a `NotSupportedException` default for unknown types. While the individual
command records are immutable, `object` defeats compile-time exhaustiveness — adding a new
command record produces no compiler signal that `DispatchCommand` (and `GetRequiredRole`)
need updating, and a typo or unregistered command surfaces only as a runtime exception. The
message contract is also harder to evolve safely under the additive-only rule.
**Recommendation**
Introduce a marker interface (e.g. `IManagementCommand`) implemented by every command record
and type the envelope payload as that interface. This documents the contract, lets analyzers
flag unhandled cases, and keeps `ManagementCommandRegistry`'s reflection scan precise.
**Resolution**
_Unresolved._
### ManagementService-013 — No tests for site-scope enforcement, the HTTP endpoint, or DebugStreamHub
| | |
|--|--|
| Severity | Medium |
| Category | Testing coverage |
| Status | Resolved |
| Location | `tests/ScadaLink.ManagementService.Tests/ManagementActorTests.cs:1` |
**Description**
`ManagementActorTests` covers role-based authorization, success/error mapping, and correlation
IDs thoroughly, but several critical paths are untested: (a) site-scope enforcement —
`EnforceSiteScope`/`EnforceSiteScopeForInstance` and `SiteScopeViolationException` -> `Unauthorized`
mapping have no test, which is why the gaps in findings 001/002 went unnoticed; (b)
`ManagementEndpoints` — Basic Auth decoding, malformed-header handling, LDAP/role resolution,
command deserialization, and HTTP status mapping have zero coverage; (c) `DebugStreamHub`
authentication, subscribe/unsubscribe lifecycle, and `ManagementCommandRegistry.Resolve` are
untested. The `Envelope` test helper always passes `Array.Empty<string>()` for permitted
sites, so no test ever exercises a site-scoped user.
**Recommendation**
Add tests that exercise a site-scoped Deployment user against in-scope and out-of-scope
targets for instance and site operations, asserting `ManagementUnauthorized` on violations.
Add `WebApplicationFactory`-based tests for `ManagementEndpoints` covering auth failures,
malformed bodies, unknown commands, and the 200/400/403/401/504 mappings.
**Resolution**
Resolved 2026-05-16 (commit pending). The site-scope and `DebugStreamHub` coverage gaps
were closed by the resolution of findings 001/002/003 (the `ScopedEnvelope` helper plus the
`*_OutOfScopeForSiteScopedUser_ReturnsUnauthorized` tests and `DebugStreamHubTests`). The
remaining HTTP-endpoint gap is now covered by a new `ManagementEndpointsTests.cs` exercising
`ManagementEndpoints.ParseCommand` — command deserialization, malformed JSON, missing
`command` field, and unknown commands. Full `WebApplicationFactory` auth-flow tests were
deliberately not added: `ManagementEndpoints` depends on `LdapAuthService` and live LDAP
infrastructure, so the testable command-parsing/dispatch logic was extracted into the pure
`ParseCommand` helper and covered instead. Tests: `ParseCommand_*` (5),
`SerializeResult_*` (2), `UnknownCommandType_FaultMappedToManagementError`, plus the
pre-existing site-scope and DebugStreamHub suites. `dotnet test` -> 48 passed.
@@ -0,0 +1,418 @@
# Code Review — NotificationService
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.NotificationService` |
| Design doc | `docs/requirements/Component-NotificationService.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 3 |
## Summary
The NotificationService module is small (6 source files) and structurally clean: it
abstracts the SMTP client behind an interface, isolates the OAuth2 token lifecycle,
and integrates with the Store-and-Forward Engine for transient-failure buffering.
However, the review surfaced several substantive defects. The most serious is that
**no Store-and-Forward delivery handler is ever registered for the `Notification`
category** — buffered notifications are persisted but never retried or delivered,
silently losing every notification that hit a transient SMTP failure. Error
classification is fragile (substring matching on exception messages) and is
applied inconsistently between `SendAsync` and `DeliverAsync`. `DeliverAsync` also
contains a resource-management bug that constructs and leaks two SMTP clients per
call. Secondary themes: the `OAuth2TokenService` singleton caches a single token
keyed to no credential identity (incorrect if multiple SMTP configs exist), several
design-doc requirements are unimplemented (connection timeout, max concurrent
connections, TLS `SSL`/`None` modes), and credentials are stored and passed as
plaintext `string` values. Test coverage exercises the happy path and the main
error branches but misses the OAuth2 delivery path, the permanent-classification
fallback in `DeliverAsync`, and concurrency on the token cache.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | Double SMTP client construction; `Auto` socket option for non-TLS; `TimeoutException`/`OperationCanceledException` misclassified. |
| 2 | Akka.NET conventions | ☑ | No actors in this module (`AddNotificationServiceActors` is a no-op); delivery is a plain DI service. No Akka-specific issues. |
| 3 | Concurrency & thread safety | ☑ | `OAuth2TokenService` is a singleton with a shared mutable token cache; double-checked locking present but cache key is wrong (NS-006). |
| 4 | Error handling & resilience | ☑ | Critical: no S&F delivery handler registered for `Notification` (NS-001). Fragile substring error classification (NS-002, NS-003). |
| 5 | Security | ☑ | Credentials handled as plaintext strings; OAuth2 client secret in DB credential blob; no recipient address validation. |
| 6 | Performance & resource management | ☑ | Two `ISmtpClientWrapper` instances created per send, one leaked; connection not pooled; `MaxConcurrentConnections` unenforced. |
| 7 | Design-document adherence | ☑ | Connection timeout, max concurrent connections, and TLS `SSL`/`None` modes from the design doc are not implemented. |
| 8 | Code organization & conventions | ☑ | `SmtpPermanentException` in the wrong file; `SmtpConfiguration` POCO has non-nullable strings with no initializer (compiler-warning risk). |
| 9 | Testing coverage | ☑ | Happy path and main error branches covered; OAuth2 delivery path, `DeliverAsync` permanent fallback, and token-cache concurrency untested. |
| 10 | Documentation & comments | ☑ | XML comment on `DeliverAsync` ("Throws on failure") and the misleading "OAuth2 token refresh if needed" comment do not match behaviour. |
## Findings
### NotificationService-001 — Buffered notifications are never retried (no S&F delivery handler)
| | |
|--|--|
| Severity | Critical |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:96`, `src/ScadaLink.NotificationService/ServiceCollectionExtensions.cs:8` |
**Description**
On a transient SMTP failure the service calls `_storeAndForward.EnqueueAsync(StoreAndForwardCategory.Notification, ...)`. The Store-and-Forward Engine only delivers (immediately or on retry sweep) a category for which a delivery handler has been registered via `StoreAndForwardService.RegisterDeliveryHandler`. A repo-wide search shows the `Notification` category handler is never registered anywhere — `StoreAndForwardCategory.Notification` appears only in this module's `EnqueueAsync` call. As a result, every buffered notification falls into the `RetryMessageAsync` "No delivery handler for category" branch (`StoreAndForwardService.cs:201-204`), which logs a warning and returns without ever delivering or removing the message. Buffered notifications accumulate in SQLite forever and are never sent. This silently loses every notification that hit a transient failure, while `SendAsync` returns `Success=true, WasBuffered=true`, telling the caller the notification is safely queued. This directly violates the design doc's "integrates with the Store-and-Forward Engine for reliable delivery" guarantee.
**Recommendation**
Register a delivery handler for `StoreAndForwardCategory.Notification` during startup that deserializes the buffered payload (`ListName`, `Subject`, `Message`), re-resolves the list/recipients/SMTP config, and re-attempts `DeliverAsync`, returning `true` on success, `false` on permanent failure, and throwing on transient failure. Wire it in `AddNotificationService` or the host bootstrap. Add an integration test covering the buffer-then-retry-then-deliver round trip.
**Resolution**
Resolved 2026-05-16. A delivery handler for `StoreAndForwardCategory.Notification` is now
registered at site startup in `AkkaHostedService`. The handler resolves
`NotificationDeliveryService` in a fresh DI scope and calls the new `DeliverBufferedAsync`,
which re-resolves the list, recipients and SMTP config and re-attempts delivery —
returning `true` on success, `false` (park) on permanent failure or missing
configuration, and throwing on transient failure so the engine retries. `SendAsync` now
buffers with `attemptImmediateDelivery: false` so registering the handler does not send
the notification twice. Regression tests cover the happy path and the list-removed park
path. Fixed by the commit whose message references `NotificationService-001`.
### NotificationService-002 — `TimeoutException`/`OperationCanceledException` misclassified as transient
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:157-167` |
**Description**
`IsTransientSmtpError` treats `OperationCanceledException` (and its subtype `TaskCanceledException`) as a transient SMTP error. When the caller passes a `CancellationToken` that is cancelled — e.g. the Script Execution Actor is stopped, or the script times out — the resulting `OperationCanceledException` is caught by the `catch ... when (IsTransientSmtpError(ex))` clause and the notification is buffered as if SMTP had failed. A deliberate cancellation should propagate, not be silently buffered for retry. The same clause classifies any `IOException` as transient even though `IOException` covers unrelated failures (e.g. a serialization stream error). Additionally, `OperationCanceledException` raised by token cancellation in the OAuth2 path would be miscategorised the same way.
**Recommendation**
Re-throw `OperationCanceledException`/`TaskCanceledException` when `cancellationToken.IsCancellationRequested` is true rather than classifying it as transient. Narrow `IOException` handling to SMTP-specific I/O failures, or rely on MailKit's typed exceptions (`SmtpCommandException`, `SmtpProtocolException`, `ServiceNotConnectedException`) instead of broad base types.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Classification was rewritten around a typed
`ClassifySmtpError` helper: a caller-requested cancellation (`OperationCanceledException`/
`TaskCanceledException` while `cancellationToken.IsCancellationRequested`) now propagates
out of both `SendAsync` and `DeliverAsync` via dedicated `catch` filters instead of being
buffered. The broad `IOException` catch-all was dropped — only MailKit's typed exceptions
plus `SocketException`/`TimeoutException` are treated as transient. Regression tests
`Send_CancellationRequested_PropagatesAndDoesNotBuffer` and
`Send_TaskCanceledException_WithCancellation_Propagates`.
### NotificationService-003 — Error classification by substring matching on exception messages is fragile
| | |
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:144-147`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:163-166` |
**Description**
Transient/permanent classification depends on `ex.Message.Contains("5.")`, `Contains("4.")`, `Contains("550")`, `Contains("421")`, etc. This is unreliable: (a) `Message.Contains("5.")` matches any message containing the literal "5." anywhere — e.g. a host name `smtp5.example.com`, a version string, or a path — producing false permanent classification; (b) `Contains("4.")` likewise matches `"v4.0"` or an IP address octet; (c) MailKit exposes the actual SMTP status code on `SmtpCommandException.StatusCode`, which is the correct, locale-independent source of truth and is being ignored; (d) message text is culture/version-dependent and not part of any stable contract. Misclassification has real consequences: a permanent failure misread as transient floods the S&F buffer (which the design doc explicitly says must be prevented), and a transient failure misread as permanent loses the notification.
**Recommendation**
Classify on MailKit's typed exceptions and `SmtpCommandException.StatusCode` (4xx → transient, 5xx → permanent), and `SocketException`/`SmtpProtocolException`/connection-refused → transient. Remove all `Message.Contains` checks.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). All `ex.Message.Contains(...)` checks were
removed. The new `ClassifySmtpError` helper inspects `SmtpCommandException.StatusCode`
(numeric SMTP code: 4xx → transient, 5xx → permanent) and treats `SmtpProtocolException`,
`ServiceNotConnectedException`, `SocketException` and `TimeoutException` as transient;
anything else is `Unknown` and propagates unclassified rather than being guessed. The
permanent-promotion `catch` block in `DeliverAsync` now keys off this classification.
Regression tests `Send_Smtp5xxCommandException_ClassifiedPermanent`,
`Send_Smtp4xxCommandException_ClassifiedTransientAndBuffered`,
`Send_SmtpProtocolException_ClassifiedTransient`, and
`Send_NonSmtpExceptionWith5xxLookalikeText_NotPromotedToPermanent`.
### NotificationService-004 — `DeliverAsync` constructs two SMTP clients and leaks the used one
| | |
|--|--|
| Severity | High |
| Category | Performance & resource management |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:118-119` |
**Description**
```csharp
using var client = _smtpClientFactory() as IDisposable;
var smtp = _smtpClientFactory();
```
The factory is invoked twice, creating two separate `MailKitSmtpClientWrapper` instances (each owning a real `SmtpClient` with a socket). The first instance is assigned to `client` and disposed by the `using`, but it is never used. The second instance, `smtp`, is the one actually connected, authenticated, used to send, and `DisconnectAsync`'d — but it is never `Dispose`d. `MailKitSmtpClientWrapper` implements `IDisposable` and wraps an unmanaged socket; the connected client is leaked on every send. `DisconnectAsync` closes the connection but does not dispose the `SmtpClient`. Over time this leaks sockets/handles.
**Recommendation**
Create exactly one client and dispose the one that is actually used:
`using var smtp = _smtpClientFactory();` then cast to `IDisposable` only if needed (the factory's `Func<ISmtpClientWrapper>` should ideally return a type that the `using` can dispose directly — consider having `ISmtpClientWrapper` extend `IAsyncDisposable`/`IDisposable`).
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `DeliverAsync` now invokes `_smtpClientFactory()`
exactly once and disposes the client actually used via `using var disposable = smtp as
IDisposable;`. The previous code created two `MailKitSmtpClientWrapper` instances per send
and disposed the unused one while leaking the connected one. Regression test
`Send_CreatesExactlyOneSmtpClient_AndDisposesIt` verifies the factory is invoked once and
the resulting client is disposed.
### NotificationService-005 — Non-TLS path uses `SecureSocketOptions.Auto`, contradicting the requested mode
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:18`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:123` |
**Description**
`ConnectAsync` maps `useTls` to either `SecureSocketOptions.StartTls` or `SecureSocketOptions.Auto`. `useTls` is computed in `DeliverAsync` as `TlsMode == "starttls"`. So a configuration of `TlsMode = "none"` produces `useTls = false` → `SecureSocketOptions.Auto`, which lets MailKit opportunistically negotiate TLS — the opposite of "None". Worse, the design doc defines three TLS modes — `None`, `StartTLS`, `SSL` — but the code collapses them to a single boolean, so `SSL` (implicit TLS, typically port 465) is treated identically to `None`/`Auto` and the SSL mode is effectively unsupported. The `bool useTls` parameter cannot represent the three-state requirement.
**Recommendation**
Pass the `TlsMode` string (or a `TlsMode` enum) through to the wrapper and map explicitly: `None` → `SecureSocketOptions.None`, `StartTLS` → `SecureSocketOptions.StartTls`, `SSL` → `SecureSocketOptions.SslOnConnect`. Validate the configured value and reject unknown modes.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: the
`bool useTls` parameter cannot represent three states, and the non-StartTLS branch used
`SecureSocketOptions.Auto`. A new `SmtpTlsMode` enum (`None`/`StartTls`/`Ssl`) and
`SmtpTlsModeParser` were added; `ISmtpClientWrapper.ConnectAsync` now takes `SmtpTlsMode`
and `MailKitSmtpClientWrapper` maps it explicitly to `SecureSocketOptions.None`/
`StartTls`/`SslOnConnect`. `SendAsync`/`DeliverBufferedAsync` validate the configured
`TlsMode` up front — an unknown value returns a clean `NotificationResult` failure (or
parks a buffered message) instead of silently negotiating TLS. Regression tests:
`Send_TlsModeNone_DoesNotNegotiateTls`, `Send_TlsModeSsl_UsesImplicitSsl`,
`Send_UnknownTlsMode_ReturnsErrorNotSilentFallback`, and the `SmtpTlsModeParserTests` set.
### NotificationService-006 — OAuth2 token cache is keyed to nothing; wrong token returned when multiple SMTP configs exist
| | |
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/OAuth2TokenService.cs:14-15`, `src/ScadaLink.NotificationService/OAuth2TokenService.cs:30-35` |
**Description**
`OAuth2TokenService` is registered as a singleton and stores a single `_cachedToken`/`_tokenExpiry` pair. `GetTokenAsync` ignores the `credentials` argument when deciding whether the cache is valid — it only checks expiry. If two SMTP configurations with different tenant/client credentials are ever used (the repository's `GetAllSmtpConfigurationsAsync` returns a list, implying multiple configs are possible), the second caller receives the first caller's token, which will fail authentication against the second tenant. Even with a single config today this is a latent correctness bug and makes the service's behaviour depend on call order.
**Recommendation**
Key the cache by the credential identity (e.g. a dictionary keyed by `tenantId:clientId`, or by a hash of the credential string), or document and enforce the single-SMTP-config invariant. Given the design doc says one SMTP config is deployed per site, enforcing the invariant is acceptable but should be explicit.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: the singleton held a single
`_cachedToken`/`_tokenExpiry` pair and `GetTokenAsync` ignored the `credentials` argument
when validating the cache, so a second SMTP config got the first config's token.
`OAuth2TokenService` now stores a `ConcurrentDictionary<string, CacheEntry>` keyed by the
SHA-256 hash of the credential string; each distinct tenant/client/secret gets its own
cached token, expiry, and per-credential `SemaphoreSlim` (double-checked locking
preserved). Regression tests: `GetTokenAsync_DifferentCredentials_ReturnPerCredentialTokens`
and `GetTokenAsync_SameCredentials_CachedPerCredential`.
### NotificationService-007 — Connection timeout and max-concurrent-connections from the design doc are not implemented
| | |
|--|--|
| Severity | Medium |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationOptions.cs:11-14`, `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:16-20`, `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:111-140` |
**Description**
The design doc specifies an SMTP "Connection timeout (default 30s)" and "Max concurrent connections (default 5)", and `NotificationOptions`/`SmtpConfiguration` both carry these fields. Neither is enforced: `MailKitSmtpClientWrapper.ConnectAsync` never sets `SmtpClient.Timeout`, so the connection relies on MailKit's default timeout rather than the configured value (only the caller's `CancellationToken` bounds it, and callers may pass `default`). There is no semaphore or other throttle limiting concurrent SMTP connections per site, so `MaxConcurrentConnections` has no effect. Both options exist but are dead configuration.
**Recommendation**
Set `SmtpClient.Timeout` from `ConnectionTimeoutSeconds` in `ConnectAsync` (and/or derive a linked `CancellationTokenSource`). Introduce a `SemaphoreSlim(MaxConcurrentConnections)` gating `DeliverAsync`. If these limits are intentionally deferred, mark the options `[Obsolete]`/document them as not-yet-enforced and note the gap in the design doc.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: `ConnectAsync` never set
`SmtpClient.Timeout` and no throttle gated `DeliverAsync`. `ISmtpClientWrapper.ConnectAsync`
now takes a `connectionTimeoutSeconds` argument; `MailKitSmtpClientWrapper` sets
`SmtpClient.Timeout` from `SmtpConfiguration.ConnectionTimeoutSeconds`. `DeliverAsync`
acquires a lazily-created `SemaphoreSlim` sized to `SmtpConfiguration.MaxConcurrentConnections`
(default 5 when non-positive) and releases it in a `finally`, so concurrent SMTP
deliveries per site are bounded. The timeout is sourced from the deployed
`SmtpConfiguration` rather than `NotificationOptions`; the `NotificationOptions` fields
remain as operational fallback defaults. Regression tests:
`Send_PassesConfiguredConnectionTimeoutToClient` and
`Send_MaxConcurrentConnections_LimitsConcurrentDeliveries`.
### NotificationService-008 — Recipient email addresses are not validated before send
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:136-137`, `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:50-53` |
**Description**
`SendAsync` builds `bccAddresses` directly from `recipient.EmailAddress` and passes them to `MailboxAddress.Parse`. If any recipient row has a malformed address, `MailboxAddress.Parse` throws `ParseException`. `ParseException` is not a `TimeoutException`/`SocketException`/`IOException` and its message will not generally contain "4." or "5.", so it falls through `DeliverAsync`'s outer `catch ... when (... && !IsTransientSmtpError(ex))` filter, which re-throws it (`:153`); it then escapes `SendAsync` entirely as an unhandled exception (the `SendAsync` catch blocks only cover `SmtpPermanentException` and transient errors). A single bad address in a list therefore crashes the send with an exception type the calling script is not told to expect, instead of producing a clean `NotificationResult` error. The same applies to a malformed `FromAddress`.
**Recommendation**
Validate addresses up front (e.g. `MailboxAddress.TryParse`) and return a `NotificationResult(false, ...)` listing invalid recipients, or wrap `DeliverAsync` so any non-classified exception becomes a permanent `NotificationResult` failure rather than escaping. Consider validating addresses at definition time in the Central UI as well.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause confirmed: `MailboxAddress.Parse` of a
malformed `FromAddress`/recipient threw `ParseException`, which is unclassified and
escaped `SendAsync` as an unhandled exception. A new `ValidateAddresses` helper uses
`MailboxAddress.TryParse` for the sender and every recipient; `SendAsync` now returns a
clean `NotificationResult(false, ...)` listing the invalid address(es) before any SMTP
attempt, and `DeliverBufferedAsync` parks a buffered message with a bad address (a fault
retrying cannot fix). Regression tests:
`Send_MalformedRecipientAddress_ReturnsCleanError_DoesNotThrow` and
`Send_MalformedFromAddress_ReturnsCleanError_DoesNotThrow`. Definition-time validation in
the Central UI is a separate component and out of this module's scope.
### NotificationService-009 — Credentials handled as plaintext strings; OAuth2 client secret logged risk
| | |
|--|--|
| Severity | Medium — re-triaged: split into an in-scope log-leak fix (resolved) and a Commons-scoped at-rest-encryption / structured-credential follow-up (NotificationService-013, Deferred). |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:127-134`, `src/ScadaLink.NotificationService/OAuth2TokenService.cs:30-65`, `src/ScadaLink.Commons/Entities/Notifications/SmtpConfiguration.cs:9` |
**Description**
SMTP credentials — Basic Auth `user:pass` and OAuth2 `tenantId:clientId:clientSecret` — are stored and passed as a single colon-delimited plaintext `string` (`SmtpConfiguration.Credentials`). There is no indication the value is encrypted at rest in SQLite or in the central config DB. The colon-delimited packing is also brittle: a password or client secret containing a `:` will be split incorrectly (`Split(':', 2)` / `Split(':', 3)`), silently corrupting the secret. Separately, while the current code does not log the secret directly, the substring-based error classification logs full exception messages (`_logger.LogWarning(ex, ...)`, `LogError(ex, ...)`) and MailKit exceptions can echo back server responses; an authentication failure message could surface credential fragments into logs. There is no defensive scrubbing.
**Recommendation**
Store credentials encrypted at rest (DPAPI/Data Protection or a secret store) and model them as structured fields rather than a colon-packed string, so secrets containing `:` are safe. Ensure credential values are never written to logs; consider a redaction step on exception messages before logging.
**Resolution**
Resolved 2026-05-16 (commit pending). Root cause re-triaged against source: the finding
conflates two concerns with different ownership.
1. **Log-leak risk (in scope — fixed).** The original code logged whole exception objects
(`_logger.LogWarning(ex, ...)` / `LogError(ex, ...)`); MailKit auth exceptions can echo
server responses quoting the supplied credentials. A new internal `CredentialRedactor`
masks every colon-delimited credential component out of any text. `SendAsync` and
`DeliverBufferedAsync` now log a scrubbed message string (not the raw exception) and the
permanent-failure `NotificationResult` is scrubbed before it returns to the caller.
`OAuth2TokenService` logs the tenant id only — never the client secret or access token.
Regression tests: `CredentialRedactorTests` and
`Send_PermanentError_RedactsCredentialFromResultMessage`.
2. **At-rest encryption + structured-credential modelling (out of scope — Deferred).**
Encrypting `SmtpConfiguration.Credentials` at rest and replacing the brittle
colon-packed `string` with structured fields requires editing
`src/ScadaLink.Commons/Entities/Notifications/SmtpConfiguration.cs` and the
ConfigurationDatabase EF layer — both outside this module. Tracked separately as
**NotificationService-013** (Deferred) so it is not lost.
### NotificationService-013 — Encrypt SMTP credentials at rest; replace colon-packed string with structured fields
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Deferred |
| Location | `src/ScadaLink.Commons/Entities/Notifications/SmtpConfiguration.cs:9`, ConfigurationDatabase EF mapping |
**Description**
Split out of NotificationService-009. `SmtpConfiguration.Credentials` packs Basic Auth
`user:pass` and OAuth2 `tenantId:clientId:clientSecret` into a single plaintext
colon-delimited `string`: (a) there is no encryption at rest in SQLite or the central
config DB; (b) a password or client secret containing a `:` is split incorrectly by
`Split(':', 2)` / `Split(':', 3)`, silently corrupting the secret.
**Recommendation**
Model credentials as structured fields (or an encrypted blob) on the Commons entity and
encrypt at rest via Data Protection / a secret store. The colon-delimited parsing in
`MailKitSmtpClientWrapper` and `OAuth2TokenService` would then consume the structured
fields directly.
**Resolution**
Deferred — requires changes to `src/ScadaLink.Commons` and the ConfigurationDatabase
component, which are outside the NotificationService module. To be addressed in a
Commons/ConfigurationDatabase-scoped change. The associated log-leak risk is resolved
under NotificationService-009.
### NotificationService-010 — `DeliverAsync` does not disconnect the SMTP client on failure
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:121-154` |
**Description**
`DisconnectAsync` is only called at `:139`, on the success path inside the `try` block. If `AuthenticateAsync` or `SendAsync` throws, control jumps to the `catch` filter at `:141` and the method exits (re-throwing or wrapping) without ever calling `DisconnectAsync`. Combined with NotificationService-004 (the client is never disposed either), a failed send leaves an open, authenticated SMTP connection until the socket is eventually reclaimed by finalization. Under sustained transient failures this can exhaust the SMTP server's connection slots.
**Recommendation**
Move disconnect/dispose into a `finally` block (or use `await using` once `ISmtpClientWrapper` supports `IAsyncDisposable`) so the connection is always torn down regardless of outcome.
**Resolution**
_Unresolved._
### NotificationService-011 — `SmtpPermanentException` declared in the wrong file; module conventions
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:173-177`, `src/ScadaLink.Commons/Entities/Notifications/SmtpConfiguration.cs:5-15` |
**Description**
Two minor convention issues. (1) `SmtpPermanentException` is a public exception type declared at the bottom of `NotificationDeliveryService.cs` rather than in its own file (`SmtpPermanentException.cs`), which is inconsistent with the one-type-per-file layout used elsewhere and makes it harder to locate. (2) `SmtpConfiguration` (a Commons POCO) declares non-nullable `string` properties (`Host`, `AuthType`, `FromAddress`) that are only guaranteed by the constructor; EF Core materialization or object-initializer use can leave them null while the type system says otherwise. These are persistence-ignorant POCO concerns but worth flagging because the delivery service dereferences `config.Host`, `config.AuthType`, `config.FromAddress` without null checks.
**Recommendation**
Move `SmtpPermanentException` to its own file. For `SmtpConfiguration`, either keep the constructor as the only path and document it, or use `required` members so the compiler enforces initialization.
**Resolution**
_Unresolved._
### NotificationService-012 — Test coverage gaps: OAuth2 delivery path, permanent-classification fallback, token-cache concurrency
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.NotificationService.Tests/NotificationDeliveryServiceTests.cs`, `tests/ScadaLink.NotificationService.Tests/OAuth2TokenServiceTests.cs` |
**Description**
The tests cover the happy path, list-not-found, no-recipients, no-SMTP-config, permanent failure, transient-without-S&F, and transient-with-S&F buffering. Notable untested paths: (1) the OAuth2 delivery branch in `DeliverAsync:128-132` — every test uses `tokenService: null` and Basic Auth, so OAuth2 token resolution during a send is never exercised; (2) `DeliverAsync`'s permanent-classification fallback (`:144-149`) that promotes a generic exception whose message contains "550"/"553"/"554" to `SmtpPermanentException` is never tested; (3) `OAuth2TokenServiceTests` never tests concurrent `GetTokenAsync` calls (the double-checked-locking path) or token expiry/refresh — the cache test uses a 3600s token so refresh never triggers; (4) no test covers the transient-with-S&F path actually delivering after retry (which would also have caught NotificationService-001). Given NotificationService-001 is a critical defect, the absence of an end-to-end buffer-and-retry test is significant.
**Recommendation**
Add tests for: OAuth2-authenticated send with a mocked `OAuth2TokenService`; the `DeliverAsync` 5xx-message permanent fallback; token expiry/refresh (short `expires_in`); concurrent token acquisition; and an end-to-end buffered-notification retry once a `Notification` S&F handler is registered.
**Resolution**
_Unresolved._
+210
View File
@@ -0,0 +1,210 @@
# Code Reviews
Comprehensive, per-module code reviews of the ScadaLink codebase. Each module (one
buildable project under `src/`) has its own folder containing a `findings.md`. This
README is the aggregated index — the single place to see all outstanding work.
> Generated by `regen-readme.py` from the per-module `findings.md` files. Do not
> edit by hand — edit the findings files and re-run the script.
## How it works
- Reviews are performed one module at a time against a fixed checklist.
- Every finding is recorded in the module's `findings.md` with a severity and status.
- Findings are **never deleted** — they are closed by changing their status, keeping
a full audit trail.
- This README aggregates every **pending** finding (`Open` / `In Progress`) across all
modules.
See **[REVIEW-PROCESS.md](REVIEW-PROCESS.md)** for the full procedure: the review
checklist, severity definitions, finding format, and how to mark items resolved.
## Layout
```
code-reviews/
├── README.md # this file — process overview + pending findings
├── REVIEW-PROCESS.md # how to perform a review and track findings
├── regen-readme.py # regenerates this README from the findings files
├── _template/findings.md # copy-this template for a module review
└── <Module>/findings.md # one folder per src/ project
```
## Baseline review — 2026-05-16
All 19 modules were reviewed at commit `9c60592` (241 findings: 6 Critical, 46 High,
100 Medium, 89 Low). The tables below track what remains **open** as findings are
resolved and re-triaged; findings discovered after the baseline are appended to their
module file and counted in **Total**.
| Severity | Open findings |
|----------|---------------|
| Critical | 0 |
| High | 0 |
| Medium | 25 |
| Low | 90 |
| **Total** | **115** |
## Module Status
| Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total |
|--------|---------------|--------|----------------|------|-------|
| [CLI](CLI/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/6 | 6 | 13 |
| [CentralUI](CentralUI/findings.md) | 2026-05-16 | `9c60592` | 0/0/2/5 | 7 | 19 |
| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/3 | 3 | 8 |
| [Commons](Commons/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/8 | 8 | 12 |
| [Communication](Communication/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/3 | 3 | 11 |
| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/6 | 6 | 11 |
| [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/2 | 2 | 13 |
| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/5 | 5 | 14 |
| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/4 | 4 | 14 |
| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/5 | 5 | 12 |
| [Host](Host/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/7 | 8 | 11 |
| [InboundAPI](InboundAPI/findings.md) | 2026-05-16 | `9c60592` | 0/0/1/5 | 6 | 13 |
| [ManagementService](ManagementService/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/5 | 5 | 13 |
| [NotificationService](NotificationService/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/3 | 3 | 13 |
| [Security](Security/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/4 | 4 | 11 |
| [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-16 | `9c60592` | 0/0/4/3 | 7 | 11 |
| [SiteRuntime](SiteRuntime/findings.md) | 2026-05-16 | `9c60592` | 0/0/8/5 | 13 | 16 |
| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-16 | `9c60592` | 0/0/4/7 | 11 | 14 |
| [TemplateEngine](TemplateEngine/findings.md) | 2026-05-16 | `9c60592` | 0/0/5/4 | 9 | 14 |
## Pending Findings
Every `Open` / `In Progress` finding across all modules, highest severity first.
Resolved findings drop off this list but remain recorded in their module's
`findings.md` (see [REVIEW-PROCESS.md](REVIEW-PROCESS.md) §4–§5). Full detail —
description, location, recommendation — lives in the module's `findings.md`.
### Critical (0)
_None open._
### High (0)
_None open._
### Medium (25)
| ID | Module | Title |
|----|--------|-------|
| CentralUI-005 | [CentralUI](CentralUI/findings.md) | Session expiry implementation diverges from the documented policy |
| CentralUI-006 | [CentralUI](CentralUI/findings.md) | Deployment status page polls every 10s despite the documented SignalR-push design |
| Host-002 | [Host](Host/findings.md) | Akka.Persistence required by REQ-HOST-6 is not configured and not used |
| InboundAPI-007 | [InboundAPI](InboundAPI/findings.md) | `Database.Connection()` script API from the design doc is not implemented |
| SiteEventLogging-005 | [SiteEventLogging](SiteEventLogging/findings.md) | `LogEventAsync` performs synchronous disk I/O on the caller's thread |
| SiteEventLogging-007 | [SiteEventLogging](SiteEventLogging/findings.md) | `ISiteEventLogger` consumers downcast to the concrete type and reach into the DB connection |
| SiteEventLogging-008 | [SiteEventLogging](SiteEventLogging/findings.md) | Event-recording write failures are silently swallowed |
| SiteEventLogging-010 | [SiteEventLogging](SiteEventLogging/findings.md) | Test coverage gaps: actor bridge, purge/write concurrency, vacuum effectiveness, query error path |
| SiteRuntime-004 | [SiteRuntime](SiteRuntime/findings.md) | `_totalDeployedCount` is incremented on redeployment of an existing instance |
| SiteRuntime-005 | [SiteRuntime](SiteRuntime/findings.md) | Deployment reports `Success` to central before persistence completes |
| SiteRuntime-006 | [SiteRuntime](SiteRuntime/findings.md) | Site-local repositories read `SiteStorageService` private field via reflection |
| SiteRuntime-007 | [SiteRuntime](SiteRuntime/findings.md) | Synthetic entity IDs use the non-deterministic `string.GetHashCode()` |
| SiteRuntime-008 | [SiteRuntime](SiteRuntime/findings.md) | Blocking `.GetAwaiter().GetResult()` on the actor thread during startup |
| SiteRuntime-009 | [SiteRuntime](SiteRuntime/findings.md) | Script execution actors run scripts on the default thread pool, not a dedicated dispatcher |
| SiteRuntime-010 | [SiteRuntime](SiteRuntime/findings.md) | `EnsureDclConnections` never updates a connection whose configuration changed |
| SiteRuntime-011 | [SiteRuntime](SiteRuntime/findings.md) | Trust-model validation is a substring scan and is both over- and under-inclusive |
| StoreAndForward-004 | [StoreAndForward](StoreAndForward/findings.md) | `RegisterDeliveryHandler` XML doc contradicts the implemented contract |
| StoreAndForward-005 | [StoreAndForward](StoreAndForward/findings.md) | Parked-message retry/discard can race with the in-progress retry sweep |
| StoreAndForward-010 | [StoreAndForward](StoreAndForward/findings.md) | Retry of a parked message does not reset `LastAttemptAt`, so its retry timing is unspecified |
| StoreAndForward-013 | [StoreAndForward](StoreAndForward/findings.md) | Critical paths lack test coverage: retry-due timing, replication-from-active, and the actor bridge |
| TemplateEngine-006 | [TemplateEngine](TemplateEngine/findings.md) | Forbidden-API enforcement is a naive substring scan (bypassable and false-positive prone) |
| TemplateEngine-007 | [TemplateEngine](TemplateEngine/findings.md) | Brace-balance "compilation" misjudges verbatim / interpolated / raw strings |
| TemplateEngine-008 | [TemplateEngine](TemplateEngine/findings.md) | `SetAlarmOverrideAsync` accepts overrides for unknown / composed alarms with no validation |
| TemplateEngine-009 | [TemplateEngine](TemplateEngine/findings.md) | N+1 query in `TemplateDeletionService.CanDeleteTemplateAsync` |
| TemplateEngine-010 | [TemplateEngine](TemplateEngine/findings.md) | `InstanceService` documents optimistic concurrency that is not implemented |
### Low (90)
| ID | Module | Title |
|----|--------|-------|
| CLI-008 | [CLI](CLI/findings.md) | `--format` value is not validated |
| CLI-009 | [CLI](CLI/findings.md) | Exit-code documentation does not match `HandleResponse` behaviour |
| CLI-010 | [CLI](CLI/findings.md) | `debug stream` reports Ctrl+C during connect as a connection failure |
| CLI-011 | [CLI](CLI/findings.md) | `CancellationTokenSource` in `debug stream` is never disposed |
| CLI-012 | [CLI](CLI/findings.md) | `debug stream` exit code is unreliable after stream termination |
| CLI-013 | [CLI](CLI/findings.md) | HTTP client, `debug stream`, and JSON-argument parsing are untested |
| CentralUI-015 | [CentralUI](CentralUI/findings.md) | `DialogService` continuations resolve off the render thread |
| CentralUI-016 | [CentralUI](CentralUI/findings.md) | Pagers render one button per page with no windowing |
| CentralUI-017 | [CentralUI](CentralUI/findings.md) | `/auth/logout` POST disables antiforgery, enabling logout CSRF |
| CentralUI-018 | [CentralUI](CentralUI/findings.md) | Broad `catch {}` blocks swallow JS interop and storage errors silently |
| CentralUI-019 | [CentralUI](CentralUI/findings.md) | Sparse unit-test coverage for a large module; critical paths untested |
| ClusterInfrastructure-005 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | No configuration section name constant for the Options pattern binding |
| ClusterInfrastructure-007 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | ClusterOptions lacks XML documentation comments |
| ClusterInfrastructure-008 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | "Phase 0 skeleton" status is undocumented at the module level |
| Commons-005 | [Commons](Commons/findings.md) | `OpcUaEndpointConfigSerializer.Deserialize` discards malformed legacy input and over-reports `IsLegacy` |
| Commons-006 | [Commons](Commons/findings.md) | `DynamicJsonElement.TryConvert` reports success for unconvertible target types |
| Commons-007 | [Commons](Commons/findings.md) | Several Commons types carry non-trivial logic, stretching REQ-COM-6 |
| Commons-008 | [Commons](Commons/findings.md) | `SetConnectionBindingsCommand` uses `ValueTuple` in a wire message contract |
| Commons-009 | [Commons](Commons/findings.md) | `Component-Commons.md` is stale relative to the actual file set |
| Commons-010 | [Commons](Commons/findings.md) | Behavior-bearing Commons types have no unit tests |
| Commons-011 | [Commons](Commons/findings.md) | `Result<T>.Failure` accepts a null error string |
| Commons-012 | [Commons](Commons/findings.md) | `ValueFormatter` uses current-culture formatting without documenting it |
| Communication-009 | [Communication](Communication/findings.md) | `_siteClients` field is mutable and reassignable; cache update is not atomic on failure |
| Communication-010 | [Communication](Communication/findings.md) | `DebugStreamBridgeActor` XML doc incorrectly describes it as a "Persistent actor" |
| Communication-011 | [Communication](Communication/findings.md) | No test coverage for snapshot-timeout cleanup, address-cache failure, or gRPC reconnect leak |
| ConfigurationDatabase-005 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Audit `Id` type disagrees with the design doc |
| ConfigurationDatabase-006 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `Site.GrpcNodeAAddress` / `GrpcNodeBAddress` columns are unbounded |
| ConfigurationDatabase-008 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `GetApprovedKeysForMethodAsync` CSV parsing silently drops malformed ids |
| ConfigurationDatabase-009 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Multi-collection eager loads issue cartesian-product queries |
| ConfigurationDatabase-010 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Several repositories and `InstanceLocator` lack direct test coverage |
| ConfigurationDatabase-011 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Inconsistent constructor null-guarding across repositories/services |
| DataConnectionLayer-008 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `HandleUnsubscribe` is O(n^2) over instances and rechecks `_unresolvedTags` redundantly |
| DataConnectionLayer-013 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Misleading XML comment: `RaiseDisconnected` claims thread safety it does not provide |
| DeploymentManager-009 | [DeploymentManager](DeploymentManager/findings.md) | Misleading timeout comment on `DeleteInstanceAsync` |
| DeploymentManager-010 | [DeploymentManager](DeploymentManager/findings.md) | `SystemArtifactDeploymentRecord` does not persist the deployment ID |
| DeploymentManager-012 | [DeploymentManager](DeploymentManager/findings.md) | `LifecycleCommandTimeout` option is dead code |
| DeploymentManager-013 | [DeploymentManager](DeploymentManager/findings.md) | SMTP credentials serialized and broadcast to all sites |
| DeploymentManager-014 | [DeploymentManager](DeploymentManager/findings.md) | Dead `CreateCommand` helper in artifact tests |
| ExternalSystemGateway-011 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Every call performs a full repository scan of all systems and methods |
| ExternalSystemGateway-012 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Permanent-failure logging requirement is not met; `_logger` is injected but unused |
| ExternalSystemGateway-013 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `MaxConcurrentConnectionsPerSystem` and `DefaultHttpTimeout` options are defined but never used |
| ExternalSystemGateway-014 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Cached-call buffering path and `DatabaseGateway` are untested |
| HealthMonitoring-004 | [HealthMonitoring](HealthMonitoring/findings.md) | Inconsistent heartbeat interval described across XML docs |
| HealthMonitoring-006 | [HealthMonitoring](HealthMonitoring/findings.md) | Sequence seeding contradicts the doc's "starting at 1" wording and is untestable |
| HealthMonitoring-010 | [HealthMonitoring](HealthMonitoring/findings.md) | `HealthReportSender` silently swallows inner failures with bare `catch {}` |
| HealthMonitoring-011 | [HealthMonitoring](HealthMonitoring/findings.md) | `AddHealthMonitoringActors` is a dead no-op placeholder |
| HealthMonitoring-012 | [HealthMonitoring](HealthMonitoring/findings.md) | `SiteHealthState.LatestReport` initialized to `null!`, misrepresenting the contract |
| Host-005 | [Host](Host/findings.md) | Blocking sync-over-async (`GetAwaiter().GetResult()`) inside `StartAsync` |
| Host-006 | [Host](Host/findings.md) | HOCON assembled by unescaped string interpolation |
| Host-007 | [Host](Host/findings.md) | REQ-HOST-4 rule "GrpcPort ≠ RemotingPort" is not enforced |
| Host-008 | [Host](Host/findings.md) | `MachineDataDb` is validated and declared but never consumed |
| Host-009 | [Host](Host/findings.md) | `StartAsync` reports success before role actors are confirmed running |
| Host-010 | [Host](Host/findings.md) | No retry/backoff around startup preconditions (DB migration, readiness) |
| Host-011 | [Host](Host/findings.md) | `LoggingOptions.MinimumLevel` is dead configuration |
| InboundAPI-009 | [InboundAPI](InboundAPI/findings.md) | Failed compilation is retried on every subsequent request |
| InboundAPI-010 | [InboundAPI](InboundAPI/findings.md) | `ParameterValidator` ignores extra body fields and cannot validate Object/List element types |
| InboundAPI-011 | [InboundAPI](InboundAPI/findings.md) | Method-existence check leaks to unapproved callers (enumeration oracle) |
| InboundAPI-012 | [InboundAPI](InboundAPI/findings.md) | `ParameterDefinition` POCO declared in the component project, not Commons |
| InboundAPI-013 | [InboundAPI](InboundAPI/findings.md) | `ApiKeyValidationResult.NotFound` factory returns HTTP 400, contradicting its name |
| ManagementService-005 | [ManagementService](ManagementService/findings.md) | ManagementActor declares no supervision strategy |
| ManagementService-008 | [ManagementService](ManagementService/findings.md) | HandleResolveRoles constructs RoleMapper manually instead of via DI |
| ManagementService-010 | [ManagementService](ManagementService/findings.md) | ManagementServiceOptions.CommandTimeout is defined but never used |
| ManagementService-011 | [ManagementService](ManagementService/findings.md) | ResolveRolesCommand dispatch path is stale dead code |
| ManagementService-012 | [ManagementService](ManagementService/findings.md) | ManagementEnvelope carries a loosely-typed object payload |
| NotificationService-010 | [NotificationService](NotificationService/findings.md) | `DeliverAsync` does not disconnect the SMTP client on failure |
| NotificationService-011 | [NotificationService](NotificationService/findings.md) | `SmtpPermanentException` declared in the wrong file; module conventions |
| NotificationService-012 | [NotificationService](NotificationService/findings.md) | Test coverage gaps: OAuth2 delivery path, permanent-classification fallback, token-cache concurrency |
| Security-008 | [Security](Security/findings.md) | N+1 query loading site-scope rules in `RoleMapper` |
| Security-009 | [Security](Security/findings.md) | CancellationToken not honored inside `Task.Run` LDAP calls |
| Security-010 | [Security](Security/findings.md) | Design doc contradicts itself on Windows Integrated Authentication |
| Security-011 | [Security](Security/findings.md) | Missing tests for security-critical paths |
| SiteEventLogging-006 | [SiteEventLogging](SiteEventLogging/findings.md) | Missing indexes for severity and keyword-search query paths |
| SiteEventLogging-009 | [SiteEventLogging](SiteEventLogging/findings.md) | XML doc on `LogEventAsync` claims asynchronous behaviour |
| SiteEventLogging-011 | [SiteEventLogging](SiteEventLogging/findings.md) | Stale "Phase 4+" placeholder in `ServiceCollectionExtensions` |
| SiteRuntime-012 | [SiteRuntime](SiteRuntime/findings.md) | `AttributeAccessor`/`ScopeAccessors` block the script on a synchronous Ask |
| SiteRuntime-013 | [SiteRuntime](SiteRuntime/findings.md) | `HandleUnsubscribeDebugView` does nothing despite documented behaviour |
| SiteRuntime-014 | [SiteRuntime](SiteRuntime/findings.md) | Trigger-expression evaluation blocks the coordinator actor thread |
| SiteRuntime-015 | [SiteRuntime](SiteRuntime/findings.md) | `LoggerFactory` created per Instance Actor and never disposed |
| SiteRuntime-016 | [SiteRuntime](SiteRuntime/findings.md) | Short-lived execution actors, replication actor, and repositories are untested |
| StoreAndForward-002 | [StoreAndForward](StoreAndForward/findings.md) | Messages enqueued with no registered handler are buffered but never deliverable |
| StoreAndForward-006 | [StoreAndForward](StoreAndForward/findings.md) | `GetParkedMessagesAsync` count and page run without a transaction |
| StoreAndForward-007 | [StoreAndForward](StoreAndForward/findings.md) | Async work in `ParkedMessageHandlerActor` uses `ContinueWith` without scheduler/affinity guarantees |
| StoreAndForward-008 | [StoreAndForward](StoreAndForward/findings.md) | A SQLite connection is opened and torn down on every storage call |
| StoreAndForward-009 | [StoreAndForward](StoreAndForward/findings.md) | `OnActivity` event invocation is not thread-safe against concurrent subscribe/unsubscribe |
| StoreAndForward-011 | [StoreAndForward](StoreAndForward/findings.md) | `StoreAndForwardMessageStatus.InFlight` is unused and the doc's "retrying" status is unmodelled |
| StoreAndForward-012 | [StoreAndForward](StoreAndForward/findings.md) | `StoreAndForwardMessage` is a persistence entity but lives in the component, not Commons |
| TemplateEngine-011 | [TemplateEngine](TemplateEngine/findings.md) | `SortedPropertiesConverterFactory` is dead code with a misleading comment |
| TemplateEngine-012 | [TemplateEngine](TemplateEngine/findings.md) | `DataType` enum naming diverges from the design doc |
| TemplateEngine-013 | [TemplateEngine](TemplateEngine/findings.md) | `ToDictionary(t => t.Id)` throws on duplicate IDs; cycle detectors overload Id 0 as a sentinel |
| TemplateEngine-014 | [TemplateEngine](TemplateEngine/findings.md) | Template-deletion constraint logic is duplicated and divergent |
+113
View File
@@ -0,0 +1,113 @@
# Code Review Process
This document describes how to perform a comprehensive, per-module code review of
the ScadaLink codebase and how to track findings to resolution.
A **module** is one buildable project under `src/` (e.g. `src/ScadaLink.TemplateEngine`).
Each module has its own folder under `code-reviews/` containing a single `findings.md`.
## 1. Before you start
1. Pick the module to review. Its folder is `code-reviews/<Module>/` where `<Module>`
is the project name with the `ScadaLink.` prefix stripped.
2. Identify the design context for the module:
- Its component design doc: `docs/requirements/Component-<Name>.md`.
- The relevant **Key Design Decisions** in `CLAUDE.md`.
- `docs/requirements/HighLevelReqs.md` for cross-cutting requirements.
3. Record the exact commit being reviewed: `git rev-parse --short HEAD`. Every review
is a snapshot — a finding only means something relative to a known commit.
4. Open `code-reviews/<Module>/findings.md` and fill in the header table
(reviewer, date, commit SHA).
## 2. Review checklist
Work through **every** category below for the module. A comprehensive review means
the checklist is completed even where it produces no findings — record "No issues
found" for a category rather than leaving it ambiguous.
1. **Correctness & logic bugs** — off-by-one, null handling, incorrect conditionals,
misuse of APIs, broken edge cases.
2. **Akka.NET conventions** — supervision strategies (Resume for coordinators, Stop
for short-lived actors), `Tell` for hot paths / `Ask` only at system boundaries,
message immutability, no blocking on non-blocking dispatchers, no `sender`/`this`
captured in closures (`PipeTo` instead), correlation IDs on request/response.
3. **Concurrency & thread safety** — shared mutable state, actor state mutated only
on the actor thread, race conditions, correct use of async/await.
4. **Error handling & resilience** — exception paths, store-and-forward integration,
reconnect/retry logic, failover behaviour, transient vs permanent error
classification, graceful degradation.
5. **Security** — authentication/authorization checks, input validation, the script
trust model (forbidden APIs: `System.IO`, `Process`, `Threading`, `Reflection`,
raw network), secret handling, SQL/LDAP injection, logging of sensitive data.
6. **Performance & resource management** — `IDisposable` disposal, stream/connection
lifetimes, buffering and back-pressure, unnecessary allocations, N+1 queries.
7. **Design-document adherence** — does the code match `Component-<Name>.md` and the
relevant CLAUDE.md decisions? Flag both code that drifts from the design and design
docs that are now stale.
8. **Code organization & conventions** — persistence-ignorant POCO entities in
Commons, repository interfaces in Commons / implementations in ConfigurationDatabase,
namespace hierarchy, Options pattern (options classes owned by component projects),
additive-only message contract evolution.
9. **Testing coverage** — are the module's behaviours covered by tests in `tests/`?
Note untested critical paths and missing edge-case tests.
10. **Documentation & comments** — XML doc accuracy, misleading or stale comments,
undocumented non-obvious behaviour.
## 3. Recording findings
Add one entry per finding to the `## Findings` section of the module's `findings.md`,
using the entry format in [`_template/findings.md`](_template/findings.md).
- **Finding ID** — `<Module>-NNN`, numbered sequentially within the module and never
reused (e.g. `TemplateEngine-001`). IDs are permanent even after resolution.
- **Severity:**
- **Critical** — data loss, security breach, crash/deadlock, or cluster-wide outage.
- **High** — incorrect behaviour with significant impact; no safe workaround.
- **Medium** — incorrect or risky behaviour with limited impact or a workaround.
- **Low** — minor issues, style, maintainability, documentation.
- **Category** — one of the 10 checklist categories above.
- **Location** — `file:line` (clickable), or a list of locations.
- **Description** — what is wrong and why it matters.
- **Recommendation** — concrete suggested fix.
After recording findings, update the module header table (status, open-finding count)
and refresh the base README (see section 5).
## 4. Marking an item resolved
Findings are **never deleted** — they are an audit trail. To close one, change its
**Status** and complete the **Resolution** field:
- `Open` — newly recorded, not yet addressed.
- `In Progress` — a fix is actively being worked on.
- `Resolved` — fixed. The Resolution field must state the fixing commit SHA, the
date, and a one-line description of the fix.
- `Won't Fix` — intentionally not fixed. The Resolution field must justify why.
- `Deferred` — valid but postponed. The Resolution field must say what it is waiting
on (e.g. a tracked issue or a later milestone).
`Resolved`, `Won't Fix`, and `Deferred` findings are all considered **closed** and
drop off the base README's pending list. `Open` and `In Progress` are **pending**.
## 5. Updating the base README
`code-reviews/README.md` holds the single cross-module view (process overview, the
Pending Findings tables, and the Module Status table). It is **generated** from the
per-module `findings.md` files — do not edit it by hand.
After any review or status change, regenerate it:
```
python3 code-reviews/regen-readme.py
```
`regen-readme.py --check` exits non-zero if `README.md` is stale, for use in CI.
The per-module `findings.md` files are the source of truth; `README.md` is the
aggregated index and must always agree with them — which the script guarantees.
## 6. Re-reviewing a module
Re-reviews append to the same `findings.md`. Update the header to the new commit and
date, continue the finding numbering from the last used ID, and leave prior findings
(including closed ones) in place as history.
+420
View File
@@ -0,0 +1,420 @@
# Code Review — Security
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.Security` |
| Design doc | `docs/requirements/Component-Security.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 4 |
## Summary
The Security module is small and reasonably structured: a stateless `LdapAuthService`
for search-then-bind authentication, a `JwtTokenService` for HMAC-signed cookie tokens,
a `RoleMapper` that resolves LDAP groups to roles, and ASP.NET Core authorization
policies plus a site-scope handler. Unit-test coverage of the happy paths is decent.
However, the review surfaced several real security weaknesses, the most serious being
that **StartTLS is dead code** (the design's "LDAPS or StartTLS" requirement is only
half met), that **the authentication cookie is not marked `Secure`** despite the design
mandating it, and that **the JWT signing key is never length-validated** so a weak or
empty key is silently accepted. There is also a genuine **DN-injection** gap in the
no-service-account fallback path, a filter/DN attribute mismatch (`uid=` vs `cn=`) that
makes that fallback path internally inconsistent, and an N+1 query in `RoleMapper`.
JWT validation also disables issuer/audience checks and the idle-timeout claim is reset
on every refresh, weakening the documented 30-minute idle policy. None of these are
crash/data-loss bugs, but the TLS, cookie, and key-validation items are security
defects that should be fixed before any production deployment.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | `uid=`/`cn=` attribute mismatch between search filter and fallback DN construction (Security-004); StartTLS branch is unreachable (Security-001). |
| 2 | Akka.NET conventions | ☑ | No actors in this module — `AddSecurityActors` is an empty placeholder. Nothing to assess. |
| 3 | Concurrency & thread safety | ☑ | Services are stateless and DI-scoped; LDAP sync calls wrapped in `Task.Run`. No shared mutable state. No issues found. |
| 4 | Error handling & resilience | ☑ | LDAP failure paths return structured `LdapAuthResult`; group-lookup failure is tolerated per design. `ct` not honored inside `Task.Run` bodies (Security-009). |
| 5 | Security | ☑ | StartTLS dead code (Security-001), cookie not `Secure` (Security-002), JWT key unvalidated (Security-003), DN injection (Security-005), no issuer/audience validation (Security-006), idle-timeout reset on refresh (Security-007). |
| 6 | Performance & resource management | ☑ | N+1 scope-rule query in `RoleMapper` (Security-008). `LdapConnection` correctly disposed via `using`. |
| 7 | Design-document adherence | ☑ | StartTLS unsupported and Secure cookie missing both contradict the design doc; design also says "Windows Integrated Authentication" in Responsibilities, contradicting its own Authentication section (Security-010). |
| 8 | Code organization & conventions | ☑ | `SecurityOptions` correctly owned by the component; repository interface in Commons. No issues found. |
| 9 | Testing coverage | ☑ | No tests for `RoleMapper` N+1 behavior, DN-injection inputs, StartTLS path, or idle-timeout-after-refresh. Insecure-config combinations under-tested (Security-011). |
| 10 | Documentation & comments | ☑ | `SecurityOptions` XML docs say direct bind uses `cn={username}` while the search filter uses `uid=` — comment is misleading (covered under Security-004). |
## Findings
### Security-001 — StartTLS upgrade path is unreachable dead code
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:37-47` |
**Description**
When `LdapUseTls` is true the code sets `connection.SecureSocketLayer = true` (LDAPS).
The subsequent StartTLS block is guarded by `if (_options.LdapUseTls && !connection.SecureSocketLayer)`.
Because `SecureSocketLayer` was just set to `true`, the second condition `!connection.SecureSocketLayer`
is always false, so `connection.StartTls()` is never called. The design doc explicitly
states LDAP connections must use **"LDAPS (port 636) or StartTLS"** — StartTLS is in
practice unsupported. A deployment that intends to use StartTLS on port 389 would get an
LDAPS-mode (implicit-TLS) connection attempt against the plaintext port, which fails; or
worse, an operator may disable TLS entirely to make it work, sending credentials in cleartext.
**Recommendation**
Introduce an explicit transport mode (e.g. `LdapTransport { Ldaps, StartTls, None }`)
or a separate `LdapUseStartTls` flag. For StartTLS, leave `SecureSocketLayer` false,
call `connection.Connect`, then call `connection.StartTls()` and verify the negotiated
session is encrypted before binding. Remove the unreachable conditional.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Added an explicit `LdapTransport` enum
(`Ldaps`/`StartTls`/`None`); `SecureSocketLayer` is set only for LDAPS, and the
StartTLS branch now connects in plaintext, calls `StartTls()`, and verifies
`connection.Tls` before binding. `LdapUseTls` is retained as a compatibility shim
mapping onto the enum. Regression tests `AuthenticateAsync_StartTlsTransport_AttemptsConnection`
and `AuthenticateAsync_NoTlsTransport_RejectedWithoutAllowInsecure`.
### Security-002 — Authentication cookie is not marked `Secure`
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.Security/ServiceCollectionExtensions.cs:16-23` |
**Description**
`AddCookie` sets `HttpOnly = true` and `SameSite = Strict` but never sets
`options.Cookie.SecurePolicy`. The ASP.NET Core default is `CookieSecurePolicy.SameAsRequest`,
which permits the cookie (carrying the embedded JWT — a bearer credential) to be sent
over plain HTTP. The design doc states the cookie is **"HttpOnly and Secure (requires
HTTPS)"**. As written, the module does not enforce that requirement; a misconfigured or
HTTP-fronted deployment would transmit the session token in cleartext.
**Recommendation**
Set `options.Cookie.SecurePolicy = CookieSecurePolicy.Always` in `AddCookie`. Consider
also setting `ExpireTimeSpan` and `SlidingExpiration` to align the cookie lifetime with
the documented 15-minute JWT / 30-minute idle policy.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). Confirmed the cookie is configured in this
module (`ServiceCollectionExtensions.AddSecurity`), not CentralUI — no misattribution.
Added `options.Cookie.SecurePolicy = CookieSecurePolicy.Always` so the JWT-bearing
cookie is never sent over plain HTTP. Regression test
`AddSecurity_AuthCookie_IsMarkedSecureAlways`. (`ExpireTimeSpan`/`SlidingExpiration`
tuning left as a separate, lower-priority improvement.)
### Security-003 — JWT signing key length is never validated
| | |
|--|--|
| Severity | High |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.Security/JwtTokenService.cs:33`, `src/ScadaLink.Security/SecurityOptions.cs:42` |
**Description**
`SecurityOptions.JwtSigningKey` defaults to `string.Empty` and is fed directly into
`new SymmetricSecurityKey(Encoding.UTF8.GetBytes(_options.JwtSigningKey))` with no
validation. HMAC-SHA256 requires a key of at least 256 bits (32 bytes); a short or empty
key produces a trivially forgeable token. The `SecurityHardeningTests` comment claims a
minimum length is "enforced", but no code in this module enforces it — the test only
asserts that a 32+ char key works. A deployment with a missing or short `JwtSigningKey`
would start successfully and issue weakly-signed tokens.
**Recommendation**
Validate `JwtSigningKey` at startup — fail fast if it is empty or shorter than 32 bytes.
Use an `IValidateOptions<SecurityOptions>` validator or guard in the `JwtTokenService`
constructor so a weak key is rejected before any token is issued.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). The `JwtTokenService` constructor now
fails fast with an `InvalidOperationException` when `JwtSigningKey` is empty or
shorter than 32 bytes (`SecurityOptions.MinJwtSigningKeyBytes`), so a weak key is
rejected before any token is issued. The misleading `SecurityOptions` XML doc was
corrected to state the requirement. Regression tests
`JwtTokenService_EmptySigningKey_ThrowsAtConstruction`,
`JwtTokenService_ShortSigningKey_ThrowsAtConstruction`, and
`JwtTokenService_AdequateSigningKey_ConstructsSuccessfully`.
### Security-004 — Search filter uses `uid=` while fallback DN construction uses `cn=`
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:66`, `:138`, `:157-159` |
**Description**
`AuthenticateAsync` and `ResolveUserDnAsync` build the search filter as
`(uid={username})`, but the no-service-account fallback in `ResolveUserDnAsync`
constructs the bind DN as `cn={username},{LdapSearchBase}`. The `SecurityOptions.LdapServiceAccountDn`
XML comment also documents the fallback as `cn={username},{LdapSearchBase}`. A directory
keyed on `uid` will succeed via search-then-bind but fail via the direct-bind fallback
(and vice versa). The attribute used for lookup is hard-coded and inconsistent across
the two code paths, so the two configuration modes are not interchangeable.
**Recommendation**
Introduce a single configurable `LdapUserIdAttribute` (default `uid`) and use it
consistently in both the search filter and the fallback DN. Update the XML doc to match.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Confirmed: the search filter was hard-coded
`(uid={username})` (both in `AuthenticateAsync` and `ResolveUserDnAsync`) while the
fallback DN used `cn={username}` — the two auth modes were not interchangeable. Added
a configurable `SecurityOptions.LdapUserIdAttribute` (default `uid`) used for both the
search filter and the fallback DN via the new `BuildFallbackUserDn` helper, and
corrected the `LdapServiceAccountDn` XML doc to reference `{LdapUserIdAttribute}`.
Regression tests `BuildFallbackUserDn_UsesConfiguredUserIdAttribute`,
`BuildFallbackUserDn_HonoursNonDefaultUserIdAttribute`,
`SecurityOptions_LdapUserIdAttribute_DefaultsToUid`.
### Security-005 — DN injection in the no-service-account bind fallback
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:157-159` |
**Description**
When no service account is configured, the user-supplied `username` is interpolated
directly into a distinguished name: `$"cn={username},{LdapSearchBase}"`. `EscapeLdapFilter`
escapes *search-filter* metacharacters, but DN construction requires a different
escaping scheme (RFC 4514 — `,`, `+`, `"`, `\`, `<`, `>`, `;`, leading/trailing spaces).
No DN escaping is applied here. A username such as `victim,ou=admins` alters the DN
structure, allowing a caller to attempt a bind as a different DN than intended. Combined
with the `username.Contains('=')` shortcut at line 129 — which lets a caller supply a
full arbitrary DN — the fallback path gives the client undue control over the bind
identity.
**Recommendation**
Apply RFC 4514 DN-component escaping to `username` before interpolation, or use the
LDAP library's DN-builder API. Reconsider the `Contains('=')` shortcut — accepting a
raw DN from untrusted input is risky; restrict it or remove it.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Confirmed: the fallback path interpolated
the raw `username` straight into `cn={username},{LdapSearchBase}` with no DN escaping,
and the `username.Contains('=')` shortcut let a caller supply an arbitrary bind DN.
Added an RFC 4514 `EscapeLdapDn` helper (escapes `, + " \ < > ;`, leading/trailing
space, leading `#`, NUL) applied in `BuildFallbackUserDn`, so a username such as
`victim,ou=admins` can no longer alter the DN structure. The `Contains('=')` raw-DN
shortcut was removed entirely — untrusted input no longer controls the bind identity.
Regression tests `BuildFallbackUserDn_EscapesDnMetacharacters`,
`EscapeLdapDn_EscapesAllRfc4514Specials`, `EscapeLdapDn_EscapesLeadingAndTrailingSpaces`.
### Security-006 — JWT validation disables issuer and audience checks
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Resolved |
| Location | `src/ScadaLink.Security/JwtTokenService.cs:67-75`, `:56-59` |
**Description**
`ValidateToken` sets `ValidateIssuer = false` and `ValidateAudience = false`, and
`GenerateToken` never sets an `iss` or `aud`. With a shared symmetric HMAC key, any
other system or component that signs JWTs with the same key would produce tokens this
service accepts. While the design states the key is shared only between the two central
nodes, omitting issuer/audience binding removes a cheap defense-in-depth control and
makes accidental key reuse (e.g. the same secret used for another internal token)
silently exploitable.
**Recommendation**
Set a fixed `Issuer` and `Audience` (e.g. `"scadalink-central"`) when generating tokens
and enable `ValidateIssuer`/`ValidateAudience` with the matching expected values during
validation.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Confirmed: `GenerateToken` set neither `iss`
nor `aud` and `ValidateToken` had `ValidateIssuer = false`/`ValidateAudience = false`.
`GenerateToken` now binds `JwtTokenService.TokenIssuer`/`TokenAudience`
(both `"scadalink-central"`) into every token, and `ValidateToken` enables
`ValidateIssuer`/`ValidateAudience` against those fixed values — a token signed with
the shared key but a foreign issuer is now rejected. Regression tests
`GenerateToken_SetsIssuerAndAudience`, `ValidateToken_RejectsTokenWithWrongIssuer`,
`ValidateToken_AcceptsOwnIssuerAndAudience`.
### Security-007 — Idle-timeout claim is reset on every token refresh
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.Security/JwtTokenService.cs:40`, `:111-123` |
**Description**
The design states the 30-minute idle timeout is tracked via a "last-activity timestamp
in the token", and `IsIdleTimedOut` reads the `LastActivity` claim. But `RefreshToken`
calls `GenerateToken`, which unconditionally writes `LastActivity = DateTimeOffset.UtcNow`.
Token refresh fires whenever a request arrives within ~5 minutes of expiry. The result
is that `LastActivity` reflects *token issuance time*, not genuine user activity — and
since refresh itself is a request, the timestamp keeps moving forward. A more subtle
consequence: the idle window is effectively measured from the last refresh, not the
last real interaction, so the documented "no requests within the idle window" semantics
are not faithfully implemented. The claim name `LastActivity` is also misleading.
**Recommendation**
Decide explicitly how activity is tracked. Either (a) carry the original `LastActivity`
forward across refreshes and update it only on real request handling in the middleware,
or (b) rename the claim to `IssuedAt`/`TokenCreated` and document that the idle window
is measured from issuance. Whichever is chosen, ensure `IsIdleTimedOut` and the refresh
path agree on the semantics.
**Resolution**
Resolved 2026-05-16 (commit `pending`). Confirmed: `RefreshToken` → `GenerateToken`
unconditionally wrote `LastActivity = UtcNow`, so the idle clock reset on every
refresh and the documented 30-minute idle timeout could never fire for a client that
polls in the background. Implemented option (a) — the Security-side half of the
documented "15-min sliding + 30-min idle" policy (the cross-module partner of
CentralUI-005): `GenerateToken` now takes an optional `lastActivity` anchor;
`RefreshToken` carries the **existing** `LastActivity` claim forward unchanged, and a
new explicit `RecordActivity` method advances the anchor to now — to be called by the
CentralUI request pipeline on genuine user interaction (not on a background refresh).
`IsIdleTimedOut` is unchanged and now agrees with the refresh path. The remaining
CentralUI-side wiring (calling `RecordActivity` from the middleware, setting
`SlidingExpiration`/`ExpireTimeSpan`) stays tracked under CentralUI-005; this finding's
Security-side defect — the reset-on-refresh bug — is fully fixed here. Regression tests
`RefreshToken_PreservesOriginalLastActivityClaim`,
`RefreshToken_DoesNotResetIdleTimeoutWhenUserIsActuallyIdle`,
`RecordActivity_UpdatesLastActivityToNow`.
### Security-008 — N+1 query loading site-scope rules in `RoleMapper`
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.Security/RoleMapper.cs:25-48` |
**Description**
`MapGroupsToRolesAsync` first calls `GetAllMappingsAsync`, then inside the per-mapping
loop calls `GetScopeRulesForMappingAsync(mapping.Id, ct)` once for every matched
Deployment mapping. This is an N+1 query pattern executed on the login hot path and on
every 15-minute token refresh. With multiple site-scoped Deployment groups it issues a
round-trip per group.
**Recommendation**
Add a repository method that loads scope rules for a set of mapping IDs in one query
(or eager-loads them with the mappings), and resolve all scope rules with a single call.
**Resolution**
_Unresolved._
### Security-009 — CancellationToken not honored inside `Task.Run` LDAP calls
| | |
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.Security/LdapAuthService.cs:42`, `:46`, `:51`, `:56-57`, `:67-73`, `:135`, `:139-145` |
**Description**
The synchronous Novell LDAP calls are wrapped in `Task.Run(() => ..., ct)`. The `ct`
argument only prevents the work item from *starting* if cancellation is already
signaled; once a `connection.Connect`/`Bind`/`Search` call is in progress it cannot be
cancelled. A cancelled or timed-out login request will continue to occupy a thread-pool
thread and an LDAP connection until the blocking call returns on its own. There is also
no explicit network/operation timeout configured on the `LdapConnection`.
**Recommendation**
Configure `LdapConnection.ConnectionTimeout` and search/operation time limits so a
hung LDAP server cannot pin a thread indefinitely. Document that `ct` only guards
work-item scheduling, or implement a timeout-with-disconnect fallback.
**Resolution**
_Unresolved._
### Security-010 — Design doc contradicts itself on Windows Integrated Authentication
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `docs/requirements/Component-Security.md:13` (vs. `:23`) |
**Description**
The Responsibilities section states the component authenticates "using Windows
Integrated Authentication", but the Authentication section (line 23) and CLAUDE.md
explicitly state **"No Windows Integrated Authentication ... authenticates directly
against LDAP/AD, not via Kerberos/NTLM"** — which is what the code actually does
(direct LDAP bind). The Responsibilities line is stale and contradicts both the rest of
the doc and the implementation.
**Recommendation**
Fix `Component-Security.md:13` to say "using a direct LDAP/Active Directory bind"
to match the implemented behavior and the rest of the document.
**Resolution**
_Unresolved._
### Security-011 — Missing tests for security-critical paths
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.Security.Tests/UnitTest1.cs` |
**Description**
The test suite covers happy paths well but omits several security-relevant cases:
no test exercises the StartTLS path (Security-001), the DN-injection / `Contains('=')`
fallback inputs (Security-005), JWT validation with a too-short or empty signing key
(Security-003), `IsIdleTimedOut` returning true after a token has been refreshed
(Security-007), or the `uid`/`cn` mismatch in the no-service-account path (Security-004).
The integration `SecurityHardeningTests` only asserts default option values, not
enforcement. The test file is still named `UnitTest1.cs`.
**Recommendation**
Add negative/edge-case tests for the items above, particularly key-length rejection,
DN-escaping of hostile usernames, and idle-timeout behavior across a refresh. Rename
`UnitTest1.cs` to a descriptive name.
**Resolution**
_Unresolved._
+454
View File
@@ -0,0 +1,454 @@
# Code Review — SiteEventLogging
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.SiteEventLogging` |
| Design doc | `docs/requirements/Component-SiteEventLogging.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 7 |
## Summary
The SiteEventLogging module is small and broadly well-structured: a SQLite-backed
recorder (`SiteEventLogger`), a query service with keyset pagination, a background
purge service, and a thin Akka actor bridge. The query path is parameterised
correctly (no SQL injection) and reasonably well tested. However, the storage-cap
enforcement is functionally broken: `PRAGMA incremental_vacuum` is a no-op because
`auto_vacuum = INCREMENTAL` is never set, so the cap-purge loop never sees the
database shrink and over-deletes the entire table when triggered. There is also a
genuine concurrency hazard: the purge service and query service share the single
`SqliteConnection` owned by `SiteEventLogger` but bypass its `_writeLock`, so a purge
running on the background thread can collide with a write or a query on another
thread. The `LogEventAsync` API is synchronous despite its name and `Task` return,
which silently blocks Akka actor threads on disk I/O. Other findings concern the
cluster-singleton placement of the handler actor (which can pin to the standby
node), missing indexes for common query filters, retention/cap purge not enforcing
the requirement strictly, and several documentation/maintainability issues.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | `incremental_vacuum` no-op breaks cap purge (-001); over-delete on cap (-002). |
| 2 | Akka.NET conventions | ☑ | Handler actor has no supervision/correlation concerns of its own; singleton placement issue (-004). `Ask` boundary is appropriate. |
| 3 | Concurrency & thread safety | ☑ | Shared `SqliteConnection` used by purge/query without the write lock (-003). |
| 4 | Error handling & resilience | ☑ | `LogEventAsync` swallows write failures silently into a log line only (-008); purge catches broadly. |
| 5 | Security | ☑ | Queries fully parameterised. No authz in module (delegated to caller) — noted, not a finding. |
| 6 | Performance & resource management | ☑ | Synchronous I/O on actor threads (-005); missing indexes for severity/source/message (-006). |
| 7 | Design-document adherence | ☑ | Singleton placement contradicts "active node" model (-004); cap purge does not honour "oldest first within budget" cleanly (-002). |
| 8 | Code organization & conventions | ☑ | Concrete-type downcast of `ISiteEventLogger` (-007); `internal Connection` leaks DB handle (-007). |
| 9 | Testing coverage | ☑ | No tests for purge interaction with live writes, vacuum effectiveness, the actor bridge, or query error path (-010). |
| 10 | Documentation & comments | ☑ | `LogEventAsync` XML doc says "asynchronously" but is synchronous (-009); stale "Phase 4+" placeholder (-011). |
## Findings
### SiteEventLogging-001 — `PRAGMA incremental_vacuum` is a no-op; storage cap cannot reclaim space
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:100-102`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:36-55` |
**Description**
`PurgeByStorageCap` issues `PRAGMA incremental_vacuum` after each delete batch to
reclaim space, then re-measures the database size via `page_count * page_size`.
`incremental_vacuum` only has any effect when the database was created with
`auto_vacuum = INCREMENTAL`. `InitializeSchema` never sets `auto_vacuum`, so the
database uses the SQLite default (`auto_vacuum = NONE`). With `NONE`,
`incremental_vacuum` is silently ignored and `page_count` does not decrease when
rows are deleted (free pages are retained in the file). Consequently the
`while (currentSizeBytes > capBytes)` loop never observes the size dropping. The
storage-cap feature required by the design ("configurable maximum database size...
oldest events are purged first") is therefore non-functional — it cannot bring the
file back under the cap.
**Recommendation**
Set `PRAGMA auto_vacuum = INCREMENTAL` in `InitializeSchema` before any tables are
created (it must be set before table creation or followed by a full `VACUUM` to take
effect on an existing database). Alternatively, run a full `VACUUM` after cap-purge
deletes, or measure logical data size (e.g. `page_count - freelist_count` times
`page_size`) instead of relying on `incremental_vacuum`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): `InitializeSchema` now sets
`PRAGMA auto_vacuum = INCREMENTAL` before any table is created, and
`GetDatabaseSizeBytes` measures logical size as `(page_count - freelist_count) *
page_size` so reclaimed pages no longer mask the size drop. The cap-purge loop now
reliably observes the database shrinking. Regression test
`PurgeByStorageCap_StopsWhenUnderCap_DoesNotEmptyTable`.
### SiteEventLogging-002 — Storage-cap purge deletes the entire table when space is not reclaimed
| | |
|--|--|
| Severity | High |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:87-105` |
**Description**
Because of SiteEventLogging-001 the on-disk size never shrinks after a delete batch,
so `currentSizeBytes` stays above `capBytes`. The loop then keeps deleting 1000-row
batches on every iteration until `ExecuteNonQuery` returns 0 — i.e. until the table
is completely empty. The design states the cap should purge "the oldest events...
first" to stay within budget, not wipe the whole log. When the cap is hit (e.g.
during an alarm storm) this destroys all retained diagnostic history rather than
trimming it to the budget. The unit test `PurgeByStorageCap_DeletesOldestWhenOverCap`
masks the problem because it uses `MaxStorageMb = 0`, which legitimately expects an
empty table, so the over-delete behaviour is never exercised against a realistic cap.
**Recommendation**
Fix the size measurement / vacuum (SiteEventLogging-001) so the loop terminates when
the file is genuinely under the cap. Add a guard so the loop stops once
`currentSizeBytes` has stopped decreasing across iterations, and add a test with a
non-zero cap and a known oversized dataset to assert that only the oldest events are
removed.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): with the size measurement fixed
(SiteEventLogging-001) the cap loop terminates when the file is genuinely under the
cap. An additional guard stops the loop if the on-disk size fails to decrease across
an iteration, so a cap that can never be met no longer empties the whole table.
Regression tests `PurgeByStorageCap_StopsWhenUnderCap_DoesNotEmptyTable` (asserts the
table is not emptied and the file ends under a realistic non-zero cap) and
`PurgeByStorageCap_RemovesOldestEventsFirst` (asserts only the oldest events are
removed).
### SiteEventLogging-003 — Shared `SqliteConnection` used by purge and query without the write lock
| | |
|--|--|
| Severity | High |
| Category | Concurrency & thread safety |
| Status | Resolved |
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:64,90,100,110,114`, `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:36`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:34,72` |
**Description**
`SiteEventLogger` owns a single `SqliteConnection` and serialises its own writes via
`lock (_writeLock)`. `EventLogPurgeService` and `EventLogQueryService` both reach
into `_eventLogger.Connection` and execute commands directly, without acquiring
`_writeLock`. The purge runs on a `BackgroundService` thread (a different thread from
event-recording callers and from the actor that drives the query service). A single
`SqliteConnection` / `SqliteCommand` is not thread-safe; concurrent use from the
purge thread and a recording thread (or query thread) can throw
`SqliteException`/`InvalidOperationException` ("DataReader already open",
"connection busy") or corrupt command state. The purge `DELETE` and the recorder
`INSERT` racing is the most likely collision because event recording is continuous.
**Recommendation**
Funnel all access to the connection through a single synchronisation point: either
expose lock-guarded methods on `SiteEventLogger` for purge/query to call, or give the
purge and query services their own dedicated `SqliteConnection` instances (SQLite
supports multiple connections to the same file; `Cache=Shared` plus a `busy_timeout`
makes this safer). Do not share one `SqliteConnection` across threads.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): the raw `internal Connection` property was
removed from `SiteEventLogger` and replaced with lock-guarded `WithConnection(...)`
overloads that hold the existing `_writeLock` for the duration of the caller's
delegate. `EventLogPurgeService`, `EventLogQueryService`, and `LogEventAsync` all now
access the connection exclusively through `WithConnection`, so the purge thread,
query thread, and recording threads are serialised on a single lock. `Dispose` was
also brought under the lock to avoid a dispose/use race. Regression test
`PurgeByStorageCap_ConcurrentWritesDoNotCorruptConnection` exercises purge running
concurrently with multiple writer threads.
### SiteEventLogging-004 — Event-log handler runs as a cluster singleton that can land on the standby node
| | |
|--|--|
| Severity | Low |
| Original severity | High (re-triaged down to Low on 2026-05-16 — see Re-triage note) |
| Category | Design-document adherence |
| Status | Won't Fix |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:313-336`, `src/ScadaLink.SiteEventLogging/EventLogHandlerActor.cs:21-25` |
**Description**
`EventLogHandlerActor` is hosted as a `ClusterSingletonManager` singleton with the
stated intent that "queries always reach the active node". However, an Akka.NET
cluster singleton is pinned to the *oldest* member of the role, which is not the
same concept as the SCADA "active node" (the node currently running the Deployment
Manager singleton / serving live traffic). The design doc is explicit: "Only the
active node generates and stores events... the new active node starts logging to its
own SQLite database." The event-log SQLite file is node-local and unreplicated.
Nothing guarantees the event-log singleton co-locates with the active node, so a
remote query can be served by the standby node and read that node's near-empty
database, returning no events even though the active node has a full log. The
explanatory comment in `AkkaHostedService.cs` asserts the opposite of what actually
happens.
**Recommendation**
Either (a) host the query handler as a normal per-node actor and route queries to
the active node explicitly (the node owning the Deployment Manager singleton), or
(b) make the event-log writer follow the same singleton so the writer and the query
handler are guaranteed co-located. Reconcile the design doc and the inline comment
with whichever model is chosen.
**Re-triage note (2026-05-16)**
The finding's central claim — that a remote query "can be served by the standby
node and read that node's near-empty database" — is incorrect for the query path.
In `AkkaHostedService.cs` the `event-log-handler` `ClusterSingletonManager` and the
`deployment-manager` `ClusterSingletonManager` are created with the **same role**
(`siteRole`) in the **same cluster**. Akka.NET pins every cluster singleton of a
given role to the *oldest member of that role* — so all same-role singletons in a
cluster co-locate on one node. The "active node" in this codebase is, by definition,
the node hosting the `deployment-manager` singleton; the event-log query singleton
is therefore *guaranteed* to run on that same node. A `ClusterClient` query cannot
land on the standby. The inline comment in `AkkaHostedService.cs` is accurate, not
"the opposite of what happens".
A real but distinct concern exists: the *writer* (`SiteEventLogger`) is registered
as a plain per-node DI singleton (`AddSiteEventLogging`), so it records to a local
SQLite file on **every** node, including the standby. That wastes storage on the
standby but does **not** cause the query-returns-nothing symptom the finding
describes, because the query singleton always reads the *active* node's (populated)
database. Gating the writer to the active node would be a `ScadaLink.Host` wiring
change, outside this module's scope, and is a minor optimisation rather than a
correctness defect.
Re-triaged from High to Low and closed as **Won't Fix**: the High-severity
correctness claim does not hold. The only residual cleanup — gating the standby-node
writer (the inline comment needs no change) — can be raised as a fresh Low finding
against `ScadaLink.Host` if desired.
**Resolution**
Won't Fix — 2026-05-16 (commit `<pending>`). Re-triaged: the asserted defect (query
served by standby returning an empty log) cannot occur because the event-log query
singleton and the deployment-manager singleton share a role and so always co-locate
on the active node. No code change made; see the re-triage note above.
### SiteEventLogging-005 — `LogEventAsync` performs synchronous disk I/O on the caller's thread
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:57-99` |
**Description**
`LogEventAsync` is declared `async`-shaped (returns `Task`, `Async` suffix) but its
body is entirely synchronous: it takes `lock (_writeLock)`, runs
`cmd.ExecuteNonQuery()` (a blocking SQLite write), then returns `Task.CompletedTask`.
Callers across the codebase invoke it fire-and-forget as `_ = LogEventAsync(...)`
(e.g. `ScriptExecutionActor.cs:133`, `DataConnectionActor.cs:292`,
`ScriptActor.cs:250`) expecting it to be non-blocking. In reality the SQLite write,
and any contention on `_writeLock`, executes inline on the Akka actor thread of the
calling subsystem. Under an event burst (alarm storm, script failure loop) this
serialises actor threads on disk I/O and the global write lock, degrading the
hot-path subsystems the design intends to keep responsive.
**Recommendation**
Either make recording genuinely asynchronous (offload to a dedicated single-threaded
writer / `Channel<T>` consumer so callers truly fire-and-forget), or rename the
method to `LogEvent` and document that it blocks, so callers can decide. Given the
design's emphasis on not impacting runtime subsystems, an internal queue with a
background flush is preferable.
**Resolution**
_Unresolved._
### SiteEventLogging-006 — Missing indexes for severity and keyword-search query paths
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:50-52`, `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:65-81` |
**Description**
`InitializeSchema` creates indexes on `timestamp`, `event_type`, and `instance_id`.
The query service also filters on `severity` (`severity = $severity`) and performs
`message LIKE '%...%'` / `source LIKE '%...%'` keyword search. `severity` has no
index, and a leading-wildcard `LIKE` cannot use a normal index at all. With up to a
1 GB database and a 500-row page size, severity-filtered and keyword queries do full
table scans on every page. The design explicitly lists keyword search as a supported,
expected query type.
**Recommendation**
Add an index on `severity` (or a composite index aligned with common filter
combinations such as `(event_type, severity, id)`). For keyword search, consider an
FTS5 virtual table over `message` and `source`, or accept the scan but document the
cost.
**Resolution**
_Unresolved._
### SiteEventLogging-007 — `ISiteEventLogger` consumers downcast to the concrete type and reach into the DB connection
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.SiteEventLogging/EventLogPurgeService.cs:25`, `src/ScadaLink.SiteEventLogging/EventLogQueryService.cs:26`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:34` |
**Description**
Both `EventLogPurgeService` and `EventLogQueryService` take `ISiteEventLogger` via
DI and immediately downcast it: `_eventLogger = (SiteEventLogger)eventLogger;`. They
then access the `internal SqliteConnection Connection` property to run arbitrary SQL.
This defeats the purpose of the interface abstraction, makes the registration
fragile (any `ISiteEventLogger` that is not exactly `SiteEventLogger` causes an
`InvalidCastException` at construction), and leaks the database handle and raw SQL
surface out of the recorder. It is also the root cause of the unsynchronised
connection sharing in SiteEventLogging-003.
**Recommendation**
Introduce a proper data-access abstraction (e.g. an `IEventLogStore` with
`Insert`, `Query`, `PurgeOlderThan`, `PurgeToSize`, `GetSizeBytes`) that owns the
connection and its locking, and inject that into the recorder, query, and purge
services. Remove the `internal Connection` property and the concrete-type downcasts.
**Resolution**
_Unresolved._
### SiteEventLogging-008 — Event-recording write failures are silently swallowed
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:92-95` |
**Description**
If `ExecuteNonQuery` throws (disk full, database locked, file corruption), the
exception is caught, written to `ILogger`, and discarded; `LogEventAsync` still
returns `Task.CompletedTask` as if successful. Callers fire-and-forget the result so
they cannot detect failure. The event log is the site's diagnostic audit trail; a
sustained write failure (for example a locked-database storm caused by the
unsynchronised purge in SiteEventLogging-003) means events vanish with no signal to
operators except a local log line that nobody is watching. There is no failure
counter, no health-metric hook, and no retry.
**Recommendation**
Expose a failure signal: increment a counter that the Health Monitoring component
can surface (the design notes script/alarm error rates are derived from the event
log — a logging outage should be visible). At minimum, escalate repeated failures to
a Warning/Error health metric rather than only a local log line.
**Resolution**
_Unresolved._
### SiteEventLogging-009 — XML doc on `LogEventAsync` claims asynchronous behaviour
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.SiteEventLogging/ISiteEventLogger.cs:8-10`, `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:57` |
**Description**
The interface XML doc states "Record an event asynchronously." and the method is
named `LogEventAsync`, but the implementation is fully synchronous (see
SiteEventLogging-005). The documentation and naming are misleading: a reader will
reasonably assume the write is offloaded and the caller's thread is not blocked,
which is false. The `details` parameter doc says "Optional JSON details" but nothing
validates or requires JSON, so callers may pass arbitrary text.
**Recommendation**
Align the name, signature, and documentation with the actual behaviour — either make
the method genuinely asynchronous or rename to `LogEvent` and correct the doc.
Clarify that `details` is free-form text unless JSON is actually enforced.
**Resolution**
_Unresolved._
### SiteEventLogging-010 — Test coverage gaps: actor bridge, purge/write concurrency, vacuum effectiveness, query error path
| | |
|--|--|
| Severity | Medium |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.SiteEventLogging.Tests/` |
**Description**
The test suite covers recording, query filtering/pagination, and basic purge, but
several critical behaviours are untested:
- `EventLogHandlerActor` has no test — the actor message contract
(`EventLogQueryRequest` -> `EventLogQueryResponse`, `Sender.Tell`) is unverified.
- No test exercises purge running concurrently with active writes/queries, so the
connection-sharing race (SiteEventLogging-003) is invisible to CI.
- `PurgeByStorageCap` is only tested with `MaxStorageMb = 0`, which hides the
no-op-vacuum / over-delete bug (SiteEventLogging-001, -002). No test asserts the
file shrinks or that only oldest events are removed under a realistic cap.
- `EventLogQueryService.ExecuteQuery`'s catch block (`Success: false`,
`ErrorMessage`) has no test.
- `SiteEventLogger.Dispose` semantics (logging after dispose returns
`Task.CompletedTask`) and re-entrant dispose are untested.
**Recommendation**
Add tests for the actor bridge, a concurrency stress test (purge + write + query in
parallel), a realistic non-zero-cap purge test asserting size reduction and
oldest-first deletion, and a query-error-path test (e.g. corrupt/closed connection).
**Resolution**
_Unresolved._
### SiteEventLogging-011 — Stale "Phase 4+" placeholder in `ServiceCollectionExtensions`
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.SiteEventLogging/ServiceCollectionExtensions.cs:18-22` |
**Description**
`AddSiteEventLoggingActors` is an empty method with a comment "Placeholder for Akka
actor registration (Phase 4+)". The actor (`EventLogHandlerActor`) is in fact already
implemented and is registered directly in `AkkaHostedService.cs:313-336`, not through
this method. The placeholder is dead code: it is either never called or called with
no effect, and the comment is stale. A reader looking for where the event-log actor
is wired up will be misdirected.
**Recommendation**
Either implement the actor registration here and have `AkkaHostedService` call it
(centralising the wiring), or delete `AddSiteEventLoggingActors` entirely and remove
the misleading comment.
**Resolution**
_Unresolved._
+586
View File
@@ -0,0 +1,586 @@
# Code Review — SiteRuntime
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.SiteRuntime` |
| Design doc | `docs/requirements/Component-SiteRuntime.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 13 |
## Summary
The SiteRuntime module is broadly well-structured: the actor hierarchy matches the
design doc, supervision strategies are explicit, and the trigger/alarm evaluation
logic is thorough. However the review surfaced one genuinely serious correctness
defect — `Instance.SetAttribute` never routes writes to the Data Connection Layer
for data-sourced attributes, contradicting a core design decision and silently
turning device writes into local-only static overrides. Several other findings
cluster around two themes: (1) actor-thread discipline is violated in a few hot
paths (blocking `.GetAwaiter().GetResult()` calls on the actor thread, a fragile
fixed-delay reschedule for redeployment), and (2) the site-local repositories reach
into `SiteStorageService` private state via reflection and mint entity IDs with the
non-deterministic `string.GetHashCode()`. Script execution runs on the default
thread pool rather than a dedicated blocking dispatcher (the code acknowledges this
in a comment but ships it anyway). Test coverage exists for the coordinator actors,
persistence and scripting, but the short-lived execution actors, the replication
actor, and the repositories are untested.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | SetAttribute mis-routing, deploy double-count, redeploy reschedule race. |
| 2 | Akka.NET conventions | ✓ | Blocking on actor thread, script execution not on a dedicated dispatcher, premature success reply. |
| 3 | Concurrency & thread safety | ✓ | `_attributes` dictionary shared with child actors by reference; `_executionCounter` is actor-confined (OK). |
| 4 | Error handling & resilience | ✓ | Deploy reports Success before persistence; replicated artifact/S&F failures only logged (matches best-effort design). |
| 5 | Security | ✓ | Trust-model validation is substring-based and weak; reflection used to read private fields. |
| 6 | Performance & resource management | ✓ | Per-call SQLite connections (acceptable); CPU-bound scripts not interruptible by timeout. |
| 7 | Design-document adherence | ✓ | SetAttribute DCL routing missing; staggered-startup and supervision otherwise conform. |
| 8 | Code organization & conventions | ✓ | Repositories reflect into another class; synthetic IDs non-deterministic. |
| 9 | Testing coverage | ✓ | No tests for ScriptExecutionActor, AlarmExecutionActor, SiteReplicationActor, or the two repositories. |
| 10 | Documentation & comments | ✓ | Several XML comments describe behaviour the code does not implement (see findings). |
## Findings
### SiteRuntime-001 — `Instance.SetAttribute` never writes to the Data Connection Layer
| | |
|--|--|
| Severity | High |
| Category | Design-document adherence |
| Status | Resolved |
| Location | `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs:106`, `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:204` |
**Description**
The design doc (Component-SiteRuntime.md, "GetAttribute / SetAttribute" and
"Script Runtime API") states that `Instance.SetAttribute` on a *data-connected*
attribute must send a write request to the DCL, which writes to the physical
device, and that the in-memory value is **not** optimistically updated. For *static*
attributes it updates memory and persists an override.
The implementation makes no such distinction. `ScriptRuntimeContext.SetAttribute`
unconditionally sends a `SetStaticAttributeCommand`, and `InstanceActor.HandleSetStaticAttribute`
unconditionally treats every write as a static override: it mutates `_attributes`,
publishes an `AttributeValueChanged` with hard-coded `"Good"` quality, notifies
children, and persists a SQLite override. A script writing a data-sourced attribute
therefore never reaches the device, the write failure can never be returned
synchronously to the script, and the in-memory value diverges from the device
until the next subscription update overwrites it. The persisted override is also
wrong: data-sourced attributes should not have static overrides.
**Recommendation**
In `InstanceActor`, look up the target attribute in `_configuration.Attributes`. If
it has a non-empty `DataSourceReference`, issue a DCL write (e.g. a `WriteTagRequest`
to `_dclManager`) and surface success/failure to the caller; do not persist an
override and do not optimistically mutate `_attributes`. Only attributes with no
data source reference should follow the current static-override path. Consider
splitting the message into `SetStaticAttributeCommand` vs `SetDataAttributeCommand`,
or branching inside the handler.
**Resolution**
Resolved 2026-05-16 (`<pending>`): `InstanceActor.HandleSetStaticAttribute` now resolves
the target attribute's data binding from `_configuration`. Data-sourced attributes are
routed via a new `HandleSetDataAttribute` that Asks the DCL with a `WriteTagRequest` and
pipes the device-write outcome back to the caller as a `SetStaticAttributeResponse` —
no override is persisted and `_attributes` is not optimistically mutated. Static
attributes keep the override path and now also reply with a `SetStaticAttributeResponse`.
`ScriptRuntimeContext.SetAttribute` is now `async Task` and Asks the Instance Actor,
throwing `InvalidOperationException` on a failed device write so scripts get the failure
synchronously.
### SiteRuntime-002 — `RouteInboundApiSetAttributes` always treats writes as static overrides
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:632` |
**Description**
`RouteInboundApiSetAttributes` (handling `Route.To().SetAttribute(s)` from the
Inbound API) emits a `SetStaticAttributeCommand` for every attribute, so it inherits
the same defect as SiteRuntime-001: writes to data-sourced attributes never reach
the device and are instead persisted as static overrides. In addition the response
is sent back as unconditionally successful (`true`) before the Instance Actor has
even processed the command, so a non-existent attribute or a future DCL write
failure is reported to the external caller as success.
**Recommendation**
Route through the same corrected `InstanceActor` write handler as SiteRuntime-001 so
the static-vs-data distinction is honoured. The optimistic ack is acceptable for
fire-and-forget static writes per the doc, but the XML comment should make the
limitation explicit, and once data-attribute writes are supported they need a real
response path.
**Resolution**
Resolved 2026-05-16 (`<pending>`): `RouteInboundApiSetAttributes` now Asks the Instance
Actor per attribute (instead of fire-and-forget Tell) and aggregates the
`SetStaticAttributeResponse` results. Because the Instance Actor handler is the
SiteRuntime-001 corrected handler, data-sourced attributes now reach the DCL and the
`RouteToSetAttributesResponse` reflects the real per-attribute outcome — a non-existent
attribute or a failed device write is reported as failure rather than an unconditional
optimistic `true`.
### SiteRuntime-003 — Redeployment relies on a fixed 500 ms reschedule and can collide on the child actor name
| | |
|--|--|
| Severity | High |
| Category | Akka.NET conventions |
| Status | Resolved |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:222` |
**Description**
`HandleDeploy` stops an existing Instance Actor with `Context.Stop` and then
reschedules the same `DeployInstanceCommand` to itself after a hard-coded 500 ms,
hoping the child has fully terminated by then. `Context.Stop` is asynchronous; the
child is only removed from the parent's children collection after it actually stops
(including running `PostStop` on its descendants). If a deeply nested or slow
hierarchy takes longer than 500 ms, `CreateInstanceActor` calls `Context.ActorOf`
with a name that still belongs to the terminating child and throws
`InvalidActorNameException`. The `_instanceActors` dictionary check does not prevent
this — the dictionary entry is removed immediately, but the Akka child registry is
not. The 500 ms delay is also unconditionally added to every redeploy latency.
**Recommendation**
Watch the terminating child (`Context.Watch`) and recreate the Instance Actor only
after receiving the `Terminated` message, instead of guessing with a timer. Buffer
or stash the in-flight `DeployInstanceCommand` (and any further commands for that
instance) until termination completes.
**Resolution**
Resolved 2026-05-16 (`<pending>`): `HandleDeploy` no longer uses a fixed 500 ms
reschedule. When a redeployment targets a running instance, the existing Instance Actor
is `Context.Watch`-ed and stopped, and the in-flight `DeployInstanceCommand` is buffered
in a `_pendingRedeploys` map keyed by the terminating actor ref. A new `Terminated`
handler recreates the Instance Actor only after the predecessor (and its whole subtree)
has fully stopped, eliminating the `InvalidActorNameException` race and the
unconditional redeploy-latency penalty. The shared `ApplyDeployment` helper also skips
the `_totalDeployedCount` increment for redeployments, so the deployed-instance count no
longer drifts (this additionally addresses the root cause behind SiteRuntime-004).
### SiteRuntime-004 — `_totalDeployedCount` is incremented on redeployment of an existing instance
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:239` |
**Description**
In `HandleDeploy`, the existing-actor branch (line 223) reschedules the command and
returns. When the rescheduled command runs, no actor exists, so the code falls
through to the "new instance" branch and executes `_totalDeployedCount++`
(line 239). A redeployment is an *update* of an already-deployed instance, not a new
one, so the deployed count is over-counted by one on every redeploy.
`StoreDeployedConfigAsync` uses UPSERT semantics, so the SQLite row count does not
grow, but the in-memory `_totalDeployedCount` (reported to the health collector via
`UpdateInstanceCounts`) drifts upward and the reported "disabled" count becomes
wrong.
**Recommendation**
Only increment `_totalDeployedCount` when the instance is genuinely new. Either
track whether this deploy replaced an existing config, or derive the deployed count
from storage / the union of running actors and disabled configs rather than
maintaining a hand-incremented counter.
**Resolution**
_Unresolved._
### SiteRuntime-005 — Deployment reports `Success` to central before persistence completes
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:272` |
**Description**
`HandleDeploy` replies to central with `DeploymentStatus.Success` immediately after
creating the Instance Actor, while the SQLite persistence (`StoreDeployedConfigAsync`
+ `ClearStaticOverridesAsync`) runs asynchronously on a `Task.Run`. If persistence
fails, `HandleDeployPersistenceResult` only logs an error — central has already been
told the deployment succeeded. On a subsequent node restart or failover the instance
will not be re-created (it is not in SQLite), so the deployment is silently lost
despite central recording success. This contradicts the design's intent that the
site is the durable source of truth for deployed configs.
**Recommendation**
Persist the config before replying, or treat a persistence failure as a deployment
failure and send a corrective `DeploymentStatusResponse`/health signal to central.
At minimum, do not report `Success` until the config row is committed.
**Resolution**
_Unresolved._
### SiteRuntime-006 — Site-local repositories read `SiteStorageService` private field via reflection
| | |
|--|--|
| Severity | Medium |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:183`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs:181` |
**Description**
Both repositories' `CreateConnection()` use `Type.GetField("_connectionString",
BindingFlags.NonPublic | BindingFlags.Instance)` to extract the private connection
string out of `SiteStorageService`. This is brittle (any rename or refactor of the
field breaks it at runtime, not compile time), defeats encapsulation, and the
accompanying XML comment openly describes it as a "pragmatic" hack and is internally
contradictory (it states a connection string is "passed separately at DI
registration time" which is not what the code does). It also sits awkwardly against
the project's own script trust model, which forbids `System.Reflection` in scripts.
**Recommendation**
Expose the connection string properly: add an `ISiteStorageConnectionProvider`
(already referenced in `ServiceCollectionExtensions` XML docs but not used), or have
`SiteStorageService` expose a `CreateConnection()` factory, and inject that into the
repositories. Remove the reflection entirely.
**Resolution**
_Unresolved._
### SiteRuntime-007 — Synthetic entity IDs use the non-deterministic `string.GetHashCode()`
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:241`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs:254` |
**Description**
`GenerateSyntheticId` computes `name.GetHashCode() & 0x7FFFFFFF`. On .NET Core,
`string.GetHashCode()` is randomized per process by default, so the "stable
deterministic synthetic ID" promised by the XML comment is not stable at all — it
changes every time the process restarts. Any caller that obtained an ID and later
calls `GetExternalSystemByIdAsync`/`GetNotificationListByIdAsync` after a restart
will fail to find the entity. It also risks collisions: distinct names can hash to
the same 31-bit value, and `GetExternalSystemByIdAsync` would then return the wrong
row.
**Recommendation**
Use a deterministic, collision-resistant hash (e.g. a stable FNV-1a or the first
bytes of a SHA-256 of the name) if a synthetic integer ID is genuinely required, or
better, change the repository contract to key these site-local artifacts by name
rather than synthesising integer IDs.
**Resolution**
_Unresolved._
### SiteRuntime-008 — Blocking `.GetAwaiter().GetResult()` on the actor thread during startup
| | |
|--|--|
| Severity | Medium |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:479` |
**Description**
`LoadSharedScriptsFromStorage` is called synchronously from
`HandleStartupConfigsLoaded` (the actor's message handler) and performs
`_storage.GetAllSharedScriptsAsync().GetAwaiter().GetResult()` followed by Roslyn
compilation of every shared script. This blocks the DeploymentManager singleton's
mailbox thread for the full duration of the SQLite read and all shared-script
compilation. On the default dispatcher this also ties up a thread-pool thread and
risks thread-pool starvation, and the singleton cannot process any other message
(deployments, lifecycle commands, debug routing) until it returns. The rest of the
class correctly uses `PipeTo`/`ContinueWith`.
**Recommendation**
Load shared scripts asynchronously and `PipeTo(Self)` an internal message, the same
pattern already used for `StartupConfigsLoaded`. Perform compilation either inside
the piped continuation handler (still on the actor thread but at least off the
synchronous startup path) or on a dedicated background task whose result is piped
back.
**Resolution**
_Unresolved._
### SiteRuntime-009 — Script execution actors run scripts on the default thread pool, not a dedicated dispatcher
| | |
|--|--|
| Severity | Medium |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs:72`, `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:289`, `src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs:57` |
**Description**
The design (CLAUDE.md "Architecture & Runtime") states Script Execution Actors run
on a *dedicated blocking I/O dispatcher*. The code does not do this: `ScriptActor.SpawnExecution`
and `AlarmActor.SpawnAlarmExecution` create the execution actors with no
`.WithDispatcher(...)`, and the execution itself runs inside a bare `Task.Run`,
i.e. on the shared .NET thread pool. The `// NOTE: In production, configure a
dedicated ... dispatcher` comments acknowledge the gap but it ships unconfigured.
Scripts can perform synchronous blocking I/O (`Database.Connection`, synchronous
`ExternalSystem.Call`); running them on the shared pool can starve it and stall
unrelated Akka dispatchers and HTTP request handling under load.
**Recommendation**
Define the dedicated dispatcher in HOCON and chain `.WithDispatcher(...)` on the
execution actor `Props`. If the `Task.Run` model is kept, run script bodies on a
dedicated `TaskScheduler` / bounded scheduler rather than the global pool. Either
way, remove the "in production, configure…" comments by actually configuring it.
**Resolution**
_Unresolved._
### SiteRuntime-010 — `EnsureDclConnections` never updates a connection whose configuration changed
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:413` |
**Description**
`EnsureDclConnections` tracks created connections in `_createdConnections` and skips
any name already present (`if (_createdConnections.Contains(name)) continue;`). The
skip is purely name-based: if a redeployment (or an artifact deployment) changes the
endpoint, credentials, backup endpoint, or `FailoverRetryCount` of an existing
connection, the new configuration is silently ignored and the DCL keeps using the
stale `CreateConnectionCommand`. There is no `UpdateConnectionCommand` path. The
design states that after artifact deployment the site is fully self-contained with
current configuration; this caching breaks that for connection changes.
**Recommendation**
Compare the incoming connection config against the last one sent and re-issue a
create/update command when it differs, or have the DCL treat `CreateConnectionCommand`
as idempotent upsert and always forward it. Key the cache on a config hash, not just
the name.
**Resolution**
_Unresolved._
### SiteRuntime-011 — Trust-model validation is a substring scan and is both over- and under-inclusive
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs:52` |
**Description**
`ValidateTrustModel` enforces the script trust model by doing raw `string.Contains` /
`IndexOf` on the script source text for forbidden namespace strings. This is
unreliable in both directions:
- **Bypass (under-inclusive):** the check looks only for the literal namespace
strings. A script can reach forbidden APIs without ever writing `System.IO` etc. —
e.g. via fully-qualified type use through aliasing, `global::`-prefixed names, or
simply because the namespace is already imported transitively. The compilation
references include `typeof(object).Assembly` (the whole of `System.Private.CoreLib`,
which contains `System.IO.File`, `System.Threading.Thread`, `System.Reflection`,
etc.), so forbidden types are fully resolvable at compile time and the only barrier
is this textual scan.
- **False positives (over-inclusive):** any occurrence of the substring in a comment,
string literal, or an unrelated identifier (e.g. a variable named `ProcessThreading`)
triggers a violation; the `AllowedExceptions` logic only rescues exact prefixes.
- The dead `isAllowed` variable at line 64 is computed and never used.
**Recommendation**
Enforce the trust model with a Roslyn `SyntaxWalker`/semantic analysis (inspect
resolved symbols and their containing namespaces/assemblies), or restrict the
compilation's metadata references and `AssemblyLoadContext` so forbidden types are
genuinely unavailable, rather than relying on source-text matching. Remove the
unused `isAllowed` variable.
**Resolution**
_Unresolved._
### SiteRuntime-012 — `AttributeAccessor`/`ScopeAccessors` block the script on a synchronous Ask
| | |
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Scripts/ScopeAccessors.cs:28` |
**Description**
`AttributeAccessor`'s indexer getter calls `_ctx.GetAttribute(...).GetAwaiter().GetResult()`,
synchronously blocking the script-execution thread on an actor Ask. Combined with
SiteRuntime-009 (scripts run on the shared thread pool) this means a script that
reads several attributes via `Attributes["X"]` holds a pool thread blocked for each
round-trip. The async variants (`GetAsync`/`SetAsync`) exist but the ergonomic
indexer encourages the blocking path. The XML comment notes "Reads block on the
actor Ask" but does not warn about the thread-pool impact.
**Recommendation**
Once a dedicated script dispatcher exists (SiteRuntime-009) the blocking is contained
to that pool, which is acceptable; until then, document the cost clearly and prefer
steering script authors to the async accessors. Consider making the indexer
internal-only and exposing only the async API.
**Resolution**
_Unresolved._
### SiteRuntime-013 — `HandleUnsubscribeDebugView` does nothing despite documented behaviour
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:414` |
**Description**
`HandleUnsubscribeDebugView` is documented ("Debug view unsubscribe — removes
subscription") and the actor registers a handler for `UnsubscribeDebugViewRequest`,
but the body only logs a debug message — there is no subscription state in the
Instance Actor to remove. The design places the actual subscription lifecycle in
`SiteStreamManager` (`Subscribe`/`Unsubscribe`/`RemoveSubscriber`), so the Instance
Actor genuinely has nothing to do here. The handler and its XML comment are
therefore misleading: a reader expects it to tear down a subscription.
**Recommendation**
Either remove the no-op handler and route `UnsubscribeDebugViewRequest` to wherever
the `SiteStreamManager` subscription is actually cancelled, or correct the XML
comment to state explicitly that subscription teardown is handled by
`SiteStreamManager` and this handler is a no-op acknowledgement.
**Resolution**
_Unresolved._
### SiteRuntime-014 — Trigger-expression evaluation blocks the coordinator actor thread
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:219`, `src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs:389` |
**Description**
`EvaluateExpressionTrigger` (ScriptActor) and `EvaluateExpression` (AlarmActor) run a
compiled Roslyn script with `.RunAsync(...).GetAwaiter().GetResult()` directly inside
the actor's `AttributeValueChanged` message handler. This blocks the coordinator
actor's mailbox thread for up to the 2-second timeout on every monitored attribute
change. Coordinator actors are on the default dispatcher and process the hot path of
attribute-change fan-out; a slow expression delays all other messages to that actor
and consumes a thread-pool thread for the duration. The inline comments correctly
note CPU-bound expressions are not interruptible but do not address the
mailbox-blocking concern.
**Recommendation**
Trigger expressions are expected to be cheap, but to keep the actor responsive
consider evaluating them off the actor thread (pipe the boolean result back as an
internal message) or pre-compiling to a plain delegate that executes near-instantly
without the Roslyn scripting `RunAsync` machinery.
**Resolution**
_Unresolved._
### SiteRuntime-015 — `LoggerFactory` created per Instance Actor and never disposed
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:746` |
**Description**
`CreateInstanceActor` does `var loggerFactory = new LoggerFactory();` for every
Instance Actor it creates, uses it once to produce an `ILogger<InstanceActor>`, and
never disposes it. `LoggerFactory` is `IDisposable`. With up to 500 instances (and
churn from redeployments) this leaks a factory per instance, and the produced
loggers are detached from the application's configured logging providers, so
Instance Actor logs may not be routed/filtered consistently with the rest of the
host.
**Recommendation**
Inject the application's `ILoggerFactory` (or an `ILogger<InstanceActor>` factory
delegate) into `DeploymentManagerActor` via DI and reuse it, rather than newing one
up per child. Do not create a fresh `LoggerFactory` in a hot creation path.
**Resolution**
_Unresolved._
### SiteRuntime-016 — Short-lived execution actors, replication actor, and repositories are untested
| | |
|--|--|
| Severity | Low |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.SiteRuntime.Tests/` |
**Description**
The test project covers the coordinator actors (`InstanceActor`, `ScriptActor`,
`AlarmActor`, `DeploymentManagerActor`), persistence, scripting and streaming, but a
search of the test sources finds no references to `ScriptExecutionActor`,
`AlarmExecutionActor`, `SiteReplicationActor`, `SiteExternalSystemRepository`, or
`SiteNotificationRepository`. These cover critical paths: script timeout/failure
handling and result reply, alarm on-trigger execution, peer config/S&F replication
(including the `SendToPeer` no-peer drop), and the reflection-based repository reads.
Several findings above (001/002 mis-routing, 007 ID instability, 011 trust bypass)
would likely have been caught by targeted tests.
**Recommendation**
Add unit/integration tests for the execution actors (success, timeout, exception,
Ask-reply, PoisonPill self-stop), `SiteReplicationActor` (outbound forward, inbound
apply, peer tracking on cluster events), and the two repositories (round-trip read,
synthetic-ID lookup, missing-row behaviour).
**Resolution**
_Unresolved._
+555
View File
@@ -0,0 +1,555 @@
# Code Review — StoreAndForward
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.StoreAndForward` |
| Design doc | `docs/requirements/Component-StoreAndForward.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 11 |
## Summary
The Store-and-Forward module is small and readable, with a clean SQLite persistence
layer, a sensible service API, and reasonable test coverage of the storage and service
happy paths. However the review surfaced two issues that undermine the module's core
purpose. First, the active delivery path never invokes the `ReplicationService` —
`ReplicateEnqueue/Remove/Park` have no callers anywhere in the codebase, so buffered
messages are not replicated to the standby node and the design's failover-durability
guarantee (Component doc "Persistence", CLAUDE.md "Store-and-Forward") is not met.
Second, there is an off-by-one in retry accounting: the immediate-failure path stores a
buffered message with `RetryCount = 1`, so a message configured with `MaxRetries = N`
is actually attempted `N` times in total rather than one immediate attempt plus `N`
retries, and a per-source `MaxRetries` of 1 produces zero retry attempts. Additional
themes: SQLite connection-per-call with no transactional grouping of multi-statement
operations, no concurrency guard against a parked message being retried while the
sweep is mid-flight, an unused enum member (`InFlight`) that drifts from the documented
status set, and untested critical paths (retry-due timing, replication-from-active,
the actor bridge). None of the findings are blockers for compilation, but the
replication and retry-count issues are functional defects against the design.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☑ | Off-by-one in retry counting (003); parked-message retry timing (010). |
| 2 | Akka.NET conventions | ☑ | `ContinueWith` used instead of `PipeTo`-friendly continuations; default supervision; see 007. |
| 3 | Concurrency & thread safety | ☑ | Sweep guarded by `Interlocked`, but no guard against retry-vs-manage races (005); `OnActivity` event not thread-safe (009). |
| 4 | Error handling & resilience | ☑ | Replication never invoked from active path (001); no-handler messages buffered then stuck (002). |
| 5 | Security | ☑ | No issues found — parameterised SQL throughout; no secrets handled directly; payload JSON treated opaquely. |
| 6 | Performance & resource management | ☑ | New SQLite connection per call; multi-statement operations not wrapped in a transaction (006, 008). |
| 7 | Design-document adherence | ☑ | Replication gap (001); `InFlight` status undocumented/unused (011); "retrying" status from design doc not modelled. |
| 8 | Code organization & conventions | ☑ | `StoreAndForwardMessage` is an entity-like POCO living in the component, not Commons (012). |
| 9 | Testing coverage | ☑ | Retry-due timing, replication-from-active, and `ParkedMessageHandlerActor` are untested (013). |
| 10 | Documentation & comments | ☑ | XML doc on `RegisterDeliveryHandler` contract is inconsistent with code (004). |
## Findings
### StoreAndForward-001 — Replication to standby is never triggered by the active node
| | |
|--|--|
| Severity | Critical |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.StoreAndForward/ReplicationService.cs:40`, `:53`, `:66`; `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:155`, `:212`, `:222`, `:236` |
**Description**
`ReplicationService` exposes `ReplicateEnqueue`, `ReplicateRemove` and `ReplicatePark`
to forward buffer operations to the standby node, but a codebase-wide search shows these
methods have no callers. `StoreAndForwardService` — which performs every add (`EnqueueAsync`
line 155 / 163), remove (`RemoveMessageAsync` call at line 212) and park
(`UpdateMessageAsync` calls at lines 222/236) — holds no reference to `ReplicationService`
and never invokes it. Only the receiving half is wired (`SetReplicationHandler` and
`ApplyReplicatedOperationAsync` are used by `SiteReplicationActor`). The Component design
doc ("Persistence") and CLAUDE.md ("Store-and-Forward") require the active node to
forward every buffer operation to the standby so that, on failover, the new active node
"has a near-complete copy of the buffer." As written, the standby's S&F SQLite database
stays empty and a failover loses the entire buffer — a data-loss defect against a core
requirement.
**Recommendation**
Inject `ReplicationService` into `StoreAndForwardService` and call `ReplicateEnqueue`
after a successful `_storage.EnqueueAsync`, `ReplicateRemove` after `RemoveMessageAsync`,
and `ReplicatePark` after a park-causing `UpdateMessageAsync`. Update
`ServiceCollectionExtensions.AddStoreAndForward` to pass the dependency. Add a test that
asserts the replication handler observes each operation type.
**Resolution**
Resolved 2026-05-16. `ReplicationService` is now injected into `StoreAndForwardService`
(wired in `AddStoreAndForward`), and every buffer operation is forwarded to the standby:
a new `BufferAsync` helper calls `ReplicateEnqueue` after each persist, `ReplicateRemove`
runs after a successful retry removes a message, and `ReplicatePark` runs on both park
paths. Replication stays fire-and-forget and is a no-op when `ReplicationEnabled` is
false or no handler is wired. Regression tests `StoreAndForwardReplicationTests` assert
the replication handler observes the Add, Remove and Park operations. Fixed by the
commit whose message references `StoreAndForward-001`.
### StoreAndForward-002 — Messages enqueued with no registered handler are buffered but never deliverable
| | |
|--|--|
| Severity | Low |
| Original severity | High (re-triaged down to Low on 2026-05-16 — see Re-triage note) |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:162`, `:201` |
**Description**
`EnqueueAsync` falls through to "No handler registered — buffer for later" (line 162)
when no delivery handler is registered for the category. The retry sweep
(`RetryMessageAsync`, line 201) then logs "No delivery handler for category" and
`return`s without touching the message. No caller in the codebase ever calls
`RegisterDeliveryHandler` (the External System Gateway, Notification Service and
Database Gateway only call `EnqueueAsync`), so in the current wiring **every** buffered
message lands in this dead state: it is persisted, counts toward buffer depth, but can
never be retried, delivered or parked. It will sit Pending forever. Either the handler
registration is missing from Host/gateway startup, or the "buffer for later" path is a
silent trap. Either way the engine cannot deliver anything.
**Recommendation**
Decide the intended contract. If handlers are expected to be registered before
`EnqueueAsync` is reachable, make `EnqueueAsync` reject (or log an error) when no
handler exists rather than silently buffering an undeliverable message, and wire
`RegisterDeliveryHandler` calls in Host startup for all three categories. If late
registration is intended, the retry sweep should treat a still-missing handler as a
transient condition with bounded logging rather than a permanent no-op.
**Re-triage note (2026-05-16)**
The finding's central factual claim — *"No caller in the codebase ever calls
`RegisterDeliveryHandler`"* and therefore *"every buffered message lands in this dead
state"* — is **no longer true at the reviewed code**. `ScadaLink.Host`
(`AkkaHostedService.RegisterSiteActors`, `AkkaHostedService.cs:353-379`) registers all
three delivery handlers (`ExternalSystem`, `CachedDbWrite`, `Notification`) at site
startup, immediately after `StoreAndForwardService.StartAsync()`. The finding was
written against commit `9c60592` before that wiring existed; the High-severity
"engine cannot deliver anything" outcome no longer occurs.
The remaining residual risk is narrow: a message enqueued for a category that genuinely
has no handler (e.g. an enqueue racing ahead of `RegisterDeliveryHandler`, or a future
category added without a handler) is still buffered and then skipped by the sweep
forever. That is a real but minor robustness gap, hence the **downgrade to Low**.
It is left **Open** rather than fixed in this pass because the finding's recommended
fix — making `EnqueueAsync` reject when no handler is registered — is a behavioural
contract change, not a localised bug fix: the "buffer with no handler yet" path is
exercised by `StoreAndForwardReplicationTests` and by three NotificationService and
ExternalSystemGateway tests (`Send_TransientError_WithStoreAndForward_BuffersMessage`,
`Send_Smtp4xxCommandException_ClassifiedTransientAndBuffered`,
`Send_SmtpProtocolException_ClassifiedTransient`) which construct a real
`StoreAndForwardService` without registering a handler and assert `WasBuffered`.
Changing the contract requires deciding whether late handler registration is supported
and updating tests in modules outside this review's edit scope — a design decision that
should be made deliberately rather than forced here.
**Resolution**
_Open — re-triaged to Low. Premise (no handler registration anywhere) is stale: Host
now wires all three handlers. Residual gap is minor and the prescribed fix is a
cross-module contract change needing a design decision._
### StoreAndForward-003 — Off-by-one in retry accounting: immediate failure pre-counts as retry 1
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:153`, `:229`, `:233` |
**Description**
On a transient immediate-delivery failure, `EnqueueAsync` buffers the message with
`message.RetryCount = 1` (line 153). The retry sweep then increments `RetryCount` before
the max check (`RetryCount++` at line 229; `RetryCount >= MaxRetries` at line 233).
Consequences: (1) a message configured with `MaxRetries = 1` is parked on the *first*
retry sweep without ever being retried, because after the immediate attempt `RetryCount`
is already 1 and the first sweep makes it 2 ≥ 1 — zero actual retries occur, contradicting
the design intent that the immediate attempt and the retry budget are distinct;
(2) the design doc's `Retry Count` field is "Number of attempts so far," but here it is
seeded to 1 before any *retry* has happened, making the parked-message `AttemptCount`
shown to operators off by one relative to configured `MaxRetries`. The
`EnqueueAsync_TransientFailure_BuffersForRetry` test even asserts `RetryCount == 1`,
locking in the ambiguity.
**Recommendation**
Choose one consistent meaning for `RetryCount` (recommended: total delivery attempts,
including the immediate one) and apply it uniformly. If `MaxRetries` is meant to bound
*retries* after the immediate attempt, buffer with `RetryCount = 0` and treat the
immediate failure as attempt 0; if it bounds *total attempts*, document that and adjust
the comparison. Update the affected test to match the chosen semantics.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`). `RetryCount` now consistently means "number
of background retry-sweep attempts so far"; the initial immediate (or caller-made)
delivery attempt is attempt 0 and is not counted, and `MaxRetries` bounds retry-sweep
attempts after that initial attempt. `EnqueueAsync` no longer seeds `RetryCount = 1` on
either the transient-immediate-failure path or the `attemptImmediateDelivery: false`
path — a freshly buffered message has `RetryCount = 0`. `RetryMessageAsync` already
increments before the `>= MaxRetries` check, which is now correct, so a message with
`MaxRetries = 1` gets exactly one real retry before parking (previously zero). The
`StoreAndForwardMessage.RetryCount` XML doc was corrected to match. Regression test
`RetryPendingMessagesAsync_MaxRetriesOne_PerformsExactlyOneRetryBeforeParking` asserts
the immediate attempt plus exactly one retry occur before parking; the affected
existing tests (`EnqueueAsync_TransientFailure_BuffersForRetry`,
`EnqueueAsync_AttemptImmediateDeliveryFalse_BuffersWithoutInvokingHandler`,
`RetryPendingMessagesAsync_MaxRetriesReached_ParksMessage`) were updated to the
corrected semantics.
### StoreAndForward-004 — `RegisterDeliveryHandler` XML doc contradicts the implemented contract
| | |
|--|--|
| Severity | Medium |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:38`, `:60` |
**Description**
The XML comment on the handler delegate (lines 37-40) says "Returns true on success,
throws on transient failure. Permanent failures should return false (message will NOT
be buffered)." That last clause is wrong for the retry path: in `RetryMessageAsync`,
a handler returning `false` does not "not buffer" — the message is already buffered, and
the code *parks* it immediately (lines 218-224). The comment describes only the
`EnqueueAsync` immediate path and misleads anyone implementing a handler about what
`false` means once a message is in the retry loop.
**Recommendation**
Reword the contract to cover both paths explicitly: `true` = delivered (remove from
buffer); `false` = permanent failure (not buffered on immediate attempt, parked on a
retry); exception = transient failure (buffer / increment retry).
**Resolution**
_Unresolved._
### StoreAndForward-005 — Parked-message retry/discard can race with the in-progress retry sweep
| | |
|--|--|
| Severity | Medium |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:184`, `:266`, `:280` |
**Description**
`RetryPendingMessagesAsync` loads a snapshot of due messages (line 179) and then
processes them one by one (line 184), `await`-ing delivery for each. Meanwhile
`RetryParkedMessageAsync` / `DiscardParkedMessageAsync` (operator actions arriving via
`ParkedMessageHandlerActor`) run on unrelated threads and mutate the same rows. Because
each operation opens its own SQLite connection and there is no row-level coordination,
an operator can `DiscardParkedMessageAsync` a message that the sweep is concurrently
delivering: the sweep's later `RemoveMessageAsync`/`UpdateMessageAsync` operates on a
now-deleted row (harmless) — but if an operator `RetryParkedMessageAsync` resets a row
to Pending while the sweep simultaneously parks the same in-flight message, the operator
intent is silently overwritten. The `Interlocked` guard only prevents *overlapping
sweeps*, not sweep-vs-management races.
**Recommendation**
Funnel all message-state mutations through a single serialization point — e.g. perform
all S&F state changes inside the `ParkedMessageHandlerActor` (or a dedicated S&F actor)
so the actor mailbox serialises them, or make status transitions conditional in SQL
(e.g. `UPDATE ... WHERE id = @id AND status = @expected`) and re-check the affected
row count.
**Resolution**
_Unresolved._
### StoreAndForward-006 — `GetParkedMessagesAsync` count and page run without a transaction
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:166`, `:175` |
**Description**
`GetParkedMessagesAsync` issues a `COUNT(*)` and then a separate paged `SELECT` as two
commands on the same connection with no surrounding transaction. A concurrent
enqueue/park/discard between the two statements yields a `TotalCount` inconsistent with
the returned page (e.g. total reported as 51 while only 50 distinct parked rows now
exist, or a row visible in the page but excluded from the count). For a paginated UI
this produces flickering totals and occasional off-by-one page math.
**Recommendation**
Wrap both reads in a single transaction (`BeginTransaction`) so they see a consistent
snapshot, or accept the staleness and document it. A transaction is cheap here and
removes the inconsistency.
**Resolution**
_Unresolved._
### StoreAndForward-007 — Async work in `ParkedMessageHandlerActor` uses `ContinueWith` without scheduler/affinity guarantees
| | |
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/ParkedMessageHandlerActor.cs:34`, `:68`, `:87` |
**Description**
The three handlers call a `Task`-returning service method and chain `.ContinueWith(...)
.PipeTo(sender)`. `Sender` is correctly captured into a local first, so the closure is
safe. However `ContinueWith` without an explicit `TaskScheduler` runs the continuation
on a thread-pool thread and the captured continuation builds the response objects there
— acceptable since it only touches locals, but it bypasses the idiomatic
`PipeTo`-with-success/failure-projection pattern and is fragile if someone later adds a
line touching actor state inside the continuation. There is also no `TaskContinuationOptions`,
so a faulted antecedent still runs the continuation (handled here via `IsCompletedSuccessfully`,
but only by convention).
**Recommendation**
Replace `ContinueWith(...).PipeTo(sender)` with `PipeTo(sender, success: result => ...,
failure: ex => ...)`, which is the documented Akka pattern, keeps response construction
off the actor thread safely, and makes the success/failure branches explicit.
**Resolution**
_Unresolved._
### StoreAndForward-008 — A SQLite connection is opened and torn down on every storage call
| | |
|--|--|
| Severity | Low |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:28`, `:61`, `:93`, `:117`, `:144`, `:162`, `:199`, `:221`, `:237`, `:267`, `:285`, `:305`, `:319` |
**Description**
Every method in `StoreAndForwardStorage` constructs a fresh `SqliteConnection` and calls
`OpenAsync`. Microsoft.Data.Sqlite pools connections, so this is not a correctness bug,
but a retry sweep over a large buffer performs one open per `UpdateMessageAsync`/
`RemoveMessageAsync` call inside the loop (`RetryMessageAsync`), multiplying connection
churn under load. With no max buffer size (by design) the buffer can grow large, so the
per-message connection acquisition is a measurable overhead on the hot retry path.
**Recommendation**
Consider a batched retry API that opens one connection (and one transaction) per sweep,
or pass an open connection into the per-message update calls. At minimum, document that
the design relies on the Sqlite connection pool for acceptable performance.
**Resolution**
_Unresolved._
### StoreAndForward-009 — `OnActivity` event invocation is not thread-safe against concurrent subscribe/unsubscribe
| | |
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:46`, `:309` |
**Description**
`OnActivity` is a public `event Action<...>` raised via `OnActivity?.Invoke(...)` in
`RaiseActivity` (line 309). `RaiseActivity` is called from both `EnqueueAsync` (caller
thread) and `RetryMessageAsync` (timer thread). The `?.Invoke` null-conditional captures
the delegate once so it will not NRE, but there is no synchronisation around the event
field itself; a subscriber added/removed concurrently with a raise has no defined
ordering. More importantly, subscriber callbacks run synchronously on the timer thread,
so a slow or throwing subscriber stalls or aborts the retry sweep (an exception in a
subscriber propagates out of `RaiseActivity` into `RetryMessageAsync`'s `try` and is
swallowed as a "transient failure," wrongly incrementing the message's retry count).
**Recommendation**
Snapshot the delegate (already done) and additionally wrap subscriber invocation in a
`try/catch` so a faulting logging subscriber cannot be misclassified as a delivery
failure. Document that handlers must be fast and non-throwing, or dispatch activity
notifications asynchronously.
**Resolution**
_Unresolved._
### StoreAndForward-010 — Retry of a parked message does not reset `LastAttemptAt`, so its retry timing is unspecified
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:203`, `:101` |
**Description**
`RetryParkedMessageAsync` sets `status = Pending, retry_count = 0, last_error = NULL`
but leaves `last_attempt_at` unchanged (lines 203-206). The retry-due query
(`GetMessagesForRetryAsync`, lines 101-105) selects Pending rows where
`last_attempt_at IS NULL OR ... elapsed >= retry_interval_ms`. A message parked after
exhausting retries has an old `last_attempt_at`; once re-queued, the elapsed time since
that stale timestamp is almost certainly already greater than the retry interval, so the
operator-retried message is attempted on the very next sweep regardless of the
configured interval. That is probably the desired behaviour (operator wants it tried
now), but it is unspecified and fragile — it holds only because the stale timestamp
happens to be older than the interval; with a very large `retry_interval_ms` the message
would instead wait, so "try immediately" occurs by accident rather than by design.
**Recommendation**
Explicitly decide and encode the intent: either set `last_attempt_at = NULL` on
re-queue so the message is unambiguously due now, or set it to "now" so it waits one
interval. Document the chosen behaviour in the method's XML comment.
**Resolution**
_Unresolved._
### StoreAndForward-011 — `StoreAndForwardMessageStatus.InFlight` is unused and the doc's "retrying" status is unmodelled
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.Commons/Types/Enums/StoreAndForwardMessageStatus.cs:9`; `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:219`, `:235` |
**Description**
The enum defines `Pending, InFlight, Parked, Delivered`. The module only ever uses
`Pending` and `Parked` — `InFlight` and `Delivered` are never assigned (delivered
messages are deleted, not marked `Delivered`). Meanwhile the Component design doc
("Message Format" -> Status) specifies the set "Pending, retrying, or parked." So the
code's enum drifts from the doc in two directions: it carries dead members the doc does
not mention (`InFlight`, `Delivered`) and omits the doc's `retrying` state. A message
mid-retry is indistinguishable from one that has never been attempted.
**Recommendation**
Reconcile the enum with the design. Either drop the unused members and update the doc,
or implement the documented `retrying` state and use `InFlight` to mark a message the
sweep is actively delivering (which would also help with finding 005).
**Resolution**
_Unresolved._
### StoreAndForward-012 — `StoreAndForwardMessage` is a persistence entity but lives in the component, not Commons
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardMessage.cs:9` |
**Description**
`StoreAndForwardMessage` is a persistence-ignorant POCO that maps directly to the
`sf_messages` table and is also carried across the network inside `ReplicationOperation`
(replicated to the standby node over Akka remoting). CLAUDE.md "Code Organization" states
that entity classes are persistence-ignorant POCOs in Commons and that message contracts
follow additive-only evolution. Because this type doubles as a replication wire contract
but lives in the component assembly, it is not co-located with the other Commons
entities and its evolution is not governed by the additive-only message-contract rule.
This is a borderline case (the type is site-local), but the cross-node use via
`ReplicationOperation` makes it a de-facto message contract.
**Recommendation**
Either move `StoreAndForwardMessage` (and `ReplicationOperation`) into the Commons
`Entities`/`Messages` hierarchy so they are governed by the contract-evolution rules, or
introduce a separate DTO for replication and keep `StoreAndForwardMessage` purely as the
local persistence model. Document the decision.
**Resolution**
_Unresolved._
### StoreAndForward-013 — Critical paths lack test coverage: retry-due timing, replication-from-active, and the actor bridge
| | |
|--|--|
| Severity | Medium |
| Category | Testing coverage |
| Status | Open |
| Location | `tests/ScadaLink.StoreAndForward.Tests/` (whole directory); `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:101`; `src/ScadaLink.StoreAndForward/ParkedMessageHandlerActor.cs` |
**Description**
The existing tests cover storage CRUD and the service happy/failure paths well, but
three important behaviours are untested: (1) the retry-due time filter in
`GetMessagesForRetryAsync` — every service test sets `DefaultRetryInterval = TimeSpan.Zero`,
so the `julianday` elapsed-time comparison (the most error-prone SQL in the module) is
never exercised with a non-zero interval; a message that is *not yet due* should be
skipped, and that is never verified. (2) Replication from the active side — no test
asserts that an enqueue/remove/park causes a `Replicate*` call (this is exactly the gap
behind finding 001; a test would have caught it). (3) `ParkedMessageHandlerActor` has no
test at all — the Query/Retry/Discard request-to-response mapping and the
`ExtractMethodName` JSON parsing are unverified, including the malformed-JSON branch.
**Recommendation**
Add tests for: a non-zero retry interval where a recently-attempted message is excluded
and an older one is included; active-side replication invocation per operation type
(once finding 001 is fixed); and `ParkedMessageHandlerActor` using `Akka.TestKit`,
including `ExtractMethodName` for `MethodName`, `Subject`, missing-property and
invalid-JSON payloads.
**Resolution**
_Unresolved._
### StoreAndForward-014 — Storage does not create its SQLite database directory
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Resolved |
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardStorage.cs:26` |
**Found 2026-05-16** while verifying the store-and-forward fixes — this defect was
not part of the original baseline review.
**Description**
`StoreAndForwardStorage.InitializeAsync` opened a `SqliteConnection` against the
configured `SqliteDbPath` (default `./data/store-and-forward.db`) without ensuring the
parent directory exists. SQLite creates the database *file* on demand but not its
*directory*, so when `data/` does not already exist the connection fails to open with
`SQLite Error 14: 'unable to open database file'`. Every site-host boot therefore failed
in any environment whose working directory has no `data/` folder — the cause of the six
failing `SiteActorPathTests` (the host's `RegisterSiteActors` aborts at
`StoreAndForwardService.StartAsync`). Production masked it because `data/` is created by
the Docker image / deployment.
**Recommendation**
Create the parent directory of a file-backed SQLite database before opening it.
**Resolution**
Resolved 2026-05-16. `InitializeAsync` now calls a new `EnsureDatabaseDirectoryExists`
helper that parses the connection string with `SqliteConnectionStringBuilder` and, for a
file-backed database, creates the parent directory if it is missing (in-memory databases
and bare filenames are skipped). Regression test
`InitializeAsync_FileInMissingDirectory_CreatesDirectory` fails against the pre-fix code;
all six `SiteActorPathTests` now pass. Fixed by the commit whose message references
`StoreAndForward-014`.
+532
View File
@@ -0,0 +1,532 @@
# Code Review — TemplateEngine
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.TemplateEngine` |
| Design doc | `docs/requirements/Component-TemplateEngine.md` |
| Status | Reviewed |
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 9 |
## Summary
The Template Engine is a pure central-side modeling library: stateless services
over `ITemplateEngineRepository` plus four static helper classes (collision, cycle,
lock, resolver). It has no Akka actors and no direct concurrency, so the Akka and
thread-safety categories produce nothing of substance. The code is generally
well-structured and the cascade-based composition model (derived templates owned by
composition slots) is consistently applied. However the review surfaced several real
correctness gaps. The most serious are in **flattening**: composed alarms and scripts
nested below the first level are silently dropped, derived templates omit base
alarms entirely (breaking per-slot alarm override), and the alarm-on-trigger-script
resolution step is an empty placeholder so that whole validation rule is dead.
Validation has two security-relevant weaknesses — the forbidden-API scan is a naive
substring match and the brace-balance "compile" check mispredicts on verbatim /
interpolated / raw string literals. Several documented behaviours (collision check on
create, optimistic concurrency on instance state) are claimed but not implemented.
Themes: validation that is weaker than the design promises, and asymmetric handling
of attributes vs. alarms vs. scripts throughout the resolve/flatten/derive paths.
## Checklist coverage
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ✓ | Multiple real bugs: deep composed-member loss, derived alarms omitted, granularity bypass, no-op create-time collision block. |
| 2 | Akka.NET conventions | ✓ | No actors in this module (`AddTemplateEngineActors` is an empty placeholder). Nothing to assess. |
| 3 | Concurrency & thread safety | ✓ | Services are stateless, scoped per request; static helpers hold no mutable state. Design says template editing is last-write-wins; that is honoured. See TemplateEngine-010 re: a doc claim of optimistic concurrency that is not implemented. |
| 4 | Error handling & resilience | ✓ | `Result<T>` used consistently; repository nulls guarded. `FlatteningService` wraps in try/catch. No store-and-forward or failover surface in this module. |
| 5 | Security | ✓ | No auth checks in-module (delegated to callers per design). Script trust-model enforcement is weak — see TemplateEngine-006 and TemplateEngine-007. |
| 6 | Performance & resource management | ✓ | `GetAllTemplatesAsync` reloaded on most member edits; one genuine N+1 in `TemplateDeletionService` (TemplateEngine-009). No `IDisposable` leaks (`JsonDocument`/streams disposed). |
| 7 | Design-document adherence | ✓ | Drift found: recursive composition not fully implemented in flattening; `DataType` enum naming differs from doc; optimistic-concurrency claim. |
| 8 | Code organization & conventions | ✓ | POCO entities in Commons, repo interfaces in Commons, Options pattern N/A (no options here). Duplicate deletion logic (TemplateEngine-014). |
| 9 | Testing coverage | ✓ | Tests exist for every file, but the dead/placeholder paths (TemplateEngine-004, 005) and deep nesting (TemplateEngine-001) are not exercised. |
| 10 | Documentation & comments | ✓ | Mostly accurate; a misleading converter comment (TemplateEngine-011) and a stale enum/doc mismatch (TemplateEngine-012). |
## Findings
### TemplateEngine-001 — Deeply nested composed members are dropped during flattening
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:211`, `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:535`, `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:609` |
**Description**
The design doc states composition supports "recursive nesting of feature modules"
and that nested paths extend as `[Outer].[Inner].[Member]`. `ResolveComposedAttributes`
only descends **one** level of nesting: it resolves the directly-composed module, then
its immediate child compositions, and stops. A module composed three or more levels
deep contributes no attributes to the flattened configuration. `ResolveComposedAlarms`
and `ResolveComposedScripts` are worse — they handle only the first (direct) level and
do not descend at all, so any alarm or script in a nested composed module is dropped
entirely. `CollisionDetector` and `TemplateResolver` recurse fully, so collision
detection and the authoring UI will show members that the deployed configuration
silently lacks.
**Recommendation**
Replace the hand-unrolled one/two-level loops with a single recursive walk
(carrying the accumulated path prefix) for attributes, alarms, and scripts, matching
the recursion already in `TemplateResolver.AddComposedMembers` and
`CollisionDetector.CollectComposedMembers`.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): replaced the hand-unrolled
one/two-level composition loops in `ResolveComposedAttributes`,
`ResolveComposedAlarms`, and `ResolveComposedScripts` with single recursive
walks (`*Recursive` helpers) carrying the accumulated path prefix and a
`visited` set, so composed members at arbitrary nesting depth are resolved.
Regression tests: `Flatten_ThreeLevelComposition_AttributesAlarmsScriptsAllResolved`,
`Flatten_NestedComposedAlarm_TriggerAttributePrefixed`.
### TemplateEngine-002 — Derived templates omit all base alarms; composed alarms cannot be overridden per slot
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:799` |
**Description**
`BuildDerivedTemplate` copies the base template's `Attributes` and `Scripts` into the
new derived template as `IsInherited = true` placeholder rows so they can be overridden
per composition slot, but there is **no loop for `Alarms`**. The derived template
therefore has zero alarm rows. The `TemplateAlarm` entity also has no `IsInherited` or
`LockedInDerived` fields (unlike `TemplateAttribute` / `TemplateScript`), so even if a
copy loop were added there is no mechanism to mark a copied alarm as inherited or to
override one. The design's Override Granularity section explicitly requires composed
alarm fields (Priority, Trigger thresholds, Description, On-Trigger Script) to be
overridable. As written, a composed module's alarms cannot be tuned for the slot they
are used in.
**Recommendation**
Add an alarm copy loop to `BuildDerivedTemplate` and add `IsInherited` /
`LockedInDerived` fields to `TemplateAlarm`, mirroring `TemplateAttribute`. Update
`UpdateAlarmAsync` to honour them as `UpdateAttributeAsync` / `UpdateScriptAsync`
already do.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): implemented the per-slot alarm
override mechanism as a coordinated `Commons` + `ConfigurationDatabase` +
`TemplateEngine` change, mirroring the existing attribute/script override
design. Added `IsInherited` / `LockedInDerived` to the `TemplateAlarm` POCO
(`ScadaLink.Commons`) and an EF migration `AddDerivedAlarmFields` adding two
`bit NOT NULL DEFAULT 0` columns to `TemplateAlarms`. `BuildDerivedTemplate`
now copies base alarms as `IsInherited = true` placeholder rows.
`FlatteningService.ResolveInheritedAlarms` skips `IsInherited` placeholder
rows so they no longer shadow the live base alarm, and `ValidateLockedInDerived`
now rejects a derived override of a `LockedInDerived` base alarm.
`UpdateAlarmAsync` honours the base `LockedInDerived` lock and persists
`IsInherited` / `LockedInDerived`, exactly as `UpdateAttributeAsync` /
`UpdateScriptAsync` do. Regression tests:
`Flatten_InheritedAlarmOnDerived_BaseValueWins`,
`Flatten_OverriddenAlarmOnDerived_DerivedValueWins`,
`Flatten_LockedInDerivedAlarmOverride_Fails`,
`AddComposition_CopiesAlarmsAsInherited`,
`UpdateAlarm_LockedInDerivedBase_RejectsOnDerived`,
`UpdateAlarm_DerivedOverride_PersistsIsInheritedFalse`.
### TemplateEngine-003 — `UpdateAttributeAsync` lets a non-locked attribute change its fixed DataType / DataSourceReference
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:285` |
**Description**
`LockEnforcer.ValidateAttributeOverride` correctly rejects a change to `DataType` or
`DataSourceReference` (both "fixed by the defining level" per the design). But the
caller only honours that error when the attribute is already locked:
```csharp
var granularityError = LockEnforcer.ValidateAttributeOverride(existing, proposed);
if (granularityError != null && existing.IsLocked)
return Result<TemplateAttribute>.Failure(granularityError);
```
Lines 293-294 then unconditionally apply `existing.DataType = proposed.DataType` and
`existing.DataSourceReference = proposed.DataSourceReference`. For the common case of an
unlocked attribute, the fixed-field guard is dead and both fields are silently mutable,
violating the override-granularity rule. (The lock-error branch of the same helper is
also redundant — a locked attribute already returns earlier inside the helper.)
**Recommendation**
Remove the `&& existing.IsLocked` condition so the granularity error is always
returned, and stop assigning `DataType` / `DataSourceReference` from `proposed` in the
apply block.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): removed the `&& existing.IsLocked`
guard in `UpdateAttributeAsync` so the fixed-field granularity error is always
honoured, and removed the unconditional `existing.DataType` /
`existing.DataSourceReference` assignments from the apply block. Regression
tests: `UpdateAttribute_UnlockedAttribute_DataTypeChangeRejected`,
`UpdateAttribute_UnlockedAttribute_DataSourceReferenceChangeRejected`.
### TemplateEngine-004 — Alarm on-trigger script references are never resolved (empty placeholder)
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/Flattening/FlatteningService.cs:695` |
**Description**
`ResolveAlarmScriptReferences` is invoked as Step 7 of `Flatten` but its body is empty
— only a comment describing what it should do. Consequently every
`ResolvedAlarm.OnTriggerScriptCanonicalName` stays `null`. This has two downstream
effects: (1) `SemanticValidator`'s "on-trigger script must exist" check
(`SemanticValidator.cs:209`) can never fire, so the design-mandated validation of
alarm on-trigger script references is silently absent; (2) `RevisionHashService` and
`DiffService` both hash/compare `OnTriggerScriptCanonicalName`, so a change to which
script an alarm triggers never affects the revision hash and is invisible to the diff
— a real staleness-detection gap.
**Recommendation**
Implement the resolution: map each alarm's `OnTriggerScriptId` (set on `TemplateAlarm`)
to the canonical name of the corresponding resolved script, accounting for composition
prefixes. If the design intends scripts to be referenced by name within scope, document
and implement that consistently.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): implemented `ResolveAlarmScriptReferences`.
Alarm resolution now records each resolved alarm's `OnTriggerScriptId` keyed by
canonical name, and script resolution records each resolved `TemplateScript.Id`
keyed by its canonical name (both honour composition path prefixes). Step 7
joins the two maps to set `ResolvedAlarm.OnTriggerScriptCanonicalName`, so the
revision hash, diff, and `SemanticValidator` on-trigger-script-exists check now
all see the reference. Regression tests:
`Flatten_AlarmOnTriggerScript_ResolvedToCanonicalName`,
`Flatten_ComposedAlarmOnTriggerScript_ResolvedWithPrefix`.
### TemplateEngine-005 — Collision validation is skipped when creating a child template
| | |
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Resolved |
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:56` |
**Description**
`CreateTemplateAsync` contains a block guarded by `if (parentTemplateId.HasValue)` that
loads `GetAllTemplatesAsync` and then does nothing but hold a comment — it never runs a
collision check. A child template created with a parent inherits the parent's members;
if the child is later given members (via `AddAttributeAsync` etc.) those calls do run
`CollisionDetector`, but the create path itself performs no naming-collision validation
and `UpdateTemplateAsync` only validates collisions on a name change. The design states
naming collisions are design-time errors that must block a save. The dead block is also
confusing and allocates an unused full-table read.
**Recommendation**
Either run a real collision check on the to-be-created template (including its
inherited members) or delete the dead block and its unused query. If create-time
collisions are genuinely impossible because a fresh template has no members, document
that explicitly instead of leaving a no-op.
**Resolution**
Resolved 2026-05-16 (commit `<pending>`): deleted the dead `if
(parentTemplateId.HasValue)` block and its unused `GetAllTemplatesAsync`
read in `CreateTemplateAsync`. A create-time collision check on a child is a
guaranteed no-op — a freshly created template has no members of its own, the
parent's members were already collision-validated on every member-mutating
call, and a new child cannot be an ancestor of its parent. Replaced the no-op
with an explanatory comment documenting that collision detection is enforced
on `AddAttribute`/`AddAlarm`/`AddScript`/`AddComposition` and on rename.
Regression test: `CreateTemplate_WithParent_DoesNotRunDeadCollisionQuery`.
### TemplateEngine-006 — Forbidden-API enforcement is a naive substring scan (bypassable and false-positive prone)
| | |
|--|--|
| Severity | Medium |
| Category | Security |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Validation/ScriptCompiler.cs:21`, `src/ScadaLink.TemplateEngine/Validation/ValidationService.cs:318` |
**Description**
`ScriptCompiler.ForbiddenPatterns` is checked with `code.Contains(pattern)`. This is
both under- and over-inclusive against the script trust model:
- **Bypass:** `using System.IO;` followed by `File.ReadAllText(...)` contains no
`System.IO.` token; `using static System.IO.File;`, namespace aliases, and
`global::System.IO.File` all evade the literal patterns.
- **False positive:** a string literal, comment, or attribute name containing the text
`System.IO.` is flagged as a forbidden API even though it is inert.
The same patterns are reused for trigger-expression validation
(`CheckExpressionSyntax`), inheriting the same weakness. The file comment acknowledges
this is interim until Roslyn is wired in, but the trust model is security-relevant and
the gap should be tracked.
**Recommendation**
Defer real enforcement to the Roslyn-based compiler (semantic symbol analysis of
referenced types/namespaces) rather than text matching. Until then, document the
limitation prominently and treat the substring scan as advisory, not authoritative.
**Resolution**
_Unresolved._
### TemplateEngine-007 — Brace-balance "compilation" misjudges verbatim / interpolated / raw strings
| | |
|--|--|
| Severity | Medium |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Validation/ScriptCompiler.cs:54`, `src/ScadaLink.TemplateEngine/SharedScriptService.cs:124` |
**Description**
`ScriptCompiler.TryCompile` tracks string state with a single `inString` flag toggled
on `"` and an escaped-quote check of `code[i-1] != '\\'`. It does not understand
verbatim strings (`@"..."` where `""` is the escape and `\` is literal), interpolated
strings (`$"{...}"` whose braces are code, not text), raw string literals (`"""..."""`),
or char literals. A script with a verbatim string containing a brace, an interpolated
string, or a `'}'` char literal will be wrongly rejected as having mismatched braces —
blocking a valid script from deployment. `SharedScriptService.ValidateSyntax` is even
cruder: it counts braces/brackets/parens with no string or comment awareness at all, so
any string literal containing one of those characters produces a false syntax error.
**Recommendation**
Once the Roslyn compiler is available, parse with `CSharpSyntaxTree.ParseText` and
inspect diagnostics instead of hand-rolling a tokenizer. If an interim check must
remain, at minimum handle verbatim/interpolated/char literals or scope the check down
to something that cannot false-positive.
**Resolution**
_Unresolved._
### TemplateEngine-008 — `SetAlarmOverrideAsync` accepts overrides for unknown / composed alarms with no validation
| | |
|--|--|
| Severity | Medium |
| Category | Error handling & resilience |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Services/InstanceService.cs:178` |
**Description**
`SetAlarmOverrideAsync` looks up the alarm by name among the template's **direct**
alarms only. When the lookup returns `null` — which is the case for every composed
(path-qualified) alarm as well as for a genuinely non-existent name — the method skips
the lock check and proceeds to persist the override. This means: (1) an override can be
created for an alarm that does not exist (a silent dead record), and (2) a composed
alarm that is `IsLocked` at the template level can be overridden, bypassing the lock
rule. `SetAttributeOverrideAsync` by contrast rejects unknown attribute names. The
inline comment acknowledges the gap but the behaviour is inconsistent and risky.
**Recommendation**
Resolve the full effective alarm set (via the resolver / flattening) so composed
alarms are found, reject overrides whose canonical name is not in that set, and apply
the lock check to composed alarms too.
**Resolution**
_Unresolved._
### TemplateEngine-009 — N+1 query in `TemplateDeletionService.CanDeleteTemplateAsync`
| | |
|--|--|
| Severity | Medium |
| Category | Performance & resource management |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Services/TemplateDeletionService.cs:75` |
**Description**
Check 3 ("other templates compose it directly") loads all templates and then issues a
separate `GetCompositionsByTemplateIdAsync` call **inside a loop over every template**
— one round-trip per template in the database. The composition information needed is
already reachable via `t.Compositions` on the templates returned by
`GetAllTemplatesAsync` (which `TemplateService.DeleteTemplateAsync` uses for the
equivalent check at line 162). The loop scales linearly with the template count on
every delete-precheck and every actual delete.
**Recommendation**
Use the `Compositions` navigation already loaded by `GetAllTemplatesAsync`, or add a
single repository call that returns all compositions, rather than querying per
template.
**Resolution**
_Unresolved._
### TemplateEngine-010 — `InstanceService` documents optimistic concurrency that is not implemented
| | |
|--|--|
| Severity | Medium |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Services/InstanceService.cs:9` |
**Description**
The class summary states instances support "Enabled/disabled state with optimistic
concurrency". `EnableAsync`, `DisableAsync`, `AssignToAreaAsync` and the override/binding
mutators all perform a plain read-modify-write with no version token, `RowVersion`, or
concurrency check. Two concurrent enable/disable requests are resolved last-writer-wins
with no conflict detection. Either the doc is stale (the design's optimistic-concurrency decision
applies to *deployment status records*, not instance state) or a concurrency token was
intended and is missing.
**Recommendation**
If last-write-wins is acceptable for instance state, correct the XML doc. If optimistic
concurrency is required, add a concurrency token to `Instance` and surface a conflict
result.
**Resolution**
_Unresolved._
### TemplateEngine-011 — `SortedPropertiesConverterFactory` is dead code with a misleading comment
| | |
|--|--|
| Severity | Low |
| Category | Documentation & comments |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Flattening/RevisionHashService.cs:136` |
**Description**
`SortedPropertiesConverterFactory.CanConvert` always returns `false` and
`CreateConverter` always returns `null`, so the factory registered in
`CanonicalJsonOptions` does nothing. The class comment claims it "ensures properties are
serialized in alphabetical order for deterministic output", and the options comment says
"Ensure consistent ordering" — both are false. Determinism actually relies entirely on
the `Hashable*` records being hand-declared with alphabetically-ordered properties (plus
camelCase). That works today but is fragile: a future contributor adding a property out
of alphabetical order silently changes every revision hash, and the dead converter gives
false confidence that ordering is enforced programmatically.
**Recommendation**
Either implement the converter to genuinely sort properties, or delete it and replace
the comments with an explicit note that determinism depends on the manual property
ordering of the `Hashable*` records (ideally enforced by a test).
**Resolution**
_Unresolved._
### TemplateEngine-012 — `DataType` enum naming diverges from the design doc
| | |
|--|--|
| Severity | Low |
| Category | Design-document adherence |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/Validation/SemanticValidator.cs:18` |
**Description**
The design doc (Attribute section) lists data types as "Boolean, Integer, Float,
String". The actual `DataType` enum is `Boolean, Int32, Float, Double, DateTime,
Binary`. `SemanticValidator.NumericDataTypes` correctly hard-codes the real names
(`Int32`, `Float`, `Double`), so the code is internally consistent, but the design doc
is stale — it omits `Double`, `DateTime`, `Binary` and calls the integer type
"Integer". This makes the doc an unreliable reference for which trigger-operand types
are numeric.
**Recommendation**
Update `docs/requirements/Component-TemplateEngine.md` to list the actual enum members,
or rename the enum to match the doc if "Integer" is the intended canonical name.
**Resolution**
_Unresolved._
### TemplateEngine-013 — `ToDictionary(t => t.Id)` throws on duplicate IDs; cycle detectors overload Id 0 as a sentinel
| | |
|--|--|
| Severity | Low |
| Category | Correctness & logic bugs |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/CycleDetector.cs:30`, `src/ScadaLink.TemplateEngine/CycleDetector.cs:38` |
**Description**
Across the static helpers, `allTemplates.ToDictionary(t => t.Id)` is used freely; if the
caller ever passes a list containing two templates with the same `Id` (e.g. a
not-yet-saved template assigned `Id == 0`, or duplicated input) the call throws an
unhandled `ArgumentException` rather than returning a `Result` failure. Separately,
`CycleDetector` uses `0` as the "no parent" sentinel (`currentId != 0`,
`ParentTemplateId ?? 0`) and `DetectInheritanceCycle` / `DetectCrossGraphCycle` ignore a
proposed parent/composed id of `0`. EF identity keys start at 1 so this is currently
benign, but the overload is fragile — an in-memory or test template with `Id == 0`
would be treated as "no template" and cycle checks would be silently skipped.
**Recommendation**
Guard the dictionary builds (or use a grouping/`ToLookup`) and validate input, and use
`int?`/`-1` rather than `0` as the no-parent sentinel so a real id of 0 is never
special.
**Resolution**
_Unresolved._
### TemplateEngine-014 — Template-deletion constraint logic is duplicated and divergent
| | |
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
| Status | Open |
| Location | `src/ScadaLink.TemplateEngine/TemplateService.cs:109`, `src/ScadaLink.TemplateEngine/Services/TemplateDeletionService.cs:27` |
**Description**
`TemplateService.DeleteTemplateAsync` and `TemplateDeletionService.CanDeleteTemplateAsync`
both implement the "can this template be deleted" rules (instances, child templates,
derived templates, composing templates). The two implementations have already drifted:
`TemplateService` reads composing templates from the in-memory `t.Compositions`
navigation while `TemplateDeletionService` issues per-template
`GetCompositionsByTemplateIdAsync` calls (see TemplateEngine-009), they format error
messages differently, and `TemplateService` returns on the first failing category while
`TemplateDeletionService` accumulates all of them. A future rule change must be made in
two places or behaviour will diverge further.
**Recommendation**
Make `TemplateService.DeleteTemplateAsync` delegate to `TemplateDeletionService` (or
vice versa) so the constraint logic lives in exactly one place.
**Resolution**
_Unresolved._
+67
View File
@@ -0,0 +1,67 @@
# Code Review — <Module>
<!--
Template for a module review. Copy the structure below into
code-reviews/<Module>/findings.md and fill it in.
See ../REVIEW-PROCESS.md for the full process.
-->
| Field | Value |
|-------|-------|
| Module | `src/ScadaLink.<Module>` |
| Design doc | `docs/requirements/Component-<Name>.md` |
| Status | Not yet reviewed \| In progress \| Reviewed |
| Last reviewed | YYYY-MM-DD |
| Reviewer | <name> |
| Commit reviewed | `<short SHA>` |
| Open findings | 0 |
## Summary
One short paragraph: overall health of the module, themes across findings, and
anything notable that is not a finding.
## Checklist coverage
Confirm every category was examined. Record "No issues found" where applicable.
| # | Category | Examined | Notes |
|---|----------|----------|-------|
| 1 | Correctness & logic bugs | ☐ | |
| 2 | Akka.NET conventions | ☐ | |
| 3 | Concurrency & thread safety | ☐ | |
| 4 | Error handling & resilience | ☐ | |
| 5 | Security | ☐ | |
| 6 | Performance & resource management | ☐ | |
| 7 | Design-document adherence | ☐ | |
| 8 | Code organization & conventions | ☐ | |
| 9 | Testing coverage | ☐ | |
| 10 | Documentation & comments | ☐ | |
## Findings
<!-- One entry per finding. Copy the block below. Never delete a finding; close it
by changing Status and completing Resolution. -->
### <Module>-001 — <Short title>
| | |
|--|--|
| Severity | Critical \| High \| Medium \| Low |
| Category | <one of the 10 checklist categories> |
| Status | Open \| In Progress \| Resolved \| Won't Fix \| Deferred |
| Location | `src/ScadaLink.<Module>/<File>.cs:<line>` |
**Description**
What is wrong and why it matters.
**Recommendation**
Concrete suggested fix.
**Resolution**
_Unresolved._
<!-- When closed: fixing commit `<SHA>`, date YYYY-MM-DD, one-line description.
For Won't Fix / Deferred, justify the decision here. -->
+179
View File
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""Regenerate code-reviews/README.md from the per-module findings.md files.
The findings files are the source of truth; README.md is a generated index.
Run this after resolving or re-triaging a finding so the aggregated tables stay
in sync (see REVIEW-PROCESS.md section 5).
Usage:
python3 regen-readme.py # rewrite README.md
python3 regen-readme.py --check # exit 1 if README.md is stale (for CI)
Works from any directory: paths are resolved relative to this script.
"""
import os
import re
import sys
# Directory that contains this script; every path below is resolved against it,
# so the script can be run from any working directory.
BASE = os.path.dirname(os.path.abspath(__file__))
# Sort rank per severity for the pending-findings list (lower rank sorts first).
SEVERITY_ORDER = {"Critical": 0, "High": 1, "Medium": 2, "Low": 3}
# Statuses that count as "pending", i.e. are still listed in the generated README.
PENDING_STATUSES = {"Open", "In Progress"}
def discover_modules():
    """Return the sorted names of the module folders under BASE.

    A directory entry counts as a module when it contains a findings.md. Entries
    whose name starts with "_" or "." (such as the _template folder) are skipped.
    Sorting keeps the generated README order stable between runs.
    """
    return [
        entry
        for entry in sorted(os.listdir(BASE))
        if not entry.startswith(("_", "."))
        and os.path.isfile(os.path.join(BASE, entry, "findings.md"))
    ]
def parse_findings(module):
    """Parse one module's findings.md into (module, id, severity, title, status) tuples.

    Each finding starts at a line beginning with "### " whose heading is
    "<Id>-<number> <dash> <title>". Severity and Status are read from the
    "| Severity | ... |" / "| Status | ... |" table rows of that finding.

    Raises SystemExit with a file-qualified message when a heading cannot be
    parsed or a finding lacks a Severity or Status row.
    """
    path = os.path.join(BASE, module, "findings.md")
    # Read as UTF-8 and close the handle deterministically: findings contain
    # non-ASCII text (em dashes in headings), which the platform default
    # encoding cannot be relied on to decode.
    with open(path, encoding="utf-8") as handle:
        text = handle.read()

    findings = []
    # Everything before the first "### " heading is preamble, hence the [1:].
    for block in re.split(r"^### ", text, flags=re.M)[1:]:
        block_lines = block.splitlines()
        # A "### " with nothing after it yields an empty block; report it through
        # the normal unparseable-heading path instead of an IndexError.
        head = block_lines[0].strip() if block_lines else ""
        m = re.match(r"([A-Za-z][A-Za-z0-9]*-\d+)\b(.*)", head)
        if not m:
            raise SystemExit(f"{module}/findings.md: unparseable finding heading: {head!r}")
        fid = m.group(1).strip()
        # Drop the dash separating id and title; escape "|" so the title cannot
        # break the Markdown table it is later rendered into.
        title = m.group(2).strip().lstrip("—–-").strip().replace("|", "\\|")
        sev = re.search(r"\|\s*Severity\s*\|\s*([A-Za-z]+)", block)
        status = re.search(r"\|\s*Status\s*\|\s*([A-Za-z' ]+?)\s*\|", block)
        if not sev or not status:
            raise SystemExit(f"{module}/findings.md: {fid} is missing a Severity or Status field")
        findings.append((module, fid, sev.group(1), title, status.group(1).strip()))
    return findings
def finding_number(finding):
    """Return the trailing number of a finding's id (finding[1]) as an int.

    For example, an id of "Host-012" gives 12.
    """
    finding_id = finding[1]
    suffix = re.search(r"-(\d+)$", finding_id)
    return int(suffix.group(1))
def build_readme(modules, per_module):
    """Render the full text of README.md.

    modules is the ordered list of module names; per_module maps each module
    name to the finding tuples (module, id, severity, title, status) produced by
    parse_findings. Only findings whose status is in PENDING_STATUSES are
    counted as open or listed. Returns the README content as one string with
    lines joined by "\\n".
    """
    severities = ("Critical", "High", "Medium", "Low")

    # All pending findings, ordered by severity, then module, then finding number.
    pending = [f for fs in per_module.values() for f in fs if f[4] in PENDING_STATUSES]
    pending.sort(key=lambda f: (SEVERITY_ORDER.get(f[2], 9), f[0], finding_number(f)))
    # Group once; the summary table and the per-severity sections both use this.
    by_severity = {sev: [f for f in pending if f[2] == sev] for sev in severities}

    lines = [
        "# Code Reviews",
        "",
        "Comprehensive, per-module code reviews of the ScadaLink codebase. Each module (one",
        "buildable project under `src/`) has its own folder containing a `findings.md`. This",
        "README is the aggregated index — the single place to see all outstanding work.",
        "",
        "> Generated by `regen-readme.py` from the per-module `findings.md` files. Do not",
        "> edit by hand — edit the findings files and re-run the script.",
        "",
        "## How it works",
        "",
        "- Reviews are performed one module at a time against a fixed checklist.",
        "- Every finding is recorded in the module's `findings.md` with a severity and status.",
        "- Findings are **never deleted** — they are closed by changing their status, keeping",
        " a full audit trail.",
        "- This README aggregates every **pending** finding (`Open` / `In Progress`) across all",
        " modules.",
        "",
        "See **[REVIEW-PROCESS.md](REVIEW-PROCESS.md)** for the full procedure: the review",
        "checklist, severity definitions, finding format, and how to mark items resolved.",
        "",
        "## Layout",
        "",
        "```",
        "code-reviews/",
        "├── README.md # this file — process overview + pending findings",
        "├── REVIEW-PROCESS.md # how to perform a review and track findings",
        "├── regen-readme.py # regenerates this README from the findings files",
        "├── _template/findings.md # copy-this template for a module review",
        "└── <Module>/findings.md # one folder per src/ project",
        "```",
        "",
        "## Baseline review — 2026-05-16",
        "",
        "All 19 modules were reviewed at commit `9c60592` (241 findings: 6 Critical, 46 High,",
        "100 Medium, 89 Low). The tables below track what remains **open** as findings are",
        "resolved and re-triaged; findings discovered after the baseline are appended to their",
        "module file and counted in **Total**.",
        "",
        "| Severity | Open findings |",
        "|----------|---------------|",
    ]

    # Summary table: open findings per severity plus the overall total.
    for sev in severities:
        lines.append(f"| {sev} | {len(by_severity[sev])} |")
    lines.append(f"| **Total** | **{len(pending)}** |")

    lines.extend([
        "",
        "## Module Status",
        "",
        "| Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total |",
        "|--------|---------------|--------|----------------|------|-------|",
    ])
    # One row per module: open counts by severity, open total, and all findings.
    for module in modules:
        counts = [
            sum(1 for f in per_module[module] if f[2] == sev and f[4] in PENDING_STATUSES)
            for sev in severities
        ]
        breakdown = "/".join(str(c) for c in counts)
        lines.append(
            f"| [{module}]({module}/findings.md) | 2026-05-16 | `9c60592` "
            f"| {breakdown} "
            f"| {sum(counts)} | {len(per_module[module])} |"
        )

    lines.extend([
        "",
        "## Pending Findings",
        "",
        "Every `Open` / `In Progress` finding across all modules, highest severity first.",
        "Resolved findings drop off this list but remain recorded in their module's",
        "`findings.md` (see [REVIEW-PROCESS.md](REVIEW-PROCESS.md) §4–§5). Full detail —",
        "description, location, recommendation — lives in the module's `findings.md`.",
        "",
    ])
    # One section per severity; an empty severity gets a placeholder line.
    for sev in severities:
        rows = by_severity[sev]
        lines.append(f"### {sev} ({len(rows)})")
        lines.append("")
        if rows:
            lines.append("| ID | Module | Title |")
            lines.append("|----|--------|-------|")
            for module, fid, _, title, _ in rows:
                lines.append(f"| {fid} | [{module}]({module}/findings.md) | {title} |")
        else:
            lines.append("_None open._")
        lines.append("")

    return "\n".join(lines)
def main():
    """Regenerate README.md, or with --check verify that it is up to date.

    Without arguments the README next to this script is rewritten from the
    per-module findings. With --check nothing is written: the process exits
    with status 1 when the existing README differs from the freshly generated
    content (a missing README counts as stale), and otherwise reports the
    pending/total counts.
    """
    check = "--check" in sys.argv[1:]
    modules = discover_modules()
    per_module = {m: parse_findings(m) for m in modules}
    content = build_readme(modules, per_module)
    readme_path = os.path.join(BASE, "README.md")
    pending = sum(1 for fs in per_module.values()
                  for f in fs if f[4] in PENDING_STATUSES)
    total = sum(len(fs) for fs in per_module.values())

    if check:
        current = ""
        if os.path.exists(readme_path):
            # Explicit UTF-8: the README contains em dashes and box-drawing
            # characters that the platform default encoding may not decode.
            with open(readme_path, encoding="utf-8") as handle:
                current = handle.read()
        if current != content:
            print("README.md is stale — run: python3 code-reviews/regen-readme.py")
            sys.exit(1)
        print(f"README.md is up to date ({pending} pending / {total} total).")
        return

    # The context manager flushes and closes the file before success is reported.
    with open(readme_path, "w", encoding="utf-8") as handle:
        handle.write(content)
    print(f"README.md regenerated — {pending} pending, {total} total findings "
          f"across {len(modules)} modules.")
# Entry point: only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,114 @@
# Expression Trigger for Template Scripts and Alarms — Design
**Date:** 2026-05-16
**Status:** Approved (brainstorming) — implementation plan to follow
## Context
Template scripts and template alarms can only be triggered by single-attribute
conditions. Scripts support `Interval`, `ValueChange`, `Conditional`
(`{attributeName, operator, threshold}` — one attribute, numeric compare),
and `Call`. Alarms support `ValueMatch`, `RangeViolation`, `RateOfChange`, and
`HiLo` — all single-attribute. There is no way to trigger on a relationship
between *multiple* attributes (e.g. "speed is high *and* mode is Run").
This design adds an **Expression trigger**: a user-supplied read-only boolean
C# expression, evaluated whenever an instance attribute updates, that fires the
script / activates the alarm when it returns true. It generalizes the existing
single-attribute `Conditional` trigger.
### Decisions taken during brainstorming
- The trigger is a **read-only boolean expression** — no `External`/`Database`/
`Notify`/`CallScript` side effects. It must be cheap and safe to run on every
attribute update.
- **Scripts fire edge-triggered** — once per `false→true` transition.
- **Alarms are level-based** — active while the expression is true, clear when
false (consistent with all existing alarm trigger types).
- **Evaluation approach B** — compile against a *restricted read-only globals
type*, so read-only is enforced, not merely conventional. Reuses the existing
Roslyn compilation pipeline.
## Design
### 1. Trigger model & storage
- **Scripts:** `TemplateScript.TriggerType` (`string?`) gains the value
`"Expression"`. `TriggerConfiguration` JSON is `{ "expression": "<C#>" }`.
- **Alarms:** `AlarmTriggerType` enum gains a member `Expression`.
`TriggerConfiguration` JSON is the same `{ "expression": "<C#>" }`.
- The expression is a bare C# boolean expression (no `return` keyword — Roslyn
scripting returns the trailing expression's value), e.g.
`Attributes["Speed"] > 1000 && (string)Attributes["Mode"] == "Run"`.
- Entity types unchanged: both `TriggerConfiguration` fields stay `string?`.
Adding the `AlarmTriggerType` member touches three switch sites:
`AlarmActor.ParseEvalConfig`, `AlarmActor.HandleAttributeValueChanged`,
`AlarmTriggerConfigCodec`.
### 2. Runtime evaluation
- **`TriggerExpressionGlobals`** (new, `ScadaLink.SiteRuntime`) — a read-only
globals type exposing only `Attributes["X"]`, `Children["C"].Attributes["X"]`,
and `Parent.Attributes["X"]`, backed by an in-memory snapshot dictionary. No
side-effecting APIs. A missing attribute reads as `null` (never throws).
- The expression is compiled once via the existing Roslyn pipeline (same
forbidden-API trust checks) against `TriggerExpressionGlobals`; the compiled
delegate is cached on the actor.
- **Attribute snapshot:** `ScriptActor` and `AlarmActor` already receive every
`AttributeValueChanged`. Each keeps a local `Dictionary<string,object?>`
snapshot — seeded from the instance's initial attribute set at startup, then
updated on each change. The expression evaluates against the snapshot — no
`Ask` back to the `InstanceActor`; cheap and re-entrancy-free.
- **On each `AttributeValueChanged`:** update snapshot → run cached expression →
`bool`.
- **Script (edge):** track the previous result; on `false→true`, run the
script (spawn `ScriptExecutionActor`, as the other triggers do).
- **Alarm (level):** the `bool` feeds the existing binary Normal↔Active state
machine — raise on `→Active`, clear on `→Normal`.
- Cost per attribute update: one cached-delegate call + one bool compare.
### 3. Editors & analysis
- **`ScriptTriggerEditor`:** add `Expression` to `ScriptTriggerKind` and
`ScriptTriggerConfigCodec` (round-trips `{ expression }`).
- **`AlarmTriggerEditor`:** add an `Expression` case to its trigger `@switch`.
- Both render the same **expression panel**: a compact `MonacoEditor`
(~120 px) with C# syntax, `Attributes["..."]` completion driven by the
template's attribute metadata (self / children / parent), and live compile
diagnostics. A one-line hint summarizes what fires.
- **Analysis:** reuse the existing `Template` analysis kind — completion and
diagnostics work with no new analyzer code. Editor completion is slightly
permissive (also shows `Instance`/`External`), but the runtime's restricted
`TriggerExpressionGlobals` is what enforces read-only. A dedicated strict
analysis kind is a possible later refinement, out of scope here.
### 4. Error handling & validation
- **Pre-deployment:** extend `ValidationService` to compile-check expression
triggers (against `TriggerExpressionGlobals`); compile errors block
deployment and surface like other validation errors. Unknown
`Attributes["..."]` keys are flagged as the existing trigger-reference
validation does.
- **Runtime — expression throws:** caught; treated as `false` for that update;
a script-error event is written to the site event log. The actor never
crashes.
- **Non-bool result:** treated as `false` and logged.
- **Missing attribute:** reads as `null` (handled in `TriggerExpressionGlobals`).
- **Blank expression:** the trigger is inert; validation emits a warning.
### 5. Testing & verification
- **Unit:** codec round-trip for script and alarm `{ expression }`; expression
compile (valid + invalid).
- **Runtime:** deploy an instance with an expression-triggered script and
alarm; drive attribute updates (bound Test Run / CLI); confirm the script
fires only on `false→true` and the alarm raises/clears with the expression.
- **UI:** the expression panel in both editors; save → reopen round-trip.
## Implementation tasks
- #25 — Implement expression trigger model + codecs
- #26 — Implement runtime expression evaluation (blocked by #25)
- #27 — Add expression panel to the trigger editors (blocked by #25)
- #28 — Validate expression triggers pre-deployment (blocked by #25, #26)
+220
View File
@@ -0,0 +1,220 @@
# Expression Trigger Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
**Goal:** Add an "Expression" trigger to template scripts and alarms — a read-only boolean C# expression evaluated on attribute updates that fires the script (edge) or activates the alarm (level).
**Architecture:** A new restricted read-only globals type (`TriggerExpressionGlobals`) backed by an in-memory attribute snapshot; the expression is compiled once via the existing Roslyn pipeline and cached on `ScriptActor`/`AlarmActor`, which already receive every `AttributeValueChanged`. The CentralUI trigger editors gain an Expression panel. See the approved design: `docs/plans/2026-05-16-expression-trigger-design.md`.
**Tech Stack:** C#/.NET, Akka.NET (site runtime actors), Roslyn C# scripting, Blazor Server (CentralUI), Docker cluster.
**Verification note:** This repo has no CentralUI/Commons unit-test project; pure-logic correctness is verified by `dotnet build` + the editor round-trip, and runtime behavior by `bash docker/deploy.sh` + a browser/CLI walkthrough (the established pattern in this codebase). Steps below follow that.
---
### Task 1: Trigger model + codecs
**Files:**
- Modify: `src/ScadaLink.Commons/Types/Enums/AlarmTriggerType.cs`
- Modify: `src/ScadaLink.CentralUI/Components/Shared/ScriptTriggerConfigCodec.cs`
- Modify: `src/ScadaLink.CentralUI/Components/Shared/AlarmTriggerConfigCodec.cs`
**Step 1: Add the `Expression` alarm trigger type.**
In `AlarmTriggerType.cs`, add `Expression` as the last enum member (append — do not reorder; the enum is persisted by value):
```csharp
/// <summary>
/// Trigger kinds available to a template alarm.
/// </summary>
/// <remarks>
/// The enum is persisted by its integer value, so the values are spelled out
/// explicitly. Append new members with the next free value; never reorder or
/// renumber existing ones.
/// </remarks>
public enum AlarmTriggerType
{
    ValueMatch = 0,
    RangeViolation = 1,
    RateOfChange = 2,
    HiLo = 3,
    Expression = 4,
}
```
**Step 2: Extend `ScriptTriggerConfigCodec`.**
- Add `Expression` to `ScriptTriggerKind` (before `Unknown`).
- `ParseKind`: map `"expression"` → `ScriptTriggerKind.Expression`.
- `KindToString`: `Expression` → `"Expression"`.
- Add `string? Expression` to `ScriptTriggerModel`.
- `Parse`: for `Expression`, read `model.Expression = root.TryGetProperty("expression", out var e) ? e.GetString() : null;`
- `Serialize`: for `Expression`, write `w.WriteString("expression", model.Expression ?? "");`
**Step 3: Extend `AlarmTriggerConfigCodec`.**
- Add `string? Expression` to `AlarmTriggerModel`.
- `Parse`: `case AlarmTriggerType.Expression:` → `model.Expression = TryReadString(root, "expression");`
- `Serialize`: `case AlarmTriggerType.Expression:` → `w.WriteString("expression", model.Expression ?? "");` (Note: this codec always writes `attributeName` first, but that key is unused for Expression. Writing it empty is harmless; the preferred option is to skip the `attributeName` write when `type == Expression`.)
**Step 4: Build.**
Run: `dotnet build src/ScadaLink.CentralUI/ScadaLink.CentralUI.csproj -nologo`
Expected: `Build succeeded`.
**Step 5: Commit.**
```bash
git add src/ScadaLink.Commons/Types/Enums/AlarmTriggerType.cs src/ScadaLink.CentralUI/Components/Shared/ScriptTriggerConfigCodec.cs src/ScadaLink.CentralUI/Components/Shared/AlarmTriggerConfigCodec.cs
git commit -m "feat(triggers): add Expression to the script & alarm trigger codecs"
```
---
### Task 2: Runtime expression evaluation
**Files:**
- Create: `src/ScadaLink.SiteRuntime/Scripts/TriggerExpressionGlobals.cs`
- Modify: `src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs`
- Modify: `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs`
- Modify: `src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs`
**Step 1: Create `TriggerExpressionGlobals`.**
A read-only globals type backed by a snapshot dictionary. Exposes only attribute reads — no `Instance`/`Scripts`/`ExternalSystem`/`Database`/`Notify`. Mirror the shape of `ScopeAccessors` but read straight from the dict (no actor Ask). Missing key → `null`.
```csharp
namespace ScadaLink.SiteRuntime.Scripts;
/// <summary>
/// Read-only globals a trigger expression is compiled against. Exposes only
/// attribute reads, backed by an in-memory snapshot — no I/O, no actor Ask.
/// </summary>
/// <remarks>
/// Snapshot keys are dotted paths: a root attribute is looked up under its bare
/// name and a composed attribute under <c>Composition.Attribute</c>. A key that
/// is absent from the snapshot, or a <c>null</c> key, reads as <c>null</c>
/// instead of throwing. The snapshot is read live, so later changes to the
/// underlying dictionary are visible through an existing instance.
/// </remarks>
public sealed class TriggerExpressionGlobals
{
    /// <summary>Creates the globals over <paramref name="snapshot"/>.</summary>
    /// <param name="snapshot">Attribute values keyed by dotted attribute path.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="snapshot"/> is null.</exception>
    public TriggerExpressionGlobals(IReadOnlyDictionary<string, object?> snapshot)
    {
        // Fail at construction instead of with a NullReferenceException on the first read.
        if (snapshot is null)
        {
            throw new System.ArgumentNullException(nameof(snapshot));
        }

        // The accessors are stateless views over the snapshot, so build them once
        // rather than on every property read.
        Attributes = new ReadOnlyAttributes(snapshot, "");
        Children = new ReadOnlyChildren(snapshot);
    }

    /// <summary>Attributes at the root of the snapshot (no key prefix).</summary>
    public ReadOnlyAttributes Attributes { get; }

    /// <summary>Composed children, addressed by composition name.</summary>
    public ReadOnlyChildren Children { get; }

    /// <summary>Set by the caller for derived/composed scopes; null at root.</summary>
    public ReadOnlyComposition? Parent { get; init; }

    /// <summary>Indexer over the attributes that live under one key prefix.</summary>
    public sealed class ReadOnlyAttributes
    {
        private readonly IReadOnlyDictionary<string, object?> _s;
        private readonly string _prefix;

        public ReadOnlyAttributes(IReadOnlyDictionary<string, object?> s, string prefix)
        {
            _s = s;
            _prefix = prefix;
        }

        /// <summary>
        /// Value stored under <paramref name="key"/> (prefixed with the scope path
        /// when there is one), or null when the key is null or not in the snapshot.
        /// </summary>
        public object? this[string key]
        {
            get
            {
                // Dictionary lookups throw on a null key; reads must never throw.
                if (key is null)
                {
                    return null;
                }

                var fullKey = _prefix.Length == 0 ? key : _prefix + "." + key;
                return _s.TryGetValue(fullKey, out var value) ? value : null;
            }
        }
    }

    /// <summary>A composed scope whose attributes are stored under its path as prefix.</summary>
    public sealed class ReadOnlyComposition
    {
        public ReadOnlyComposition(IReadOnlyDictionary<string, object?> s, string path)
        {
            Attributes = new ReadOnlyAttributes(s, path);
        }

        /// <summary>Attributes of this composition.</summary>
        public ReadOnlyAttributes Attributes { get; }
    }

    /// <summary>Lookup of composed children by composition name.</summary>
    public sealed class ReadOnlyChildren
    {
        private readonly IReadOnlyDictionary<string, object?> _s;

        public ReadOnlyChildren(IReadOnlyDictionary<string, object?> s) => _s = s;

        /// <summary>The composition whose attribute keys start with <paramref name="compositionName"/>.</summary>
        public ReadOnlyComposition this[string compositionName] => new(_s, compositionName);
    }
}
```
Note: confirm against `ScopeAccessors.cs` whether canonical attribute keys are dotted (`TempSensor.Reading`) — they are; the prefix logic matches `AttributeAccessor.Resolve`.
**Step 2: Add expression compilation to `ScriptCompilationService`.**
Add a method that compiles a bare C# boolean expression against `TriggerExpressionGlobals`, reusing the existing `ScriptOptions` (references/imports) and the forbidden-API trust check. Return `Script<object?>` (Roslyn scripting returns the trailing expression's value).
```csharp
/// <summary>
/// Planned entry point for compiling a trigger's bare C# boolean expression
/// against <see cref="TriggerExpressionGlobals"/>. This is a sketch: the body
/// below only lists what the implementation must do.
/// </summary>
/// <param name="name">NOTE(review): presumably the script/alarm name used in diagnostics — confirm against Compile().</param>
/// <param name="expression">The bare C# boolean expression text (no <c>return</c> keyword).</param>
public ScriptCompilationResult CompileTriggerExpression(string name, string expression)
{
// same ScriptOptions as Compile(), globalsType: typeof(TriggerExpressionGlobals)
// run the same forbidden-API validation
}
```
Read the existing `Compile` (lines ~94-148) and factor the shared option-building + validation rather than duplicating.
**Step 3: ScriptActor — `ExpressionTriggerConfig` + edge evaluation.**
- Add a trigger config record `ExpressionTriggerConfig(string Expression)` alongside `IntervalTriggerConfig`/etc.
- `ParseTriggerConfig` (~line 262): add `"expression" => ParseExpressionTrigger(triggerConfigJson)` reading `{ "expression": "..." }`.
- On actor start (where the trigger is parsed/registered): if the config is `ExpressionTriggerConfig`, compile via `CompileTriggerExpression`, cache the `Script<object?>`, init `bool _lastExpressionResult = false`.
- Maintain `Dictionary<string,object?> _attributeSnapshot` — update it in the `AttributeValueChanged` handler (~lines 148-168) for **every** change, before trigger logic.
- In that handler, for `ExpressionTriggerConfig`: build `new TriggerExpressionGlobals(_attributeSnapshot)`, run the cached script (`RunAsync(globals)`), coerce `ReturnValue` to bool; if `result && !_lastExpressionResult` → run the script (same path `Conditional`/`ValueChange` use to spawn `ScriptExecutionActor`); set `_lastExpressionResult = result`.
- Wrap the evaluation in try/catch — on throw, treat as `false` and log a site-event-log script error; do not crash.
**Step 4: AlarmActor — `Expression` eval config + level evaluation.**
- `ParseEvalConfig` (~lines 413-484): add `case AlarmTriggerType.Expression:` building an `ExpressionEvalConfig` that holds the compiled `Script<object?>` (compile here via `CompileTriggerExpression`).
- `HandleAttributeValueChanged` (~lines 127-189): maintain the same `_attributeSnapshot`; for the `Expression` case (switch ~lines 141-147) evaluate the compiled expression against `TriggerExpressionGlobals` → bool; feed that bool into the existing **binary** Normal↔Active path (the same one `ValueMatch`/`RangeViolation` use — raise on `→Active`, clear on `→Normal`). Not HiLo.
- Same try/catch → `false` + log on throw.
**Step 5: Build.**
Run: `dotnet build src/ScadaLink.Host/ScadaLink.Host.csproj -nologo`
Expected: `Build succeeded`.
**Step 6: Commit.**
```bash
git add src/ScadaLink.SiteRuntime/
git commit -m "feat(triggers): runtime expression trigger evaluation for scripts and alarms"
```
---
### Task 3: Trigger editor panels (CentralUI)
**Files:**
- Modify: `src/ScadaLink.CentralUI/Components/Shared/ScriptTriggerEditor.razor`
- Modify: `src/ScadaLink.CentralUI/Components/Shared/AlarmTriggerEditor.razor`
- Reference: `src/ScadaLink.CentralUI/Components/Shared/MonacoEditor.razor`, `src/ScadaLink.CentralUI/Components/Pages/Design/TemplateEdit.razor` (how the script Code editor is fed `SelfAttributes`/`Children`/`Parent`)
**Step 1: `ScriptTriggerEditor` — Expression panel.**
- The codec already has the `Expression` kind (Task 1). Add `<option value="Expression">Expression — run when a boolean expression becomes true</option>` to the type `<select>`.
- Add a `case ScriptTriggerKind.Expression:` to the `@switch` rendering `RenderExpression()`.
- `RenderExpression()` hosts a compact `MonacoEditor` (`Height="120px"`, `Language="csharp"`, `ScriptKind=Template`) bound to `_model.Expression`; `ValueChanged` → update model + `Emit()`. Feed it the template attribute metadata for completion (see Step 3).
- Hint: "Runs once each time this expression becomes true."
**Step 2: `AlarmTriggerEditor` — Expression panel.**
- Add `case AlarmTriggerType.Expression: @RenderExpression(); break;` to the trigger `@switch` (~line 72).
- Same compact `MonacoEditor` bound to `_model.Expression`; `Emit()` on change.
- Hint: "Alarm is active while this expression is true."
**Step 3: Feed attribute metadata for completion.**
Both editors already receive `AvailableAttributes` (`IReadOnlyList<AlarmAttributeChoice>`). `MonacoEditor` wants `SelfAttributes` (`AttributeShape[]`) / `Children` / `Parent`. Add a small mapper from `AlarmAttributeChoice` → the Monaco metadata (Direct/Inherited → `SelfAttributes`; Composed → `Children` contexts). Keep it minimal — at least pass `SelfAttributes` so `Attributes["..."]` completion works.
**Step 4: Build.**
Run: `dotnet build src/ScadaLink.CentralUI/ScadaLink.CentralUI.csproj -nologo`
Expected: `Build succeeded`.
**Step 5: Commit.**
```bash
git add src/ScadaLink.CentralUI/Components/Shared/
git commit -m "feat(ui/triggers): expression trigger panel in the script & alarm editors"
```
---
### Task 4: Pre-deployment validation
**Files:**
- Modify: `src/ScadaLink.TemplateEngine/.../ValidationService.cs` (the file with `ValidateScriptTriggerReferences` / `ExtractAttributeNameFromTriggerConfig`)
**Step 1: Compile-check expression triggers.**
In the validation pass, for any script/alarm whose trigger type is `Expression`, extract `expression` from `TriggerConfiguration` and compile-check it. The TemplateEngine project may not reference the SiteRuntime compiler — if so, do a Roslyn syntax/compile check using the same approach, or surface a clear "expression empty / invalid" check at minimum. Confirm the reference graph during execution; prefer reusing `CompileTriggerExpression` if reachable.
**Step 2: Flag unknown attribute references (best-effort).**
Expression text references `Attributes["X"]`; extend the existing attribute-reference validation to scan the expression for `Attributes["..."]` literals and flag keys absent from the flattened config — mirroring `ExtractAttributeNameFromTriggerConfig` for the structured triggers.
**Step 3: Build + commit.**
```bash
dotnet build src/ScadaLink.Host/ScadaLink.Host.csproj -nologo
git add src/ScadaLink.TemplateEngine/
git commit -m "feat(triggers): validate expression triggers pre-deployment"
```
---
### Task 5: Build, deploy, verify
**Step 1:** `bash docker/deploy.sh` and wait for `http://localhost:9000/health/ready`.
**Step 2 — UI:** Log in (`multi-role`/`password`), open a template → Scripts → Add Script. Select trigger type **Expression**; confirm the Monaco expression box renders with attribute completion. Save `Attributes["TestDouble"] > 50` → reopen → confirm round-trip. Repeat on the alarm editor.
**Step 3 — runtime (script, edge):** Deploy an instance; set an attribute so the expression is false, then true → confirm the script runs once on the transition and does **not** re-run while it stays true; flip false then true again → runs again.
**Step 4 — runtime (alarm, level):** Expression-triggered alarm raises when the expression becomes true and clears when it becomes false (check the alarm state / Debug View).
**Step 5:** `git push`.
---
## Notes for the executor
- Append the `AlarmTriggerType.Expression` enum member **last** — the enum is persisted by integer value.
- The trigger expression is a *bare expression* (no `return`) — Roslyn scripting returns the trailing expression's value.
- Keep the evaluation try/catch tight; a throwing expression must never crash `ScriptActor`/`AlarmActor`.
- `_attributeSnapshot` must be updated for **every** `AttributeValueChanged`, not just attributes the expression names.
@@ -0,0 +1,11 @@
{
"planPath": "docs/plans/2026-05-16-expression-trigger.md",
"tasks": [
{"id": 25, "subject": "Task 1: Trigger model + codecs", "status": "pending"},
{"id": 26, "subject": "Task 2: Runtime expression evaluation", "status": "pending", "blockedBy": [25]},
{"id": 27, "subject": "Task 3: Trigger editor panels", "status": "pending", "blockedBy": [25]},
{"id": 28, "subject": "Task 4: Pre-deployment validation", "status": "pending", "blockedBy": [25, 26]},
{"id": 29, "subject": "Task 5: Build, deploy, verify", "status": "pending", "blockedBy": [25, 26, 27, 28]}
],
"lastUpdated": "2026-05-16"
}
+109 -89
View File
@@ -48,127 +48,142 @@ The CLI uses a hierarchical subcommand structure mirroring the Management Servic
scadalink <group> <action> [options]
```
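Entities are identified by their integer **ID** (via `--id`, `--template-id`,
`--site-id`, etc.) rather than by name, apart from the few name- or identifier-keyed
options shown in the lists below (e.g. `--alarm <name>`, `--instance-name <name>`,
`health site --identifier <site-identifier>`). Create/update commands take individual flags — there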
is no `--file` option. The authoritative, always-current reference is the in-repo
`src/ScadaLink.CLI/README.md`; the command lists below mirror the implemented command
tree at the time of writing.
### Template Commands
```
scadalink template list [--format json|table]
scadalink template get <name> [--format json|table]
scadalink template create --name <name> [--parent <parent>] --file <path>
scadalink template update <name> --file <path>
scadalink template delete <name>
scadalink template validate <name>
scadalink template diff <instance-code>
scadalink template attribute add --template-id <id> --name <name> --data-type <type> [--default-value <value>] [--tag-path <path>]
scadalink template attribute update --template-id <id> --name <name> [--data-type <type>] [--default-value <value>] [--tag-path <path>]
scadalink template attribute delete --template-id <id> --name <name>
scadalink template alarm add --template-id <id> --name <name> --trigger-attribute <attr> --condition <cond> --setpoint <value> [--severity <level>] [--notification-list <name>]
scadalink template alarm update --template-id <id> --name <name> [--condition <cond>] [--setpoint <value>] [--severity <level>] [--notification-list <name>]
scadalink template alarm delete --template-id <id> --name <name>
scadalink template script add --template-id <id> --name <name> --trigger-type <type> [--trigger-attribute <attr>] [--interval <ms>] --code <code>
scadalink template script update --template-id <id> --name <name> [--trigger-type <type>] [--trigger-attribute <attr>] [--interval <ms>] [--code <code>]
scadalink template script delete --template-id <id> --name <name>
scadalink template composition add --template-id <id> --module-template-id <id> --instance-name <name>
scadalink template list
scadalink template get --id <id>
scadalink template create --name <name> [--description <desc>] [--parent-id <id>]
scadalink template update --id <id> [--name <name>] [--description <desc>] [--parent-id <id>]
scadalink template validate --id <id>
scadalink template delete --id <id>
scadalink template attribute add --template-id <id> --name <name> --data-type <type> [--value <value>] [--description <desc>] [--data-source <ref>] [--locked <bool>]
scadalink template attribute update --id <id> [--name <name>] [--data-type <type>] [--value <value>] [--description <desc>] [--data-source <ref>] [--locked <bool>]
scadalink template attribute delete --id <id>
scadalink template alarm add --template-id <id> --name <name> --trigger-type <type> --priority <n> [--description <desc>] [--trigger-config <json>] [--locked <bool>]
scadalink template alarm update --id <id> [--name <name>] [--trigger-type <type>] [--priority <n>] [--description <desc>] [--trigger-config <json>] [--locked <bool>]
scadalink template alarm delete --id <id>
scadalink template script add --template-id <id> --name <name> --code <code> --trigger-type <type> [--trigger-config <json>] [--locked <bool>] [--parameters <json>] [--return-def <json>]
scadalink template script update --id <id> [--name <name>] [--code <code>] [--trigger-type <type>] [--trigger-config <json>] [--locked <bool>] [--parameters <json>] [--return-def <json>]
scadalink template script delete --id <id>
scadalink template composition add --template-id <id> --instance-name <name> --composed-template-id <id>
scadalink template composition delete --template-id <id> --instance-name <name>
```
### Instance Commands
```
scadalink instance list [--site <site>] [--area <area>] [--format json|table]
scadalink instance get <code> [--format json|table]
scadalink instance create --template <name> --site <site> --code <code> [--area <area>]
scadalink instance set-overrides <code> --file <path>
scadalink instance set-bindings <code> --bindings <json>
scadalink instance bind-connections <code> --file <path>
scadalink instance assign-area <code> --area <area>
scadalink instance enable <code>
scadalink instance disable <code>
scadalink instance delete <code>
scadalink instance list [--site-id <id>] [--template-id <id>] [--search <term>]
scadalink instance get --id <id>
scadalink instance create --name <name> --template-id <id> --site-id <id> [--area-id <id>]
scadalink instance set-bindings --id <id> --bindings <json>
scadalink instance set-overrides --id <id> --overrides <json>
scadalink instance alarm-override set --instance-id <id> --alarm <name> [--trigger-config <json>] [--priority <n>]
scadalink instance alarm-override delete --instance-id <id> --alarm <name>
scadalink instance alarm-override list --instance-id <id>
scadalink instance set-area --id <id> [--area-id <id>]
scadalink instance diff --id <id>
scadalink instance deploy --id <id>
scadalink instance enable --id <id>
scadalink instance disable --id <id>
scadalink instance delete --id <id>
```
`--bindings` is a JSON array of `[attributeName, dataConnectionId]` pairs, e.g.
`[["Speed", 5], ["Mode", 7]]`. `--overrides` is a JSON object of attribute name to
value, e.g. `{"Speed": "100", "Mode": null}`.
### Site Commands
```
scadalink site list [--format json|table]
scadalink site get <site-id> [--format json|table]
scadalink site create --name <name> --id <site-id> [--node-a-address <addr>] [--node-b-address <addr>] [--grpc-node-a-address <addr>] [--grpc-node-b-address <addr>]
scadalink site update <site-id> --file <path>
scadalink site delete <site-id>
scadalink site area list <site-id>
scadalink site area create <site-id> --name <name> [--parent <parent-area>]
scadalink site area update <site-id> --name <name> [--new-name <name>] [--parent <parent-area>]
scadalink site area delete <site-id> --name <name>
scadalink site list
scadalink site get --id <id>
scadalink site create --identifier <id> --name <name> [--description <desc>] [--node-a-address <addr>] [--node-b-address <addr>] [--grpc-node-a-address <addr>] [--grpc-node-b-address <addr>]
scadalink site update --id <id> [--name <name>] [--description <desc>] [--node-a-address <addr>] [--node-b-address <addr>] [--grpc-node-a-address <addr>] [--grpc-node-b-address <addr>]
scadalink site delete --id <id>
scadalink site area list --site-id <id>
scadalink site area create --site-id <id> --name <name> [--parent-id <id>]
scadalink site area update --id <id> --name <name>
scadalink site area delete --id <id>
scadalink site deploy-artifacts [--site-id <id>]
```
### Deployment Commands
```
scadalink deploy instance <code>
scadalink deploy artifacts [--site <site>] [--type <artifact-type>]
scadalink deploy status [--format json|table]
scadalink deploy instance --id <id>
scadalink deploy artifacts [--site-id <id>]
scadalink deploy status [--instance-id <id>] [--status <status>] [--page <n>] [--page-size <n>]
```
### Data Connection Commands
```
scadalink data-connection list [--format json|table]
scadalink data-connection get <name> [--format json|table]
scadalink data-connection create --file <path>
scadalink data-connection update <name> --file <path>
scadalink data-connection delete <name>
scadalink data-connection assign <name> --site <site-id>
scadalink data-connection unassign <name> --site <site-id>
scadalink data-connection list [--site-id <id>]
scadalink data-connection get --id <id>
scadalink data-connection create --site-id <id> --name <name> --protocol <protocol> [--backup-config <json>] [--failover-retry-count <n>]
scadalink data-connection update --id <id> [--name <name>] [--protocol <protocol>] [--backup-config <json>] [--failover-retry-count <n>]
scadalink data-connection delete --id <id>
```
### External System Commands
```
scadalink external-system list [--format json|table]
scadalink external-system get <name> [--format json|table]
scadalink external-system create --file <path>
scadalink external-system update <name> --file <path>
scadalink external-system delete <name>
scadalink external-system list
scadalink external-system get --id <id>
scadalink external-system create --name <name> --endpoint-url <url> --auth-type <type> [--auth-config <json>]
scadalink external-system update --id <id> [--name <name>] [--endpoint-url <url>] [--auth-type <type>] [--auth-config <json>]
scadalink external-system delete --id <id>
scadalink external-system method list --external-system-id <id>
scadalink external-system method get --id <id>
scadalink external-system method create --external-system-id <id> --name <name> --http-method <verb> --path <path> [--params <json>] [--return <json>]
scadalink external-system method update --id <id> [--name <name>] [--http-method <verb>] [--path <path>] [--params <json>] [--return <json>]
scadalink external-system method delete --id <id>
```
### Notification Commands
```
scadalink notification list [--format json|table]
scadalink notification get <name> [--format json|table]
scadalink notification create --file <path>
scadalink notification update <name> --file <path>
scadalink notification delete <name>
scadalink notification smtp list [--format json|table]
scadalink notification smtp update --file <path>
scadalink notification list
scadalink notification get --id <id>
scadalink notification create --name <name> --emails <comma-separated>
scadalink notification update --id <id> [--name <name>] [--emails <comma-separated>]
scadalink notification delete --id <id>
scadalink notification smtp list
scadalink notification smtp update --id <id> --server <host> --port <n> --auth-mode <mode> --from-address <email>
```
### Security Commands
```
scadalink security api-key list [--format json|table]
scadalink security api-key list
scadalink security api-key create --name <name>
scadalink security api-key update <name> [--name <new-name>] [--enabled <bool>]
scadalink security api-key enable <name>
scadalink security api-key disable <name>
scadalink security api-key delete <name>
scadalink security role-mapping list [--format json|table]
scadalink security role-mapping create --group <ldap-group> --role <role> [--site <site>]
scadalink security role-mapping update --id <id> [--group <ldap-group>] [--role <role>]
scadalink security role-mapping delete --group <ldap-group> --role <role>
scadalink security scope-rule list [--role-mapping-id <id>] [--format json|table]
scadalink security scope-rule add --role-mapping-id <id> --site-id <site-id>
scadalink security api-key update --id <id> --enabled <bool>
scadalink security api-key delete --id <id>
scadalink security role-mapping list
scadalink security role-mapping create --ldap-group <group> --role <role>
scadalink security role-mapping update --id <id> [--ldap-group <group>] [--role <role>]
scadalink security role-mapping delete --id <id>
scadalink security scope-rule list [--mapping-id <id>]
scadalink security scope-rule add --mapping-id <id> --site-id <id>
scadalink security scope-rule delete --id <id>
```
### Audit Log Commands
```
scadalink audit-log query [--user <username>] [--entity-type <type>] [--from <date>] [--to <date>] [--format json|table]
scadalink audit-log query [--user <username>] [--entity-type <type>] [--action <action>] [--from <date>] [--to <date>] [--page <n>] [--page-size <n>]
```
### Health Commands
```
scadalink health summary [--format json|table]
scadalink health site <site-id> [--format json|table]
scadalink health event-log --site-identifier <site-id> [--from <date>] [--to <date>] [--search <term>] [--page <n>] [--page-size <n>] [--format json|table]
scadalink health parked-messages --site-identifier <site-id> [--page <n>] [--page-size <n>] [--format json|table]
scadalink health summary
scadalink health site --identifier <site-identifier>
scadalink health event-log --site <site-identifier> [--event-type <type>] [--severity <level>] [--keyword <term>] [--from <date>] [--to <date>] [--page <n>] [--page-size <n>] [--instance-name <name>]
scadalink health parked-messages --site <site-identifier> [--page <n>] [--page-size <n>]
```
### Debug Commands
```
scadalink debug snapshot --id <id> [--format json|table]
scadalink debug stream --id <instanceId> [--url ...] [--username ...] [--password ...]
scadalink debug snapshot --id <id>
scadalink debug stream --id <id>
```
The `debug snapshot` command retrieves a point-in-time snapshot via the HTTP Management API.
@@ -185,31 +200,33 @@ Unlike `debug snapshot` (which uses the HTTP Management API), `debug stream` use
### Shared Script Commands
```
scadalink shared-script list [--format json|table]
scadalink shared-script get --id <id> [--format json|table]
scadalink shared-script create --name <name> --code <code>
scadalink shared-script update --id <id> [--name <name>] [--code <code>]
scadalink shared-script list
scadalink shared-script get --id <id>
scadalink shared-script create --name <name> --code <code> [--parameters <json>] [--return-def <json>]
scadalink shared-script update --id <id> [--name <name>] [--code <code>] [--parameters <json>] [--return-def <json>]
scadalink shared-script delete --id <id>
```
### Database Connection Commands
```
scadalink db-connection list [--format json|table]
scadalink db-connection get --id <id> [--format json|table]
scadalink db-connection create --name <name> --connection-string <string> [--provider <provider>]
scadalink db-connection update --id <id> [--name <name>] [--connection-string <string>] [--provider <provider>]
scadalink db-connection list
scadalink db-connection get --id <id>
scadalink db-connection create --name <name> --connection-string <string>
scadalink db-connection update --id <id> [--name <name>] [--connection-string <string>]
scadalink db-connection delete --id <id>
```
### Inbound API Method Commands
```
scadalink api-method list [--format json|table]
scadalink api-method get --id <id> [--format json|table]
scadalink api-method create --name <name> --code <code> [--description <desc>]
scadalink api-method update --id <id> [--name <name>] [--code <code>] [--description <desc>]
scadalink api-method list
scadalink api-method get --id <id>
scadalink api-method create --name <name> --script <code> [--timeout <seconds>] [--parameters <json>] [--return-def <json>]
scadalink api-method update --id <id> [--script <code>] [--timeout <seconds>] [--parameters <json>] [--return-def <json>]
scadalink api-method delete --id <id>
```
The `--format json|table` option is recursive and accepted on every command above.
## Configuration
Configuration is resolved in the following priority order (highest wins):
@@ -218,7 +235,10 @@ Configuration is resolved in the following priority order (highest wins):
2. **Environment variables**:
- `SCADALINK_MANAGEMENT_URL` — Management API URL (e.g., `http://central-host:5000`).
- `SCADALINK_FORMAT` — Default output format (`json` or `table`).
3. **Configuration file**: `~/.scadalink/config.json` — Persistent defaults for management URL and output format.
- `SCADALINK_USERNAME` / `SCADALINK_PASSWORD` — LDAP credentials. Preferred over
`--password` on the command line, which is visible in process listings and shell
history. Credentials are never read from the config file.
3. **Configuration file**: `~/.scadalink/config.json` — Persistent defaults for management URL and output format only (never credentials).
### Configuration File Format
@@ -18,6 +18,26 @@ Both central and site clusters.
- Support cluster singleton hosting (used by the Site Runtime Deployment Manager singleton on site clusters).
- Manage Windows service lifecycle (start, stop, restart) on each node.
## Implementation Note — Code Placement
This component is a **design responsibility**, not a single buildable project that
contains all of the code. The cluster-infrastructure responsibilities above are
realised across two projects:
- **`src/ScadaLink.ClusterInfrastructure`** owns the cluster **configuration model**:
the `ClusterOptions` POCO (seed nodes, roles, remoting/gRPC ports, failure-detection
timings, split-brain settings) bound from `appsettings.json` via the Options pattern.
- **`src/ScadaLink.Host`** owns the cluster **bootstrap and runtime wiring**: it
builds the Akka.NET HOCON from `ClusterOptions`, starts the `ActorSystem`,
configures the keep-oldest split-brain resolver (`down-if-alone = on`), wires
`CoordinatedShutdown` into the service lifecycle, and provides active-node /
cluster-membership health checks. See `Component-Host.md` (REQ-HOST-*) for detail.
This split is deliberate — the Host is the single deployable binary and the only
project that performs Akka.NET bootstrap, so the cluster bring-up lives there
alongside role-based component registration. The `ClusterInfrastructure` project
remains the home of the configuration contract that the Host consumes.
## Cluster Topology
### Central Cluster
+23
View File
@@ -7,6 +7,20 @@ public class CliConfig
public string? ManagementUrl { get; set; }
public string DefaultFormat { get; set; } = "json";
/// <summary>
/// LDAP username from the <c>SCADALINK_USERNAME</c> environment variable, if set.
/// Credentials are intentionally only sourced from environment variables (or the
/// command line) — never from the config file — so they are not persisted to disk.
/// </summary>
public string? Username { get; set; }
/// <summary>
/// LDAP password from the <c>SCADALINK_PASSWORD</c> environment variable, if set.
/// Provides a safer alternative to <c>--password</c>, which leaks into process
/// listings and shell history.
/// </summary>
public string? Password { get; set; }
public static CliConfig Load()
{
var config = new CliConfig();
@@ -38,6 +52,15 @@ public class CliConfig
if (!string.IsNullOrEmpty(envFormat))
config.DefaultFormat = envFormat;
// Credentials from environment variables only (never the config file).
var envUsername = Environment.GetEnvironmentVariable("SCADALINK_USERNAME");
if (!string.IsNullOrEmpty(envUsername))
config.Username = envUsername;
var envPassword = Environment.GetEnvironmentVariable("SCADALINK_PASSWORD");
if (!string.IsNullOrEmpty(envPassword))
config.Password = envPassword;
return config;
}
+114 -38
View File
@@ -15,8 +15,8 @@ internal static class CommandHelpers
Option<string> passwordOption,
object command)
{
var format = result.GetValue(formatOption) ?? "json";
var config = CliConfig.Load();
var format = ResolveFormat(result, formatOption, config);
// Resolve management URL
var url = result.GetValue(urlOption);
@@ -31,14 +31,23 @@ internal static class CommandHelpers
return 1;
}
// Validate credentials
var username = result.GetValue(usernameOption);
var password = result.GetValue(passwordOption);
if (!IsValidManagementUrl(url))
{
OutputFormatter.WriteError(
$"Invalid management URL '{url}'. Expected an absolute http/https URL (e.g. http://localhost:9001).",
"INVALID_URL");
return 1;
}
// Resolve credentials: command-line options take precedence, then the
// SCADALINK_USERNAME / SCADALINK_PASSWORD environment variables.
var username = ResolveCredential(result.GetValue(usernameOption), config.Username);
var password = ResolveCredential(result.GetValue(passwordOption), config.Password);
if (string.IsNullOrWhiteSpace(username) || string.IsNullOrWhiteSpace(password))
{
OutputFormatter.WriteError(
"Credentials required. Use --username and --password options.",
"Credentials required. Use --username/--password or set SCADALINK_USERNAME/SCADALINK_PASSWORD.",
"NO_CREDENTIALS");
return 1;
}
@@ -53,10 +62,61 @@ internal static class CommandHelpers
return HandleResponse(response, format);
}
/// <summary>
/// Picks the output format for a command invocation. Precedence, highest first:
/// a non-blank <c>--format</c> value supplied on the command line, then
/// <see cref="CliConfig.DefaultFormat"/> (the config-file / environment-variable
/// default), then the literal <c>json</c>.
/// </summary>
/// <remarks>
/// The <c>--format</c> option must be declared without a <c>DefaultValueFactory</c>:
/// a default would make it impossible to tell whether the flag was supplied.
/// </remarks>
/// <param name="result">Parse result of the current invocation.</param>
/// <param name="formatOption">The <c>--format</c> option definition.</param>
/// <param name="config">Loaded CLI configuration providing the fallback format.</param>
/// <returns>The resolved format string; never blank.</returns>
internal static string ResolveFormat(ParseResult result, Option<string> formatOption, CliConfig config)
{
    // A null option result means --format was not present on the command line,
    // in which case the option value is not consulted at all.
    var supplied = result.GetResult(formatOption) is null
        ? null
        : result.GetValue(formatOption);

    if (!string.IsNullOrWhiteSpace(supplied))
    {
        return supplied;
    }

    var fallback = config.DefaultFormat;
    return string.IsNullOrWhiteSpace(fallback) ? "json" : fallback;
}
/// <summary>
/// Chooses one credential value. A non-blank command-line value is returned as-is;
/// when the command-line value is null, empty, or whitespace, the
/// environment-variable fallback (carried on <see cref="CliConfig"/>) is returned instead.
/// </summary>
/// <param name="commandLineValue">Value supplied via the command-line option, if any.</param>
/// <param name="envValue">Fallback value sourced from the environment, if any.</param>
/// <returns>The selected credential, or <c>null</c> when neither source provides one.</returns>
internal static string? ResolveCredential(string? commandLineValue, string? envValue)
{
    if (!string.IsNullOrWhiteSpace(commandLineValue))
    {
        return commandLineValue;
    }

    return envValue;
}
/// <summary>
/// Checks that a management URL is an absolute URL using the http or https scheme.
/// Rejecting malformed input here (blank, missing scheme, or a non-http scheme) keeps it
/// from reaching <c>new Uri(...)</c> in the <see cref="ManagementHttpClient"/> constructor,
/// where it would surface as an unhandled <see cref="UriFormatException"/>.
/// </summary>
/// <param name="url">Candidate management URL; may be null.</param>
/// <returns><c>true</c> when the URL is absolute and http/https; otherwise <c>false</c>.</returns>
internal static bool IsValidManagementUrl(string? url)
{
    if (string.IsNullOrWhiteSpace(url) || !Uri.TryCreate(url, UriKind.Absolute, out var parsed))
    {
        return false;
    }

    var scheme = parsed.Scheme;
    return scheme == Uri.UriSchemeHttp || scheme == Uri.UriSchemeHttps;
}
internal static int HandleResponse(ManagementResponse response, string format)
{
if (response.JsonData != null)
{
// A success status with an empty/whitespace body (e.g. a 204 from a delete)
// is a "command succeeded, no output" case — do not attempt to parse it.
if (string.IsNullOrWhiteSpace(response.JsonData))
{
Console.WriteLine("(ok)");
return 0;
}
if (string.Equals(format, "table", StringComparison.OrdinalIgnoreCase))
{
WriteAsTable(response.JsonData);
@@ -77,46 +137,62 @@ internal static class CommandHelpers
private static void WriteAsTable(string json)
{
using var doc = JsonDocument.Parse(json);
var root = doc.RootElement;
if (root.ValueKind == JsonValueKind.Array)
JsonDocument doc;
try
{
var items = root.EnumerateArray().ToList();
if (items.Count == 0)
{
Console.WriteLine("(no results)");
return;
}
doc = JsonDocument.Parse(json);
}
catch (JsonException)
{
// The server returned a success status but a non-JSON body (e.g. a proxy
// HTML error page, or a plain-text message). Print it verbatim rather than
// crashing — mirrors the raw-body fallback on the JSON path.
Console.WriteLine(json);
return;
}
var headers = items[0].ValueKind == JsonValueKind.Object
? items[0].EnumerateObject().Select(p => p.Name).ToArray()
: new[] { "Value" };
using (doc)
{
var root = doc.RootElement;
var rows = items.Select(item =>
if (root.ValueKind == JsonValueKind.Array)
{
if (item.ValueKind == JsonValueKind.Object)
var items = root.EnumerateArray().ToList();
if (items.Count == 0)
{
return headers.Select(h =>
item.TryGetProperty(h, out var val)
? val.ValueKind == JsonValueKind.Null ? "" : val.ToString()
: "").ToArray();
Console.WriteLine("(no results)");
return;
}
return new[] { item.ToString() };
});
OutputFormatter.WriteTable(rows, headers);
}
else if (root.ValueKind == JsonValueKind.Object)
{
var headers = new[] { "Property", "Value" };
var rows = root.EnumerateObject().Select(p =>
new[] { p.Name, p.Value.ValueKind == JsonValueKind.Null ? "" : p.Value.ToString() });
OutputFormatter.WriteTable(rows, headers);
}
else
{
Console.WriteLine(root.ToString());
var headers = items[0].ValueKind == JsonValueKind.Object
? items[0].EnumerateObject().Select(p => p.Name).ToArray()
: new[] { "Value" };
var rows = items.Select(item =>
{
if (item.ValueKind == JsonValueKind.Object)
{
return headers.Select(h =>
item.TryGetProperty(h, out var val)
? val.ValueKind == JsonValueKind.Null ? "" : val.ToString()
: "").ToArray();
}
return new[] { item.ToString() };
});
OutputFormatter.WriteTable(rows, headers);
}
else if (root.ValueKind == JsonValueKind.Object)
{
var headers = new[] { "Property", "Value" };
var rows = root.EnumerateObject().Select(p =>
new[] { p.Name, p.Value.ValueKind == JsonValueKind.Null ? "" : p.Value.ToString() });
OutputFormatter.WriteTable(rows, headers);
}
else
{
Console.WriteLine(root.ToString());
}
}
}
}
+12 -4
View File
@@ -42,8 +42,8 @@ public static class DebugCommands
cmd.SetAction(async (ParseResult result) =>
{
var instanceId = result.GetValue(idOption);
var format = result.GetValue(formatOption) ?? "json";
var config = CliConfig.Load();
var format = CommandHelpers.ResolveFormat(result, formatOption, config);
var url = result.GetValue(urlOption);
if (string.IsNullOrWhiteSpace(url))
@@ -57,13 +57,21 @@ public static class DebugCommands
return 1;
}
var username = result.GetValue(usernameOption);
var password = result.GetValue(passwordOption);
if (!CommandHelpers.IsValidManagementUrl(url))
{
OutputFormatter.WriteError(
$"Invalid management URL '{url}'. Expected an absolute http/https URL (e.g. http://localhost:9001).",
"INVALID_URL");
return 1;
}
var username = CommandHelpers.ResolveCredential(result.GetValue(usernameOption), config.Username);
var password = CommandHelpers.ResolveCredential(result.GetValue(passwordOption), config.Password);
if (string.IsNullOrWhiteSpace(username) || string.IsNullOrWhiteSpace(password))
{
OutputFormatter.WriteError(
"Credentials required. Use --username and --password options.",
"Credentials required. Use --username/--password or set SCADALINK_USERNAME/SCADALINK_PASSWORD.",
"NO_CREDENTIALS");
return 1;
}
+100 -8
View File
@@ -52,17 +52,106 @@ public static class InstanceCommands
{
var id = result.GetValue(idOption);
var bindingsJson = result.GetValue(bindingsOption)!;
var pairs = System.Text.Json.JsonSerializer.Deserialize<List<List<System.Text.Json.JsonElement>>>(bindingsJson)
?? throw new InvalidOperationException("Invalid bindings JSON");
var bindings = pairs.Select(p =>
(p[0].GetString()!, p[1].GetInt32())).ToList();
if (!TryParseBindings(bindingsJson, out var bindings, out var error))
{
OutputFormatter.WriteError(error!, "INVALID_ARGUMENT");
return 1;
}
return await CommandHelpers.ExecuteCommandAsync(
result, urlOption, formatOption, usernameOption, passwordOption,
new SetConnectionBindingsCommand(id, bindings));
new SetConnectionBindingsCommand(id, bindings!));
});
return cmd;
}
/// <summary>
/// Parses the <c>--bindings</c> argument — a JSON array of
/// <c>[attributeName, dataConnectionId]</c> pairs — into a typed list.
/// Returns <c>false</c> with a descriptive <paramref name="error"/> instead of
/// throwing when the input is null, the JSON is malformed, a pair is <c>null</c> or has
/// the wrong arity, or an element has the wrong type.
/// </summary>
/// <param name="json">Raw value of the <c>--bindings</c> option.</param>
/// <param name="bindings">On success, the parsed (attributeName, dataConnectionId) pairs; otherwise <c>null</c>.</param>
/// <param name="error">On failure, a human-readable reason; otherwise <c>null</c>.</param>
/// <returns><c>true</c> when every pair was parsed; otherwise <c>false</c>.</returns>
internal static bool TryParseBindings(
    string json,
    out List<(string, int)>? bindings,
    out string? error)
{
    bindings = null;
    error = null;

    // JsonSerializer.Deserialize throws ArgumentNullException for a null string;
    // report it through the error channel like every other bad input.
    if (json is null)
    {
        error = "Bindings JSON must be a non-null array of [attributeName, dataConnectionId] pairs.";
        return false;
    }

    try
    {
        var pairs = System.Text.Json.JsonSerializer
            .Deserialize<List<List<System.Text.Json.JsonElement>>>(json);
        if (pairs == null)
        {
            // The JSON literal `null` deserializes to a null list.
            error = "Bindings JSON must be a non-null array of [attributeName, dataConnectionId] pairs.";
            return false;
        }

        var result = new List<(string, int)>(pairs.Count);
        foreach (var pair in pairs)
        {
            // A `null` entry in the outer array (e.g. `[null]`) deserializes to a null
            // inner list; treat it as a malformed pair rather than dereferencing it.
            if (pair is null || pair.Count != 2)
            {
                error = "Each binding must be a [attributeName, dataConnectionId] pair of exactly two elements.";
                return false;
            }

            if (pair[0].ValueKind != System.Text.Json.JsonValueKind.String)
            {
                error = "The first element of each binding (attributeName) must be a string.";
                return false;
            }

            // Check the kind first: TryGetInt32 throws on non-number elements.
            if (pair[1].ValueKind != System.Text.Json.JsonValueKind.Number
                || !pair[1].TryGetInt32(out var connectionId))
            {
                error = "The second element of each binding (dataConnectionId) must be an integer.";
                return false;
            }

            result.Add((pair[0].GetString()!, connectionId));
        }

        bindings = result;
        return true;
    }
    catch (System.Text.Json.JsonException ex)
    {
        error = $"Invalid bindings JSON: {ex.Message}";
        return false;
    }
}
/// <summary>
/// Converts the <c>--overrides</c> argument — a JSON object mapping
/// <c>attributeName -> value</c> — into a dictionary. Malformed JSON or a JSON
/// <c>null</c> is reported through <paramref name="error"/> with a <c>false</c>
/// return value rather than an exception.
/// </summary>
/// <param name="json">Raw value of the <c>--overrides</c> option.</param>
/// <param name="overrides">On success, the parsed attribute overrides; otherwise <c>null</c>.</param>
/// <param name="error">On failure, a human-readable reason; otherwise <c>null</c>.</param>
/// <returns><c>true</c> when the JSON was parsed into a non-null dictionary.</returns>
internal static bool TryParseOverrides(
    string json,
    out Dictionary<string, string?>? overrides,
    out string? error)
{
    try
    {
        overrides = System.Text.Json.JsonSerializer
            .Deserialize<Dictionary<string, string?>>(json);
    }
    catch (System.Text.Json.JsonException ex)
    {
        overrides = null;
        error = $"Invalid overrides JSON: {ex.Message}";
        return false;
    }

    // The JSON literal `null` deserializes successfully to a null dictionary.
    error = overrides is null
        ? "Overrides JSON must be a non-null object of attribute name -> value pairs."
        : null;
    return overrides is not null;
}
private static Command BuildList(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var siteIdOption = new Option<int?>("--site-id") { Description = "Filter by site ID" };
@@ -178,11 +267,14 @@ public static class InstanceCommands
{
var id = result.GetValue(idOption);
var overridesJson = result.GetValue(overridesOption)!;
var overrides = System.Text.Json.JsonSerializer.Deserialize<Dictionary<string, string?>>(overridesJson)
?? throw new InvalidOperationException("Invalid overrides JSON");
if (!TryParseOverrides(overridesJson, out var overrides, out var error))
{
OutputFormatter.WriteError(error!, "INVALID_ARGUMENT");
return 1;
}
return await CommandHelpers.ExecuteCommandAsync(
result, urlOption, formatOption, usernameOption, passwordOption,
new SetInstanceOverridesCommand(id, overrides));
new SetInstanceOverridesCommand(id, overrides!));
});
return cmd;
}
+2 -1
View File
@@ -7,8 +7,9 @@ var rootCommand = new RootCommand("ScadaLink CLI — manage the ScadaLink SCADA
var urlOption = new Option<string>("--url") { Description = "Management API URL", Recursive = true };
var usernameOption = new Option<string>("--username") { Description = "LDAP username", Recursive = true };
var passwordOption = new Option<string>("--password") { Description = "LDAP password", Recursive = true };
// No DefaultValueFactory: format precedence (explicit --format -> config/env -> "json")
// is resolved by CommandHelpers.ResolveFormat, which needs to distinguish an absent flag.
var formatOption = new Option<string>("--format") { Description = "Output format (json or table)", Recursive = true };
formatOption.DefaultValueFactory = _ => "json";
rootCommand.Add(urlOption);
rootCommand.Add(usernameOption);
+2
View File
@@ -59,6 +59,8 @@ For the Docker test environment, see `docker/README.md` for a ready-to-use confi
|----------|-------------|
| `SCADALINK_MANAGEMENT_URL` | Management API URL (overrides config file) |
| `SCADALINK_FORMAT` | Default output format (overrides config file) |
| `SCADALINK_USERNAME` | LDAP username (fallback when `--username` is not supplied) |
| `SCADALINK_PASSWORD` | LDAP password (fallback when `--password` is not supplied). Preferred over `--password` on the command line, which leaks into process listings and shell history. |
## Output
+2 -2
View File
@@ -11,8 +11,8 @@
<InternalsVisibleTo Include="ScadaLink.CLI.Tests" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.AspNetCore.SignalR.Client" Version="9.0.3" />
<PackageReference Include="System.CommandLine" Version="2.0.5" />
<PackageReference Include="Microsoft.AspNetCore.SignalR.Client" />
<PackageReference Include="System.CommandLine" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
@@ -7,23 +7,37 @@ namespace ScadaLink.CentralUI.Auth;
/// <summary>
/// Bridges ASP.NET Core cookie authentication with Blazor Server's auth state.
/// <para>
/// The cookie middleware validates and decrypts the cookie during the initial
/// HTTP request that establishes the Blazor circuit. This provider is registered
/// <c>Scoped</c>, so it is constructed within that request's DI scope while
/// <see cref="IHttpContextAccessor.HttpContext"/> is still valid. We snapshot
/// the authenticated principal <b>once</b> in the constructor and serve that
/// snapshot for the lifetime of the circuit.
/// </para>
/// <para>
/// We must NOT read <see cref="IHttpContextAccessor"/> on every
/// <see cref="GetAuthenticationStateAsync"/> call (CentralUI-004): for the
/// lifetime of a long-lived SignalR circuit <c>HttpContext</c> is <c>null</c>
/// (or, worse, a stale/foreign context), so a later re-evaluation —
/// e.g. <c>&lt;AuthorizeView&gt;</c> re-rendering — would otherwise see an
/// unauthenticated principal and render the wrong UI.
/// </para>
/// </summary>
public class CookieAuthenticationStateProvider : ServerAuthenticationStateProvider
{
    // Completed task holding the principal captured at construction time.
    // The accessor itself is deliberately not stored, so it cannot be
    // re-read later by accident.
    private readonly Task<AuthenticationState> _circuitAuthState;

    /// <summary>
    /// Captures the current request's principal. When no <c>HttpContext</c>
    /// is available, an empty (unauthenticated) principal is captured instead.
    /// </summary>
    /// <param name="httpContextAccessor">Accessor used once, here, to read the request's user.</param>
    public CookieAuthenticationStateProvider(IHttpContextAccessor httpContextAccessor)
    {
        // Snapshot the principal at circuit-construction time. HttpContext is
        // valid here (initial HTTP request) and will not be afterwards.
        var user = httpContextAccessor.HttpContext?.User
            ?? new ClaimsPrincipal(new ClaimsIdentity());
        _circuitAuthState = Task.FromResult(new AuthenticationState(user));
    }

    /// <summary>Returns the authentication state snapshotted in the constructor.</summary>
    public override Task<AuthenticationState> GetAuthenticationStateAsync()
        => _circuitAuthState;
}
@@ -0,0 +1,93 @@
using Microsoft.AspNetCore.Components.Authorization;
using ScadaLink.Commons.Entities.Sites;
using ScadaLink.Security;
namespace ScadaLink.CentralUI.Auth;
/// <summary>
/// Resolves the set of sites the current user is permitted to operate on, from
/// the <c>SiteId</c> claims attached at login (CentralUI-002).
/// <para>
/// The design (Component-CentralUI, CLAUDE.md "Security &amp; Auth") makes the
/// Deployment role site-scoped: a Deployment user mapped through an LDAP group
/// with site-scope rules carries one <see cref="JwtTokenService.SiteIdClaimType"/>
/// claim per permitted site (the claim value is the integer <c>Site.Id</c>).
/// A Deployment user with no <c>SiteId</c> claim — and any Admin/Design user — is
/// system-wide.
/// </para>
/// <para>
/// Scope is decided by the <i>presence</i> of <c>SiteId</c> claims, not by how
/// many of them parse: a user whose claims are all malformed stays scoped, with
/// an empty permitted set, rather than being promoted to system-wide.
/// </para>
/// <para>
/// Deployment and Monitoring pages must filter every site/instance list through
/// <see cref="FilterSitesAsync"/> and re-check <see cref="IsSiteAllowedAsync"/>
/// before any cross-site command, so a scoped user cannot view or act on sites
/// outside their grant.
/// </para>
/// </summary>
public sealed class SiteScopeService
{
    private readonly AuthenticationStateProvider _authStateProvider;

    // Resolved once per service instance; null until the first call to ResolveAsync.
    private (bool IsSystemWide, IReadOnlySet<int> Sites)? _cached;

    public SiteScopeService(AuthenticationStateProvider authStateProvider)
    {
        _authStateProvider = authStateProvider;
    }

    /// <summary>
    /// True when the user is not restricted to a site subset (no <c>SiteId</c>
    /// claims). System-wide users see and act on every site.
    /// </summary>
    public async Task<bool> IsSystemWideAsync()
        => (await ResolveAsync()).IsSystemWide;

    /// <summary>
    /// The set of <c>Site.Id</c> values the user may operate on. Empty for a
    /// system-wide user, and also for a scoped user whose claims carry no
    /// parseable id — so an empty set alone does not imply system-wide.
    /// Callers should consult <see cref="IsSystemWideAsync"/> or use the
    /// filter/allowed helpers, which already account for that.
    /// </summary>
    public async Task<IReadOnlySet<int>> PermittedSiteIdsAsync()
        => (await ResolveAsync()).Sites;

    /// <summary>
    /// Returns the subset of <paramref name="sites"/> the user is permitted to
    /// see. A system-wide user gets the full list back unchanged.
    /// </summary>
    public async Task<List<Site>> FilterSitesAsync(IEnumerable<Site> sites)
    {
        var (isSystemWide, allowed) = await ResolveAsync();
        if (isSystemWide)
            return sites.ToList();
        return sites.Where(s => allowed.Contains(s.Id)).ToList();
    }

    /// <summary>
    /// True when the user may operate on the site with the given <c>Site.Id</c>.
    /// Must be re-checked server-side before any mutating cross-site command.
    /// </summary>
    public async Task<bool> IsSiteAllowedAsync(int siteId)
    {
        var (isSystemWide, allowed) = await ResolveAsync();
        return isSystemWide || allowed.Contains(siteId);
    }

    /// <summary>
    /// Reads the <c>SiteId</c> claims from the current authentication state and
    /// caches the resulting scope for later calls on this instance.
    /// </summary>
    private async Task<(bool IsSystemWide, IReadOnlySet<int> Sites)> ResolveAsync()
    {
        if (_cached is { } cached)
            return cached;

        var state = await _authStateProvider.GetAuthenticationStateAsync();

        var ids = new HashSet<int>();
        var hasScopeClaim = false;
        foreach (var claim in state.User.FindAll(JwtTokenService.SiteIdClaimType))
        {
            // Any SiteId claim marks the user as scoped, even if its value is
            // unusable; otherwise a malformed claim would widen access.
            hasScopeClaim = true;

            // Claim values are machine-written integers: parse culture-invariantly.
            if (int.TryParse(
                    claim.Value,
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out var id))
                ids.Add(id);
        }

        // No SiteId claims => system-wide. This mirrors SiteScopeAuthorizationHandler:
        // absence of scope rules means an unrestricted deployer.
        var result = (IsSystemWide: !hasScopeClaim, Sites: (IReadOnlySet<int>)ids);
        _cached = result;
        return result;
    }
}
@@ -0,0 +1,41 @@
namespace ScadaLink.CentralUI.Components;
/// <summary>
/// Helper for turning the offset-less wall-clock values produced by
/// <c>&lt;input type="datetime-local"&gt;</c> — which are expressed in the
/// user's <i>browser-local</i> time zone — into UTC
/// <see cref="DateTimeOffset"/>s suitable for querying.
/// <para>
/// The value must be <i>shifted</i> into UTC, not merely stamped with a zero
/// offset: stamping (the CentralUI-008 bug) moves every query window by the
/// user's offset whenever the browser is not in UTC.
/// </para>
/// </summary>
public static class BrowserTime
{
    /// <summary>
    /// Shifts a browser-local wall-clock value into UTC using the result of
    /// the browser's <c>Date.getTimezoneOffset()</c>.
    /// </summary>
    /// <param name="localValue">
    /// Wall-clock value read from a <c>datetime-local</c> input, or <c>null</c>.
    /// Its <see cref="DateTime.Kind"/> is ignored.
    /// </param>
    /// <param name="browserUtcOffsetMinutes">
    /// JavaScript <c>new Date().getTimezoneOffset()</c>: the minutes that,
    /// <b>added</b> to local time, give UTC. Positive behind UTC (e.g. +300 for
    /// UTC-5), negative ahead of it (e.g. -120 for UTC+2).
    /// </param>
    /// <returns>
    /// The same instant with a zero offset, or <c>null</c> when
    /// <paramref name="localValue"/> is null.
    /// </returns>
    public static DateTimeOffset? LocalInputToUtc(DateTime? localValue, int browserUtcOffsetMinutes)
    {
        if (!localValue.HasValue)
        {
            return null;
        }

        // Drop whatever Kind the caller's value carries so the DateTimeOffset
        // constructor accepts it together with a zero offset.
        var wallClock = new DateTime(localValue.Value.Ticks, DateTimeKind.Unspecified);

        // getTimezoneOffset() == (UTC - local) in minutes  =>  UTC = local + offset.
        var utcClock = wallClock.AddMinutes(browserUtcOffsetMinutes);

        return new DateTimeOffset(utcClock, TimeSpan.Zero);
    }
}
@@ -65,17 +65,22 @@
</Authorized>
</AuthorizeView>
@* Monitoring — visible to all authenticated users *@
@* Monitoring — Health Dashboard is all-roles; Event Logs and
Parked Messages are Deployment-role only (Component-CentralUI). *@
<div role="presentation" class="nav-section-header">Monitoring</div>
<li class="nav-item">
<NavLink class="nav-link" href="/monitoring/health">Health Dashboard</NavLink>
</li>
<li class="nav-item">
<NavLink class="nav-link" href="/monitoring/event-logs">Event Logs</NavLink>
</li>
<li class="nav-item">
<NavLink class="nav-link" href="/monitoring/parked-messages">Parked Messages</NavLink>
</li>
<AuthorizeView Policy="@AuthorizationPolicies.RequireDeployment">
<Authorized Context="monitoringContext">
<li class="nav-item">
<NavLink class="nav-link" href="/monitoring/event-logs">Event Logs</NavLink>
</li>
<li class="nav-item">
<NavLink class="nav-link" href="/monitoring/parked-messages">Parked Messages</NavLink>
</li>
</Authorized>
</AuthorizeView>
@* Audit Log — Admin only *@
<AuthorizeView Policy="@AuthorizationPolicies.RequireAdmin">
@@ -194,15 +194,12 @@
try
{
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
_siteConnections.Clear();
foreach (var site in _sites)
{
var connections = await SiteRepository.GetDataConnectionsBySiteIdAsync(site.Id);
if (connections.Count > 0)
{
_siteConnections[site.Id] = connections.ToList();
}
}
// CentralUI-012: fetch all data connections in one query and group
// them by site, instead of issuing one query per site (N+1).
_siteConnections = (await SiteRepository.GetAllDataConnectionsAsync())
.GroupBy(c => c.SiteId)
.ToDictionary(g => g.Key, g => g.ToList());
}
catch (Exception ex)
{
@@ -11,6 +11,7 @@
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
@inject ITemplateEngineRepository TemplateEngineRepository
@inject ISiteRepository SiteRepository
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@inject DebugStreamService DebugStreamService
@inject IJSRuntime JS
@implements IDisposable
@@ -292,11 +293,20 @@
private string? _initError;
// CentralUI-009: the stream callbacks (onEvent/onTerminated) run on an
// Akka/gRPC thread and capture `this` and `_toast`. Once the component is
// disposed, an in-flight callback must no-op rather than touch a disposed
// component (InvokeAsync would throw ObjectDisposedException) or a disposed
// ToastNotification.
private volatile bool _disposed;
protected override async Task OnInitializedAsync()
{
try
{
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
// Site scoping (CentralUI-002): a scoped Deployment user may only
// debug sites they are permitted on.
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
}
catch (Exception ex)
{
@@ -358,6 +368,14 @@
_siteInstances.Clear();
_selectedInstanceId = 0;
if (_selectedSiteId == 0) return;
// Site scoping (CentralUI-002): re-check the claim server-side — a query
// string or stale localStorage value could name a site outside the grant.
if (!await SiteScope.IsSiteAllowedAsync(_selectedSiteId))
{
_selectedSiteId = 0;
_toast.ShowError("You are not permitted to debug instances on that site.");
return;
}
try
{
_siteInstances = (await TemplateEngineRepository.GetInstancesBySiteIdAsync(_selectedSiteId))
@@ -385,15 +403,18 @@
_selectedInstanceId,
onEvent: evt =>
{
// CentralUI-009: the component may have been disposed while
// this event was in flight on the Akka/gRPC thread.
if (_disposed) return;
switch (evt)
{
case AttributeValueChanged av:
UpsertWithCap(_attributeValues, av.AttributeName, av);
_ = InvokeAsync(StateHasChanged);
SafeInvokeStateHasChanged();
break;
case AlarmStateChanged al:
UpsertWithCap(_alarmStates, al.AlarmName, al);
_ = InvokeAsync(StateHasChanged);
SafeInvokeStateHasChanged();
break;
}
},
@@ -401,8 +422,11 @@
{
_connected = false;
_session = null;
_ = InvokeAsync(() =>
// CentralUI-009: skip the toast/render if already disposed.
if (_disposed) return;
_ = SafeInvokeAsync(() =>
{
if (_disposed) return;
_toast.ShowError("Debug stream terminated (site disconnected).");
StateHasChanged();
});
@@ -535,8 +559,31 @@
_ => "—"
};
/// <summary>
/// Dispatches <paramref name="action"/> through <c>InvokeAsync</c> unless the
/// component has already been flagged as disposed (CentralUI-009). An
/// <see cref="ObjectDisposedException"/> raised by the dispatch itself is
/// swallowed, covering a disposal that lands between the flag check and the call.
/// </summary>
private async Task SafeInvokeAsync(Action action)
{
    if (!_disposed)
    {
        try
        {
            await InvokeAsync(action);
        }
        catch (ObjectDisposedException)
        {
            // Disposed after the flag check but before the dispatch: nothing left to update.
        }
    }
}

/// <summary>
/// Fire-and-forget re-render request routed through <see cref="SafeInvokeAsync"/>.
/// </summary>
private void SafeInvokeStateHasChanged()
{
    _ = SafeInvokeAsync(StateHasChanged);
}
public void Dispose()
{
// CentralUI-009: mark disposed first so any in-flight stream callback
// sees the flag and no-ops, then stop the stream synchronously.
_disposed = true;
if (_session != null)
{
DebugStreamService.StopStream(_session.SessionId);
@@ -7,6 +7,7 @@
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
@inject IDeploymentManagerRepository DeploymentManagerRepository
@inject ITemplateEngineRepository TemplateEngineRepository
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@implements IDisposable
<div class="container-fluid mt-3">
@@ -245,13 +246,23 @@
_errorMessage = null;
try
{
_records = (await DeploymentManagerRepository.GetAllDeploymentRecordsAsync())
.OrderByDescending(r => r.DeployedAt)
.ToList();
// Build instance name lookup
// Build instance lookups first — site scoping (CentralUI-002) filters
// deployment records by the site of their instance.
var instances = await TemplateEngineRepository.GetAllInstancesAsync();
_instanceNames = instances.ToDictionary(i => i.Id, i => i.UniqueName);
var instanceSiteIds = instances.ToDictionary(i => i.Id, i => i.SiteId);
var systemWide = await SiteScope.IsSystemWideAsync();
var permittedSiteIds = systemWide
? null
: await SiteScope.PermittedSiteIdsAsync();
_records = (await DeploymentManagerRepository.GetAllDeploymentRecordsAsync())
.Where(r => permittedSiteIds == null
|| (instanceSiteIds.TryGetValue(r.InstanceId, out var sid)
&& permittedSiteIds.Contains(sid)))
.OrderByDescending(r => r.DeployedAt)
.ToList();
_totalPages = Math.Max(1, (int)Math.Ceiling(_records.Count / (double)PageSize));
if (_currentPage > _totalPages) _currentPage = 1;
@@ -11,6 +11,7 @@
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
@inject ITemplateEngineRepository TemplateEngineRepository
@inject ISiteRepository SiteRepository
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@inject InstanceService InstanceService
@inject IFlatteningPipeline FlatteningPipeline
@inject AuthenticationStateProvider AuthStateProvider
@@ -377,6 +378,17 @@
return;
}
// Site scoping (CentralUI-002): a scoped Deployment user must not be
// able to configure or deploy an instance on a site outside their
// grant by navigating straight to its URL.
if (!await SiteScope.IsSiteAllowedAsync(_instance.SiteId))
{
_instance = null;
_errorMessage = "You are not permitted to manage instances on this site.";
_loading = false;
return;
}
// Identity
var template = await TemplateEngineRepository.GetTemplateByIdAsync(_instance.TemplateId);
_templateName = template?.Name ?? $"#{_instance.TemplateId}";
@@ -8,6 +8,7 @@
@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]
@inject ITemplateEngineRepository TemplateEngineRepository
@inject ISiteRepository SiteRepository
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@inject InstanceService InstanceService
@inject AuthenticationStateProvider AuthStateProvider
@inject NavigationManager NavigationManager
@@ -93,7 +94,9 @@
try
{
_templates = (await TemplateEngineRepository.GetAllTemplatesAsync()).ToList();
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
// Site scoping (CentralUI-002): a scoped Deployment user may only
// create instances on sites they are permitted on.
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
_allAreas.Clear();
foreach (var site in _sites)
@@ -124,6 +127,13 @@
if (string.IsNullOrWhiteSpace(_createName)) { _formError = "Instance name is required."; return; }
if (_createTemplateId == 0) { _formError = "Select a template."; return; }
if (_createSiteId == 0) { _formError = "Select a site."; return; }
// Site scoping (CentralUI-002): re-check server-side before the mutating
// command, independent of what the site dropdown was populated with.
if (!await SiteScope.IsSiteAllowedAsync(_createSiteId))
{
_formError = "You are not permitted to create instances on the selected site.";
return;
}
try
{
@@ -17,6 +17,7 @@
@inject AreaService AreaService
@inject InstanceService InstanceService
@inject AuthenticationStateProvider AuthStateProvider
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@inject NavigationManager NavigationManager
@inject IJSRuntime JSRuntime
@inject IDialogService Dialog
@@ -225,8 +226,13 @@
_errorMessage = null;
try
{
_allInstances = (await TemplateEngineRepository.GetAllInstancesAsync()).ToList();
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
// Site scoping (CentralUI-002): a scoped Deployment user only sees the
// sites — and therefore the areas/instances — they are permitted on.
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
var permittedSiteIds = _sites.Select(s => s.Id).ToHashSet();
_allInstances = (await TemplateEngineRepository.GetAllInstancesAsync())
.Where(i => permittedSiteIds.Contains(i.SiteId))
.ToList();
_templates = (await TemplateEngineRepository.GetAllTemplatesAsync()).ToList();
_allAreas.Clear();
@@ -1,5 +1,6 @@
@page "/monitoring/audit-log"
@using ScadaLink.Security
@using ScadaLink.CentralUI.Components
@using ScadaLink.Commons.Entities.Audit
@using ScadaLink.Commons.Interfaces.Repositories
@attribute [Authorize(Policy = AuthorizationPolicies.RequireAdmin)]
@@ -195,6 +196,12 @@
private DateTime? _filterFrom;
private DateTime? _filterTo;
// The datetime-local filter inputs are in the browser's local time zone.
// This holds new Date().getTimezoneOffset() so the values are converted to
// UTC (CentralUI-008) rather than relabelled. Until JS interop runs it is 0
// (UTC), which is a safe default for a UTC server/browser.
private int _browserUtcOffsetMinutes;
private List<AuditLogEntry>? _entries;
private int _totalCount;
private int _page = 1;
@@ -209,6 +216,23 @@
private int TotalPages => _pageSize > 0 ? Math.Max(1, (_totalCount + _pageSize - 1) / _pageSize) : 1;
private bool HasMore => _page * _pageSize < _totalCount;
/// <summary>
/// On the first render only, asks the browser for its time-zone offset and
/// stores it in <c>_browserUtcOffsetMinutes</c>. If the interop call fails
/// with one of the filtered exception types, the offset is reset to 0 (UTC).
/// </summary>
protected override async Task OnAfterRenderAsync(bool firstRender)
{
    if (firstRender)
    {
        int offsetMinutes;
        try
        {
            // Date.getTimezoneOffset() yields (UTC - local), expressed in minutes.
            offsetMinutes = await JS.InvokeAsync<int>(
                "eval", "new Date().getTimezoneOffset()");
        }
        catch (Exception ex) when (ex is JSException
            || ex is JSDisconnectedException
            || ex is InvalidOperationException
            || ex is TaskCanceledException)
        {
            // Interop unavailable (prerender / circuit gone): treat the browser as UTC.
            offsetMinutes = 0;
        }

        _browserUtcOffsetMinutes = offsetMinutes;
    }
}
private async Task Search()
{
_page = 1;
@@ -239,8 +263,8 @@
user: string.IsNullOrWhiteSpace(_filterUser) ? null : _filterUser.Trim(),
entityType: string.IsNullOrWhiteSpace(_filterEntityType) ? null : _filterEntityType.Trim(),
action: string.IsNullOrWhiteSpace(_filterAction) ? null : _filterAction.Trim(),
from: _filterFrom.HasValue ? new DateTimeOffset(_filterFrom.Value, TimeSpan.Zero) : null,
to: _filterTo.HasValue ? new DateTimeOffset(_filterTo.Value, TimeSpan.Zero) : null,
from: BrowserTime.LocalInputToUtc(_filterFrom, _browserUtcOffsetMinutes),
to: BrowserTime.LocalInputToUtc(_filterTo, _browserUtcOffsetMinutes),
page: _page,
pageSize: _pageSize);
@@ -1,10 +1,11 @@
@page "/monitoring/event-logs"
@attribute [Authorize]
@attribute [Authorize(Policy = ScadaLink.Security.AuthorizationPolicies.RequireDeployment)]
@using ScadaLink.Commons.Entities.Sites
@using ScadaLink.Commons.Interfaces.Repositories
@using ScadaLink.Commons.Messages.RemoteQuery
@using ScadaLink.Communication
@inject ISiteRepository SiteRepository
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@inject CommunicationService CommunicationService
<div class="container-fluid mt-3">
@@ -212,9 +213,16 @@
protected override async Task OnInitializedAsync()
{
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
// Site scoping (CentralUI-002): a scoped Deployment user may only query
// event logs for the sites they are permitted on.
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
}
// _sites is already filtered, so membership IS the scope check.
private bool SelectedSiteIsPermitted =>
!string.IsNullOrEmpty(_selectedSiteId)
&& _sites.Any(s => s.SiteIdentifier == _selectedSiteId);
private async Task Search()
{
_entries = new();
@@ -237,6 +245,14 @@
{
_searching = true;
_errorMessage = null;
// Site scoping (CentralUI-002): re-check before querying — the dropdown is
// filtered, but the selection must not be trusted on its own.
if (!SelectedSiteIsPermitted)
{
_errorMessage = "You are not permitted to view event logs for that site.";
_searching = false;
return;
}
try
{
var request = new EventLogQueryRequest(
@@ -1,11 +1,12 @@
@page "/monitoring/parked-messages"
@attribute [Authorize]
@attribute [Authorize(Policy = ScadaLink.Security.AuthorizationPolicies.RequireDeployment)]
@using ScadaLink.Commons.Entities.Sites
@using ScadaLink.Commons.Interfaces.Repositories
@using ScadaLink.Commons.Messages.RemoteQuery
@using ScadaLink.Commons.Types.Enums
@using ScadaLink.Communication
@inject ISiteRepository SiteRepository
@inject ScadaLink.CentralUI.Auth.SiteScopeService SiteScope
@inject CommunicationService CommunicationService
@inject IJSRuntime JS
@inject IDialogService Dialog
@@ -360,9 +361,17 @@
protected override async Task OnInitializedAsync()
{
_sites = (await SiteRepository.GetAllSitesAsync()).ToList();
// Site scoping (CentralUI-002): a scoped Deployment user may only inspect
// and act on parked messages for the sites they are permitted on.
_sites = await SiteScope.FilterSitesAsync(await SiteRepository.GetAllSitesAsync());
}
// True only when the currently selected SiteIdentifier is one this user is
// permitted on. _sites is already filtered, so membership IS the scope check.
private bool SelectedSiteIsPermitted =>
!string.IsNullOrEmpty(_selectedSiteId)
&& _sites.Any(s => s.SiteIdentifier == _selectedSiteId);
private async Task OnSiteChanged(ChangeEventArgs e)
{
_selectedSiteId = e.Value?.ToString() ?? string.Empty;
@@ -393,6 +402,15 @@
{
_searching = true;
_errorMessage = null;
// Site scoping (CentralUI-002): re-check before querying — the dropdown is
// filtered, but the selection must not be trusted on its own.
if (!SelectedSiteIsPermitted)
{
_errorMessage = "You are not permitted to view parked messages for that site.";
_messages = null;
_searching = false;
return;
}
try
{
var request = new ParkedMessageQueryRequest(
@@ -557,6 +575,7 @@
{
var ids = _selectedIds.ToList();
if (ids.Count == 0) return;
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return; }
var confirmed = await Dialog.ConfirmAsync(
"Retry parked messages",
@@ -587,6 +606,7 @@
{
var ids = _selectedIds.ToList();
if (ids.Count == 0) return;
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return; }
var confirmed = await Dialog.ConfirmAsync(
"Discard parked messages",
@@ -618,6 +638,7 @@
private async Task RetrySingle(ParkedMessageEntry msg)
{
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return; }
_actionInProgress = true;
_activeAction = "Retry";
try
@@ -638,6 +659,7 @@
private async Task<bool> DiscardSingle(ParkedMessageEntry msg)
{
if (!SelectedSiteIsPermitted) { _toast.ShowError("Not permitted for this site."); return false; }
var confirmed = await Dialog.ConfirmAsync(
"Discard parked message",
$"Permanently discard message {ShortId(msg.MessageId)}? This cannot be undone.",
@@ -15,6 +15,7 @@ namespace ScadaLink.CentralUI.Components.Shared;
/// RateOfChange { attributeName, thresholdPerSecond, windowSeconds, direction }
/// HiLo { attributeName, loLo, lo, hi, hiHi,
/// loLoPriority, loPriority, hiPriority, hiHiPriority }
/// Expression { expression }
///
/// All HiLo setpoints and per-setpoint priorities are optional — any subset
/// is valid (e.g., only Hi/HiHi configured for over-temperature protection).
@@ -93,6 +94,10 @@ internal static class AlarmTriggerConfigCodec
model.HiMessage = TryReadString(root, "hiMessage");
model.HiHiMessage = TryReadString(root, "hiHiMessage");
break;
case AlarmTriggerType.Expression:
model.Expression = TryReadString(root, "expression");
break;
}
}
catch (JsonException)
@@ -105,8 +110,10 @@ internal static class AlarmTriggerConfigCodec
/// <summary>
/// Serializes the model to the JSON shape AlarmActor.ParseEvalConfig
/// expects. Always writes <c>attributeName</c> (canonical key) and only
/// the keys relevant to the current trigger type.
/// expects. Writes <c>attributeName</c> (canonical key) for the
/// attribute-bound trigger types and only the keys relevant to the
/// current trigger type. <c>Expression</c> is not bound to a single
/// attribute, so <c>attributeName</c> is omitted for it.
/// </summary>
internal static string Serialize(AlarmTriggerModel model, AlarmTriggerType type)
{
@@ -114,7 +121,8 @@ internal static class AlarmTriggerConfigCodec
using (var w = new Utf8JsonWriter(stream))
{
w.WriteStartObject();
w.WriteString("attributeName", model.AttributeName ?? "");
if (type != AlarmTriggerType.Expression)
w.WriteString("attributeName", model.AttributeName ?? "");
switch (type)
{
@@ -155,6 +163,10 @@ internal static class AlarmTriggerConfigCodec
if (!string.IsNullOrEmpty(model.HiMessage)) w.WriteString("hiMessage", model.HiMessage);
if (!string.IsNullOrEmpty(model.HiHiMessage)) w.WriteString("hiHiMessage", model.HiHiMessage);
break;
case AlarmTriggerType.Expression:
w.WriteString("expression", model.Expression ?? "");
break;
}
w.WriteEndObject();
@@ -241,4 +253,7 @@ internal sealed class AlarmTriggerModel
public string? LoMessage { get; set; }
public string? HiMessage { get; set; }
public string? HiHiMessage { get; set; }
// Expression — boolean C# expression evaluated on attribute updates.
public string? Expression { get; set; }
}
@@ -12,6 +12,10 @@
<div class="border rounded bg-white p-3">
@* ── Monitored attribute ───────────────────────────────────────────── *@
@* Expression triggers reference attributes inside the C# expression itself,
so they do not use the single-attribute picker. *@
@if (TriggerType != AlarmTriggerType.Expression)
{
<div class="mb-3">
<label for="alarm-attr-select" class="form-label small text-uppercase text-muted fw-semibold mb-1">
Monitored attribute
@@ -67,6 +71,7 @@
</div>
}
</div>
}
@* ── Type-specific block ───────────────────────────────────────────── *@
@switch (TriggerType)
@@ -83,6 +88,9 @@
case AlarmTriggerType.HiLo:
@RenderHiLo();
break;
case AlarmTriggerType.Expression:
@RenderExpression();
break;
}
@* ── Hint ──────────────────────────────────────────────────────────── *@
@@ -559,6 +567,30 @@
await Emit();
}
// ── Expression ─────────────────────────────────────────────────────────
private RenderFragment RenderExpression() => __builder =>
{
<label class="form-label small text-uppercase text-muted fw-semibold mb-1">Trigger expression</label>
<MonacoEditor Height="120px"
Language="csharp"
ScriptKind="ScriptAnalysis.ScriptKind.Template"
ShowToolbar="false"
Value="@(_model.Expression ?? string.Empty)"
ValueChanged="OnExpressionChanged"
SelfAttributes="@TriggerAttributeMapper.SelfAttributes(AvailableAttributes)"
Children="@TriggerAttributeMapper.Children(AvailableAttributes)" />
<div class="form-text">
A boolean C# expression — e.g. <code>Attributes["Temperature"] &gt; 80</code>.
</div>
};
private async Task OnExpressionChanged(string value)
{
_model.Expression = value;
await Emit();
}
// ── Hint text ──────────────────────────────────────────────────────────
private string BuildHint()
@@ -582,6 +614,9 @@
AlarmTriggerType.HiLo => BuildHiLoHint(attr),
AlarmTriggerType.Expression =>
"Alarm is active while this expression is true.",
_ => string.Empty
};
}
@@ -150,6 +150,11 @@
public async ValueTask DisposeAsync()
{
// CentralUI-011: if the dialog is disposed while still open (the user
// navigated away), complete the pending task so the awaiting caller
// resumes deterministically instead of hanging forever.
_tcs?.TrySetResult(false);
if (_bodyLocked)
{
await TryUnlockBodyAsync();
@@ -10,7 +10,7 @@ namespace ScadaLink.CentralUI.Components.Shared;
/// trigger; <see cref="Unknown"/> is a stored trigger-type string the runtime
/// does not recognize (preserved as-is by the editor).
/// </summary>
internal enum ScriptTriggerKind
{
    None,
    Interval,
    ValueChange,
    Conditional,
    Call,
    // Boolean C# expression trigger.
    Expression,
    // Kept last: a stored trigger-type string the editor does not recognize.
    Unknown
}
/// <summary>A script's trigger as the editor emits it: a type string + config JSON.</summary>
public sealed record ScriptTriggerValue(string? TriggerType, string? Config);
@@ -29,6 +29,9 @@ internal sealed class ScriptTriggerModel
/// <summary>Comparison threshold (Conditional).</summary>
public double? Threshold { get; set; }
/// <summary>Boolean C# expression (Expression).</summary>
public string? Expression { get; set; }
}
/// <summary>
@@ -40,6 +43,7 @@ internal sealed class ScriptTriggerModel
/// ValueChange { attributeName }
/// Conditional { attributeName, operator, threshold }
/// Call { }
/// Expression { expression }
///
/// Parsing also accepts the legacy aliases <c>attribute</c> and <c>value</c> so
/// older configs survive a round-trip through the editor.
@@ -59,6 +63,7 @@ internal static class ScriptTriggerConfigCodec
"valuechange" => ScriptTriggerKind.ValueChange,
"conditional" => ScriptTriggerKind.Conditional,
"call" => ScriptTriggerKind.Call,
"expression" => ScriptTriggerKind.Expression,
_ => ScriptTriggerKind.Unknown
};
}
@@ -70,6 +75,7 @@ internal static class ScriptTriggerConfigCodec
ScriptTriggerKind.ValueChange => "ValueChange",
ScriptTriggerKind.Conditional => "Conditional",
ScriptTriggerKind.Call => "Call",
ScriptTriggerKind.Expression => "Expression",
_ => null
};
@@ -104,6 +110,10 @@ internal static class ScriptTriggerConfigCodec
model.Operator = NormalizeOperator(op);
model.Threshold = TryReadDouble(root, "threshold") ?? TryReadDouble(root, "value");
break;
case ScriptTriggerKind.Expression:
    // Accept only a JSON string. GetString() on a number/object/array throws
    // InvalidOperationException, which the catch (JsonException) guarding this
    // parse would not handle; any non-string value is treated as "no expression".
    model.Expression =
        root.TryGetProperty("expression", out var e) && e.ValueKind == JsonValueKind.String
            ? e.GetString()
            : null;
    break;
}
}
catch (JsonException)
@@ -144,6 +154,10 @@ internal static class ScriptTriggerConfigCodec
w.WriteNumber("threshold", model.Threshold.Value);
break;
case ScriptTriggerKind.Expression:
w.WriteString("expression", model.Expression ?? "");
break;
// Call → empty object.
}
w.WriteEndObject();
@@ -23,6 +23,7 @@
<option value="Interval">Interval — run on a fixed timer</option>
<option value="ValueChange">Value change — run when an attribute changes</option>
<option value="Conditional">Conditional — run when a condition is met</option>
<option value="Expression">Expression — run when a boolean expression becomes true</option>
<option value="Call">Call — run only when invoked by another script</option>
@if (_kind == ScriptTriggerKind.Unknown)
{
@@ -45,6 +46,9 @@
case ScriptTriggerKind.Conditional:
@RenderConditional();
break;
case ScriptTriggerKind.Expression:
@RenderExpression();
break;
case ScriptTriggerKind.Call:
<div class="small text-muted">
No automatic trigger — this script runs only when another script
@@ -62,7 +66,8 @@
}
@* ── Hint ──────────────────────────────────────────────────────────── *@
@if (_kind is ScriptTriggerKind.Interval or ScriptTriggerKind.ValueChange or ScriptTriggerKind.Conditional)
@if (_kind is ScriptTriggerKind.Interval or ScriptTriggerKind.ValueChange
or ScriptTriggerKind.Conditional or ScriptTriggerKind.Expression)
{
<div class="mt-3 pt-2 border-top small text-muted">@BuildHint()</div>
}
@@ -244,6 +249,30 @@
await Emit();
}
// ── Expression ─────────────────────────────────────────────────────────
private RenderFragment RenderExpression() => __builder =>
{
<label class="form-label small text-uppercase text-muted fw-semibold mb-1">Trigger expression</label>
<MonacoEditor Height="120px"
Language="csharp"
ScriptKind="ScriptAnalysis.ScriptKind.Template"
ShowToolbar="false"
Value="@(_model.Expression ?? string.Empty)"
ValueChanged="OnExpressionChanged"
SelfAttributes="@TriggerAttributeMapper.SelfAttributes(AvailableAttributes)"
Children="@TriggerAttributeMapper.Children(AvailableAttributes)" />
<div class="form-text">
A boolean C# expression — e.g. <code>Attributes["Temperature"] &gt; 80</code>.
</div>
};
private async Task OnExpressionChanged(string value)
{
_model.Expression = value;
await Emit();
}
// ── Attribute picker (ValueChange + Conditional) ───────────────────────
private RenderFragment RenderAttributePicker(string label) => __builder =>
@@ -315,6 +344,9 @@
? $"Runs when {attr} changes, if {attr} {_model.Operator} {t.ToString("0.###", CultureInfo.InvariantCulture)}."
: $"Runs when {attr} changes and meets the configured condition — set a threshold above.",
ScriptTriggerKind.Expression =>
"Runs once each time this expression becomes true.",
_ => string.Empty
};
}
@@ -30,6 +30,16 @@
private readonly List<ToastItem> _toasts = new();
private readonly object _lock = new();
// Cancels all pending auto-dismiss delays when the component is disposed
// (CentralUI-010) so their continuations never touch a disposed component.
private readonly CancellationTokenSource _disposalCts = new();
/// <summary>Number of toasts currently displayed.</summary>
public int ToastCount
{
get { lock (_lock) { return _toasts.Count; } }
}
public void ShowSuccess(string message, string title = "Success", int? autoDismissMs = null)
{
AddToast(title, message, ToastType.Success, autoDismissMs);
@@ -52,6 +62,9 @@
private void AddToast(string title, string message, ToastType type, int? autoDismissMs)
{
// If the component is already disposed, do not add or schedule anything.
if (_disposalCts.IsCancellationRequested) return;
var toast = new ToastItem { Title = title, Message = message, Type = type };
lock (_lock)
{
@@ -60,14 +73,41 @@
StateHasChanged();
var dismissMs = autoDismissMs ?? DefaultAutoDismissMs;
_ = Task.Delay(dismissMs).ContinueWith(_ =>
_ = AutoDismissAsync(toast, dismissMs, _disposalCts.Token);
}
/// <summary>
/// Removes a toast after its dismiss delay. The delay is bound to the
/// component's disposal token (CentralUI-010): if the host page is disposed
/// first, the delay is cancelled and the continuation never touches the
/// disposed component — no <see cref="ObjectDisposedException"/> escapes.
/// </summary>
private async Task AutoDismissAsync(ToastItem toast, int dismissMs, CancellationToken token)
{
try
{
lock (_lock)
{
_toasts.Remove(toast);
}
InvokeAsync(StateHasChanged);
});
await Task.Delay(dismissMs, token);
}
catch (OperationCanceledException)
{
return;
}
if (token.IsCancellationRequested) return;
lock (_lock)
{
_toasts.Remove(toast);
}
try
{
await InvokeAsync(StateHasChanged);
}
catch (ObjectDisposedException)
{
// Component disposed between the token check and the render — ignore.
}
}
private void Dismiss(ToastItem toast)
@@ -87,7 +127,11 @@
_ => "bg-secondary text-white"
};
public void Dispose() { }
public void Dispose()
{
_disposalCts.Cancel();
_disposalCts.Dispose();
}
private enum ToastType { Success, Error, Warning, Info }
@@ -0,0 +1,49 @@
using ScadaLink.CentralUI.ScriptAnalysis;
namespace ScadaLink.CentralUI.Components.Shared;
/// <summary>
/// Maps the trigger editors' flattened <see cref="AlarmAttributeChoice"/> list
/// into the metadata the <see cref="MonacoEditor"/> uses to drive C# completion
/// inside an expression trigger:
/// <list type="bullet">
/// <item>Direct + Inherited choices become <see cref="AttributeShape"/>s,
/// surfaced under <c>Attributes["..."]</c>.</item>
/// <item>Composed choices — whose canonical name is dotted, e.g.
/// <c>CoolingTank.Temp</c> — are grouped by their composition-instance prefix
/// into <see cref="CompositionContext"/>s, surfaced under
/// <c>Children["..."].Attributes["..."]</c>.</item>
/// </list>
/// </summary>
public static class TriggerAttributeMapper
{
    /// <summary>Direct and inherited attributes, exposed as <c>Attributes["..."]</c>.</summary>
    /// <param name="choices">The flattened attribute choices; must not be null.</param>
    /// <returns>One <see cref="AttributeShape"/> per Direct/Inherited choice, in input order.</returns>
    public static IReadOnlyList<AttributeShape> SelfAttributes(
        IReadOnlyList<AlarmAttributeChoice> choices) =>
        choices
            .Where(c => c.Source is "Direct" or "Inherited")
            .Select(c => new AttributeShape(c.CanonicalName, c.DataType))
            .ToList();

    /// <summary>
    /// Composed attributes grouped by composition-instance name, exposed as
    /// <c>Children["X"].Attributes["Y"]</c>. Entries without a dotted prefix
    /// are skipped (no child scope to attach them to) — this includes names with
    /// no dot at all and names that start with a dot (empty prefix).
    /// </summary>
    /// <param name="choices">The flattened attribute choices; must not be null.</param>
    /// <returns>
    /// One <see cref="CompositionContext"/> per distinct prefix (the text before the
    /// first dot, compared ordinally), each carrying the attributes found under it
    /// and an empty script list.
    /// </returns>
    public static IReadOnlyList<CompositionContext> Children(
        IReadOnlyList<AlarmAttributeChoice> choices) =>
        choices
            .Where(c => c.Source == "Composed")
            // Locate the separator once; it is reused for both halves of the split.
            .Select(c => (Choice: c, Dot: c.CanonicalName.IndexOf('.')))
            // Dot <= 0 means "no dot" (-1) or "empty prefix" (0): no child scope exists.
            .Where(x => x.Dot > 0)
            .Select(x => new
            {
                Child = x.Choice.CanonicalName[..x.Dot],
                Member = x.Choice.CanonicalName[(x.Dot + 1)..],
                x.Choice.DataType
            })
            .GroupBy(x => x.Child, StringComparer.Ordinal)
            .Select(g => new CompositionContext(
                g.Key,
                g.Select(x => new AttributeShape(x.Member, x.DataType)).ToList(),
                Array.Empty<ScriptShape>()))
            .ToList();
}
@@ -16,8 +16,8 @@
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="4.13.0" />
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Workspaces" Version="4.13.0" />
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Scripting" />
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Workspaces" />
</ItemGroup>
<ItemGroup>
@@ -0,0 +1,106 @@
using System.Text;
namespace ScadaLink.CentralUI.ScriptAnalysis;
/// <summary>
/// Per-call console capture for the Test Run sandbox.
/// <para>
/// Sandbox scripts use <c>System.Console.WriteLine</c> for ad-hoc output. The
/// sandbox needs to capture that output per execution. <c>Console.Out</c> is,
/// however, <b>process-global</b>: redirecting it with <c>Console.SetOut</c> for
/// the duration of one run corrupts any other run executing concurrently —
/// outputs interleave, and whichever run finishes first restores
/// <c>Console.Out</c> while the others are still writing (CentralUI-003).
/// </para>
/// <para>
/// This writer is installed into <c>Console.Out</c>/<c>Console.Error</c>
/// <b>exactly once</b> (see <see cref="Install"/>) and never removed. Each
/// concurrent run pushes its own buffer onto an <see cref="AsyncLocal{T}"/>
/// scope via <see cref="BeginCapture"/>; writes on that run's logical call-tree
/// land in that run's buffer only. Writes made on threads with no active
/// capture scope (i.e. genuine host-process console output) fall through to the
/// original writer. No process-global mutation happens per run.
/// </para>
/// </summary>
internal sealed class SandboxConsoleCapture : TextWriter
{
    private static readonly object InstallLock = new();
    private static SandboxConsoleCapture? _outInstance;
    private static SandboxConsoleCapture? _errorInstance;

    // The writer that was in Console.Out / Console.Error when Install() ran.
    private readonly TextWriter _fallback;

    // The capture buffer for the current logical call-tree; null when no scope is active.
    private readonly AsyncLocal<StringWriter?> _current = new();

    private SandboxConsoleCapture(TextWriter fallback) => _fallback = fallback;

    /// <summary>Reports the encoding of the original (fallback) writer.</summary>
    public override Encoding Encoding => _fallback.Encoding;

    /// <summary>
    /// Installs the routing writers into <see cref="Console.Out"/> and
    /// <see cref="Console.Error"/> once for the process. Idempotent and
    /// thread-safe. Subsequent calls return the already-installed instances.
    /// </summary>
    /// <returns>The routing writers for standard output and standard error.</returns>
    public static (SandboxConsoleCapture Out, SandboxConsoleCapture Error) Install()
    {
        // Fast path: both writers already exist, no lock needed.
        if (_outInstance != null && _errorInstance != null)
            return (_outInstance, _errorInstance);

        lock (InstallLock)
        {
            if (_outInstance == null)
            {
                _outInstance = new SandboxConsoleCapture(Console.Out);
                Console.SetOut(_outInstance);
            }

            if (_errorInstance == null)
            {
                _errorInstance = new SandboxConsoleCapture(Console.Error);
                Console.SetError(_errorInstance);
            }
        }

        return (_outInstance, _errorInstance);
    }

    /// <summary>
    /// Begins a capture scope on the current logical (async) call-tree. All
    /// console writes from this point until the returned scope is disposed are
    /// routed into <paramref name="buffer"/> instead of the original writer.
    /// The scope is restored on dispose, so nesting and concurrent scopes on
    /// other call-trees are unaffected.
    /// </summary>
    /// <param name="buffer">The buffer that receives the captured output.</param>
    /// <returns>A scope that restores the previous capture buffer when disposed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    public CaptureScope BeginCapture(StringWriter buffer)
    {
        // A null buffer would silently route "captured" output to the host console.
        ArgumentNullException.ThrowIfNull(buffer);

        var previous = _current.Value;
        _current.Value = buffer;
        return new CaptureScope(this, previous);
    }

    public override void Write(char value) => Target.Write(value);

    public override void Write(string? value) => Target.Write(value);

    public override void Write(char[] buffer, int index, int count) =>
        Target.Write(buffer, index, count);

    public override void WriteLine() => Target.WriteLine();

    public override void WriteLine(string? value) => Target.WriteLine(value);

    /// <summary>
    /// Forwards a flush to the active target. Without this override the base
    /// <see cref="TextWriter.Flush"/> does nothing, so once this writer is installed
    /// <c>Console.Out.Flush()</c> would never reach a buffering original writer.
    /// </summary>
    public override void Flush() => Target.Flush();

    // The active capture buffer for this call-tree, or the original writer if none.
    private TextWriter Target => _current.Value ?? _fallback;

    /// <summary>
    /// Handle returned by <see cref="BeginCapture"/>; disposing it puts back the
    /// capture buffer (possibly none) that was active when the scope began.
    /// </summary>
    internal readonly struct CaptureScope : IDisposable
    {
        private readonly SandboxConsoleCapture _owner;
        private readonly StringWriter? _previous;

        internal CaptureScope(SandboxConsoleCapture owner, StringWriter? previous)
        {
            _owner = owner;
            _previous = previous;
        }

        public void Dispose() => _owner._current.Value = _previous;
    }
}
@@ -19,11 +19,11 @@ public static class ScriptAnalysisEndpoints
group.MapPost("/completions", async (CompletionsRequest req, ScriptAnalysisService svc) =>
Results.Ok(await svc.CompleteAsync(req)));
group.MapPost("/hover", (HoverRequest req, ScriptAnalysisService svc) =>
Results.Ok(svc.Hover(req)));
group.MapPost("/hover", async (HoverRequest req, ScriptAnalysisService svc) =>
Results.Ok(await svc.Hover(req)));
group.MapPost("/signature-help", (SignatureHelpRequest req, ScriptAnalysisService svc) =>
Results.Ok(svc.SignatureHelp(req)));
group.MapPost("/signature-help", async (SignatureHelpRequest req, ScriptAnalysisService svc) =>
Results.Ok(await svc.SignatureHelp(req)));
group.MapPost("/format", (FormatRequest req, ScriptAnalysisService svc) =>
Results.Ok(svc.Format(req)));
@@ -165,8 +165,10 @@ public class ScriptAnalysisService
/// because a shared script has no template siblings in this context.
/// For the SandboxInboundScriptHost surface, every <c>Route</c> call throws
/// because cross-site routing needs a deployed site.
/// Console.Out / Console.Error are redirected per-call so writes from
/// the script land in the result.
/// Console.Out / Console.Error are captured per-call via an AsyncLocal
/// scope (see <see cref="SandboxConsoleCapture"/>) so writes from the script
/// land in the result without mutating process-global Console state — two
/// concurrent Test Runs do not interfere with each other.
/// </summary>
public async Task<SandboxRunResult> RunInSandboxAsync(SandboxRunRequest request, CancellationToken ct)
{
@@ -220,6 +222,20 @@ public class ScriptAnalysisService
SandboxErrorKind.CompileError, 0, markers);
}
// Trust-model gate (CentralUI-001): the documented forbidden-API set is
// enforced HERE, before execution — not merely surfaced as an editor hint.
// Without this, a Design-role user could run arbitrary file/process/
// reflection/network code in the central host process.
var trustViolations = EnforceTrustModel(script.GetCompilation());
if (trustViolations.Count > 0)
{
return new SandboxRunResult(false, null, null, "",
"Script blocked by the trust model — it references forbidden APIs "
+ "(System.IO, System.Diagnostics, System.Reflection, System.Net, threading). "
+ "See the highlighted diagnostics.",
SandboxErrorKind.CompileError, 0, trustViolations);
}
var parameters = ConvertJsonParameters(request.Parameters);
using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
@@ -311,6 +327,13 @@ public class ScriptAnalysisService
throw new ScriptSandboxException(
$"Scripts.CallShared(\"{name}\") compile failed: {string.Join("; ", nestedErrors.Select(d => d.GetMessage()))}");
// Trust-model gate (CentralUI-001) — a nested shared script runs
// arbitrary code too, so it must clear the same forbidden-API gate.
if (EnforceTrustModel(built.GetCompilation()).Count > 0)
throw new ScriptSandboxException(
$"Scripts.CallShared(\"{name}\") is blocked by the script trust model — "
+ "the shared script references forbidden APIs.");
lock (compileCacheLock)
{
if (!compileCache.TryGetValue(name, out compiled))
@@ -356,16 +379,19 @@ public class ScriptAnalysisService
Instance = instanceContext,
};
var originalOut = Console.Out;
var originalError = Console.Error;
// Console capture is routed per-call via an AsyncLocal scope (CentralUI-003).
// Console.Out is process-global, so it must NOT be redirected per run — two
// concurrent Test Runs would interleave output and the first to finish would
// restore Console.Out while the other is still writing. SandboxConsoleCapture
// installs routing writers once and scopes capture to this call-tree only.
var (captureOut, captureError) = SandboxConsoleCapture.Install();
var captured = new StringWriter();
using var outScope = captureOut.BeginCapture(captured);
using var errorScope = captureError.BeginCapture(captured);
var stopwatch = Stopwatch.StartNew();
try
{
Console.SetOut(captured);
Console.SetError(captured);
// Run on a thread-pool thread with no SynchronizationContext: a
// bound script's Instance.SetAttribute / Attributes[...] block
// synchronously on cross-site I/O (the API surface is sync by
@@ -416,11 +442,9 @@ public class ScriptAnalysisService
$"{inner.GetType().Name}: {inner.Message}",
SandboxErrorKind.RuntimeError, stopwatch.ElapsedMilliseconds, null);
}
finally
{
Console.SetOut(originalOut);
Console.SetError(originalError);
}
// outScope / errorScope are disposed by their `using` declarations when the
// method returns, restoring the previous capture scope on this call-tree
// without touching process-global Console state.
}
private static Dictionary<string, object?> ConvertJsonParameters(
@@ -698,7 +722,7 @@ public class ScriptAnalysisService
public InlayHintsResponse InlayHints(InlayHintsRequest request) =>
new(Array.Empty<InlayHint>());
public HoverResponse Hover(HoverRequest request)
public async Task<HoverResponse> Hover(HoverRequest request)
{
var script = TryParse(request.CodeText);
if (script == null) return new HoverResponse(null);
@@ -738,13 +762,13 @@ public class ScriptAnalysisService
var rawName = token.ValueText;
if (string.IsNullOrEmpty(rawName)) return new HoverResponse(null);
var shape = ResolveCalledShape(
var shape = await ResolveCalledShape(
call, rawName, request.SiblingScripts, request.Children, request.Parent);
if (shape == null) return new HoverResponse(null);
return new HoverResponse(FormatHover(shape, call));
}
public SignatureHelpResponse SignatureHelp(SignatureHelpRequest request)
public async Task<SignatureHelpResponse> SignatureHelp(SignatureHelpRequest request)
{
var empty = new SignatureHelpResponse(null, null, 0);
var script = TryParse(request.CodeText);
@@ -779,7 +803,7 @@ public class ScriptAnalysisService
var nameArg = inv.ArgumentList.Arguments[0].Expression as LiteralExpressionSyntax;
var scriptName = nameArg?.Token.ValueText ?? "";
var shape = ResolveCalledShape(
var shape = await ResolveCalledShape(
call, scriptName, request.SiblingScripts, request.Children, request.Parent);
if (shape == null) return empty;
@@ -940,22 +964,35 @@ public class ScriptAnalysisService
_ => "script"
};
/// <summary>Resolves the called script's shape from the metadata in scope for its kind.</summary>
private ScriptShape? ResolveCalledShape(
/// <summary>
/// Resolves the called script's shape from the metadata in scope for its kind.
/// CentralUI-013: the shared-script catalog is awaited rather than blocked on
/// with <c>.GetAwaiter().GetResult()</c>, so this method is async — and
/// <see cref="Hover"/> / <see cref="SignatureHelp"/> are async with it.
/// </summary>
private async Task<ScriptShape?> ResolveCalledShape(
ScriptCallInfo call,
string scriptName,
IReadOnlyList<ScriptShape>? siblings,
IReadOnlyList<CompositionContext>? children,
CompositionContext? parent) => call.Kind switch
CompositionContext? parent)
{
ScriptCallKind.Shared => _sharedScripts.GetShapesAsync().GetAwaiter().GetResult()
.FirstOrDefault(s => s.Name == scriptName),
ScriptCallKind.Sibling => siblings?.FirstOrDefault(s => s.Name == scriptName),
ScriptCallKind.Parent => parent?.Scripts.FirstOrDefault(s => s.Name == scriptName),
ScriptCallKind.Child => children?.FirstOrDefault(c => c.Name == call.CompositionName)
?.Scripts.FirstOrDefault(s => s.Name == scriptName),
_ => null
};
switch (call.Kind)
{
case ScriptCallKind.Shared:
var shapes = await _sharedScripts.GetShapesAsync();
return shapes.FirstOrDefault(s => s.Name == scriptName);
case ScriptCallKind.Sibling:
return siblings?.FirstOrDefault(s => s.Name == scriptName);
case ScriptCallKind.Parent:
return parent?.Scripts.FirstOrDefault(s => s.Name == scriptName);
case ScriptCallKind.Child:
return children?.FirstOrDefault(c => c.Name == call.CompositionName)
?.Scripts.FirstOrDefault(s => s.Name == scriptName);
default:
return null;
}
}
/// <summary>
/// SCADA006 — flag <c>Attributes["typo"]</c>,
@@ -1086,15 +1123,25 @@ public class ScriptAnalysisService
return new(AttributeContextKind.None, null);
}
/// <summary>
/// Finds every reference to a forbidden API — the documented script trust model,
/// see <see cref="ForbiddenNamespacePrefixes"/>. Identifiers are resolved against
/// the semantic model, so a forbidden type or member is caught however it is
/// written: bare (<c>File</c>), fully qualified
/// (<c>System.IO.File.WriteAllText</c>), or via an alias — while a user identifier
/// that merely shares a name with a forbidden type (<c>var File = …</c>) does not
/// false-positive. Used both for editor diagnostics and as the pre-execution
/// trust-model gate (see <see cref="EnforceTrustModel"/>).
/// </summary>
private static IEnumerable<DiagnosticMarker> FindForbiddenApiUsages(SyntaxTree tree, SemanticModel model)
{
var root = tree.GetRoot();
// Banned using directives — pure namespace string match is fine here.
// Banned using directives.
foreach (var u in root.DescendantNodes().OfType<UsingDirectiveSyntax>())
{
var name = u.Name?.ToString() ?? "";
if (ForbiddenNamespacePrefixes.Any(p => name == p || name.StartsWith(p + ".")))
if (IsForbiddenName(name))
{
var span = u.GetLocation().GetLineSpan().Span;
yield return new DiagnosticMarker(
@@ -1108,20 +1155,14 @@ public class ScriptAnalysisService
}
}
// Banned type usages — resolved via the semantic model so a user
// identifier named "File" or "Thread" does NOT trigger the diagnostic
// unless it actually resolves to a forbidden type.
// Banned type / member references, resolved via the semantic model. Every
// identifier is checked — including the right-hand side of a member access —
// so a fully-qualified forbidden call (System.IO.File.WriteAllText) cannot
// slip past by avoiding a `using` directive or a bare type name.
foreach (var ident in root.DescendantNodes().OfType<IdentifierNameSyntax>())
{
// Skip the identifier on the right side of a member access — only
// the leftmost (the type or qualifier) is what we want to check.
if (ident.Parent is MemberAccessExpressionSyntax m && m.Name == ident) continue;
var symbol = model.GetSymbolInfo(ident).Symbol;
if (symbol is not INamedTypeSymbol type) continue;
var ns = type.ContainingNamespace?.ToDisplayString() ?? "";
if (!ForbiddenNamespacePrefixes.Any(p => ns == p || ns.StartsWith(p + "."))) continue;
var forbidden = ForbiddenNameFor(model.GetSymbolInfo(ident).Symbol);
if (forbidden == null) continue;
var span = ident.GetLocation().GetLineSpan().Span;
yield return new DiagnosticMarker(
@@ -1130,11 +1171,75 @@ public class ScriptAnalysisService
StartColumn: span.Start.Character + 1,
EndLineNumber: span.End.Line + 1,
EndColumn: span.End.Character + 1,
Message: $"Type '{type.Name}' from forbidden namespace '{ns}' is not allowed in scripts.",
Message: $"'{ident.Identifier.ValueText}' resolves to forbidden API '{forbidden}', " +
"which is not allowed in scripts (script trust model).",
Code: "SCADA002");
}
}
/// <summary>
/// Looks up which forbidden namespace or type, if any, a resolved symbol touches.
/// Every qualified name the symbol implicates (its namespace and, for a type or
/// member, the type's full name) is tested in order; the first forbidden one wins.
/// Because full type names are tested too, a list entry naming one exact type
/// (e.g. <c>System.Threading.Thread</c>) bans only that type and leaves the rest
/// of its namespace (e.g. <c>CancellationToken</c>) usable.
/// </summary>
/// <param name="symbol">The symbol an identifier resolved to; may be null.</param>
/// <returns>The first forbidden qualified name, or null when the symbol is allowed or unresolved.</returns>
private static string? ForbiddenNameFor(ISymbol? symbol) =>
    symbol is null
        ? null
        : QualifiedNamesOf(symbol).FirstOrDefault(IsForbiddenName);
/// <summary>
/// Enumerates the fully-qualified names that a symbol reference implicates for
/// trust-model checking. A non-global namespace yields its own display name. A type
/// yields its containing namespace (when not global) followed by its full type name.
/// Any other symbol yields the same pair for its containing type, or nothing when it
/// has no containing type.
/// </summary>
private static IEnumerable<string> QualifiedNamesOf(ISymbol symbol)
{
    if (symbol is INamespaceSymbol { IsGlobalNamespace: false } namespaceSymbol)
    {
        yield return namespaceSymbol.ToDisplayString();
        yield break;
    }

    // A type implicates itself; a member (method, property, field, …) implicates
    // the type that declares it. Anything else implicates nothing.
    ITypeSymbol? implicated = symbol as ITypeSymbol ?? symbol.ContainingType;
    if (implicated is null)
        yield break;

    if (implicated.ContainingNamespace is { IsGlobalNamespace: false } owningNamespace)
        yield return owningNamespace.ToDisplayString();

    yield return FullTypeName(implicated);
}
/// <summary>
/// Builds "Namespace.TypeName" for a type, or just the type's simple name when it
/// has no containing namespace or lives in the global namespace. Only the simple
/// <c>Name</c> is used, so a containing type is not part of the result.
/// </summary>
private static string FullTypeName(ITypeSymbol type)
{
    var owningNamespace = type.ContainingNamespace;
    if (owningNamespace is null || owningNamespace.IsGlobalNamespace)
        return type.Name;

    return owningNamespace.ToDisplayString() + "." + type.Name;
}
/// <summary>
/// True when <paramref name="qualifiedName"/> equals an entry of
/// <c>ForbiddenNamespacePrefixes</c> or sits beneath one (entry followed by a dot).
/// The comparison is ordinal, so "System.IOx" does not match the entry "System.IO".
/// </summary>
private static bool IsForbiddenName(string qualifiedName)
{
    foreach (var prefix in ForbiddenNamespacePrefixes)
    {
        if (qualifiedName == prefix)
            return true;

        if (qualifiedName.StartsWith(prefix + ".", StringComparison.Ordinal))
            return true;
    }

    return false;
}
/// <summary>
/// Pre-execution trust-model gate (CentralUI-001). Returns the forbidden-API
/// markers (SCADA001/SCADA002) for a compiled script; an empty list means the
/// script is clear to run. This is a static semantic gate, not a process
/// sandbox — reflection-based indirection is still out of its reach; full
/// isolation would require running scripts in a separate constrained process.
/// <para>
/// Every syntax tree in the compilation is inspected, not just the first one, so
/// code that reaches the compilation through an additional tree cannot bypass the
/// gate. Markers carry line/column positions relative to the tree they were found
/// in, which for a secondary tree is not the text shown in the editor.
/// </para>
/// </summary>
/// <param name="compilation">The script's compilation.</param>
/// <returns>All forbidden-API markers found; empty when the script is allowed to run.</returns>
private static IReadOnlyList<DiagnosticMarker> EnforceTrustModel(Compilation compilation)
{
    var violations = new List<DiagnosticMarker>();

    foreach (var tree in compilation.SyntaxTrees)
    {
        var model = compilation.GetSemanticModel(tree);
        violations.AddRange(FindForbiddenApiUsages(tree, model));
    }

    return violations;
}
private static CompletionItem ToCompletionItem(ISymbol symbol)
{
var kind = symbol.Kind switch
@@ -18,6 +18,10 @@ public static class ServiceCollectionExtensions
services.AddScoped<AuthenticationStateProvider, CookieAuthenticationStateProvider>();
services.AddCascadingAuthenticationState();
// Resolves the current user's permitted site set from their SiteId claims
// so Deployment/Monitoring pages can enforce site scoping (CentralUI-002).
services.AddScoped<SiteScopeService>();
// Centralised dialog service: pages inject IDialogService and a single
// <DialogHost /> in MainLayout renders the active dialog. See
// Components/Shared/IDialogService.cs.
@@ -1,11 +1,75 @@
namespace ScadaLink.ClusterInfrastructure;
/// <summary>
/// Cluster configuration model, bound from the <c>ScadaLink:Cluster</c> section
/// of <c>appsettings.json</c> via the Options pattern.
/// <para>
/// This project owns the cluster <em>configuration contract</em>. The actual
/// Akka.NET bootstrap — building the HOCON from these values, starting the
/// <c>ActorSystem</c>, configuring the split-brain resolver and wiring
/// <c>CoordinatedShutdown</c> — lives in <c>ScadaLink.Host</c>
/// (see <c>Component-ClusterInfrastructure.md</c> → "Implementation Note — Code Placement").
/// </para>
/// <para>
/// Node-identity settings (remoting hostname/port, cluster role, site identifier,
/// gRPC port) are deliberately <em>not</em> here — they are owned by
/// <c>ScadaLink.Host.NodeOptions</c> (<c>ScadaLink:Node</c> section). Local SQLite
/// storage paths are owned by the database / store-and-forward options. This class
/// holds only the cluster-formation and failure-detection settings shared by every node.
/// </para>
/// <para>
/// The "must" constraints documented on the properties below are checked by
/// <see cref="ClusterOptionsValidator"/>; the defaults declared here satisfy every
/// check except the seed-node one, which needs at least one configured entry.
/// </para>
/// </summary>
public class ClusterOptions
{
/// <summary>
/// The <c>appsettings.json</c> section name this options class binds from.
/// Single source of truth so binding sites do not hard-code the magic string.
/// </summary>
public const string SectionName = "ScadaLink:Cluster";
/// <summary>
/// Akka.NET cluster seed nodes. Both nodes are seed nodes — each node lists
/// itself and its partner — so either can start first and form the cluster.
/// Must contain at least one entry. Defaults to an empty list, so an unconfigured
/// section is rejected by <see cref="ClusterOptionsValidator"/> (which also rejects null).
/// </summary>
public List<string> SeedNodes { get; set; } = new();
/// <summary>
/// Split-brain resolver strategy. Must be <c>keep-oldest</c> for the two-node
/// clusters ScadaLink uses: quorum strategies (<c>keep-majority</c>,
/// <c>static-quorum</c>) cannot distinguish a crash from a partition with only
/// two nodes and would shut down the whole cluster.
/// The validator compares this value case-insensitively.
/// </summary>
public string SplitBrainResolverStrategy { get; set; } = "keep-oldest";
/// <summary>
/// Time the cluster membership must remain stable before the split-brain
/// resolver acts to down unreachable nodes. Must be positive. Default 15s.
/// </summary>
public TimeSpan StableAfter { get; set; } = TimeSpan.FromSeconds(15);
/// <summary>
/// Frequency of cluster failure-detector heartbeat messages between nodes.
/// Must be well below <see cref="FailureDetectionThreshold"/>. Default 2s.
/// The validator enforces "positive" and "strictly less than
/// <see cref="FailureDetectionThreshold"/>"; it does not enforce a specific margin.
/// </summary>
public TimeSpan HeartbeatInterval { get; set; } = TimeSpan.FromSeconds(2);
/// <summary>
/// Time without a heartbeat before a node is considered unreachable
/// (Akka's <c>acceptable-heartbeat-pause</c>). Must be positive. Default 10s.
/// </summary>
public TimeSpan FailureDetectionThreshold { get; set; } = TimeSpan.FromSeconds(10);
/// <summary>
/// Akka's <c>min-nr-of-members</c>. Must be <c>1</c>: after failover only one
/// node runs, and a value of <c>2</c> blocks the cluster singleton (Site Runtime
/// Deployment Manager) — and therefore all data collection — indefinitely.
/// </summary>
public int MinNrOfMembers { get; set; } = 1;
/// <summary>
/// The keep-oldest resolver's <c>down-if-alone</c> flag. When <c>true</c> (the
/// design-doc requirement), the oldest node downs itself if it finds it has no
/// other reachable members, rather than running as an isolated single-node cluster.
/// NOTE(review): <see cref="ClusterOptionsValidator"/> does not check this flag —
/// confirm whether <c>false</c> should be rejected at startup.
/// </summary>
public bool DownIfAlone { get; set; } = true;
}
@@ -0,0 +1,72 @@
using Microsoft.Extensions.Options;
namespace ScadaLink.ClusterInfrastructure;
/// <summary>
/// CI-004: Validates <see cref="ClusterOptions"/> at startup. The values it
/// guards carry cluster-wide consequences — the design doc
/// (<c>Component-ClusterInfrastructure.md</c>) is emphatic that misconfiguring
/// them produces a total cluster shutdown or an indefinitely blocked singleton.
/// Registered with <c>ValidateOnStart()</c> so a bad <c>appsettings.json</c>
/// fails fast at boot rather than failing far from the cause.
/// </summary>
public sealed class ClusterOptionsValidator : IValidateOptions<ClusterOptions>
{
    /// <summary>Split-brain resolver strategies safe for ScadaLink's two-node clusters.</summary>
    private static readonly HashSet<string> AllowedStrategies = new(StringComparer.OrdinalIgnoreCase)
    {
        "keep-oldest"
    };

    /// <summary>
    /// Checks every cluster setting and reports all violations together, so one
    /// failed startup lists everything that needs fixing.
    /// </summary>
    /// <param name="name">The options instance name; not used — all instances get the same checks.</param>
    /// <param name="options">The bound cluster options to validate.</param>
    /// <returns>
    /// <see cref="ValidateOptionsResult.Success"/> when every check passes; otherwise a
    /// failed result carrying one message per violated rule.
    /// </returns>
    public ValidateOptionsResult Validate(string? name, ClusterOptions options)
    {
        var failures = new List<string>();

        if (options.SeedNodes is null || options.SeedNodes.Count == 0)
        {
            failures.Add("ClusterOptions.SeedNodes must contain at least one seed node.");
        }
        else if (options.SeedNodes.Exists(string.IsNullOrWhiteSpace))
        {
            // A list such as [""] has Count > 0 but names no usable seed node, so it
            // must not satisfy the "at least one entry" rule.
            failures.Add("ClusterOptions.SeedNodes must not contain null or blank entries.");
        }

        if (string.IsNullOrWhiteSpace(options.SplitBrainResolverStrategy)
            || !AllowedStrategies.Contains(options.SplitBrainResolverStrategy))
        {
            failures.Add(
                $"ClusterOptions.SplitBrainResolverStrategy must be 'keep-oldest' for a two-node cluster; " +
                $"'{options.SplitBrainResolverStrategy}' would risk a total cluster shutdown on a partition.");
        }

        if (options.MinNrOfMembers != 1)
        {
            failures.Add(
                $"ClusterOptions.MinNrOfMembers must be 1 (was {options.MinNrOfMembers}); " +
                "any other value blocks the cluster singleton after failover and halts all data collection.");
        }

        if (options.StableAfter <= TimeSpan.Zero)
        {
            failures.Add("ClusterOptions.StableAfter must be a positive duration.");
        }

        if (options.HeartbeatInterval <= TimeSpan.Zero)
        {
            failures.Add("ClusterOptions.HeartbeatInterval must be a positive duration.");
        }

        if (options.FailureDetectionThreshold <= TimeSpan.Zero)
        {
            failures.Add("ClusterOptions.FailureDetectionThreshold must be a positive duration.");
        }

        // Equal values are rejected too: the heartbeat must be strictly shorter than the threshold.
        if (options.HeartbeatInterval >= options.FailureDetectionThreshold)
        {
            failures.Add(
                $"ClusterOptions.HeartbeatInterval ({options.HeartbeatInterval}) must be well below " +
                $"FailureDetectionThreshold ({options.FailureDetectionThreshold}); otherwise nodes are " +
                "declared unreachable before a heartbeat can arrive.");
        }

        return failures.Count > 0
            ? ValidateOptionsResult.Fail(failures)
            : ValidateOptionsResult.Success;
    }
}
@@ -8,8 +8,8 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
<ItemGroup>
@@ -1,18 +1,47 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Options;
namespace ScadaLink.ClusterInfrastructure;
/// <summary>
/// DI registration for the Cluster Infrastructure component.
/// </summary>
public static class ServiceCollectionExtensions
{
/// <summary>
/// Registers the Cluster Infrastructure services. This component owns the
/// cluster <em>configuration contract</em> (<see cref="ClusterOptions"/>); the
/// Akka.NET bootstrap itself lives in <c>ScadaLink.Host</c>
/// (see <c>Component-ClusterInfrastructure.md</c>).
/// <para>
/// Registering the <see cref="ClusterOptionsValidator"/> means a misconfigured
/// <c>ScadaLink:Cluster</c> section (e.g. <c>MinNrOfMembers: 2</c> or a quorum
/// split-brain strategy) throws an <see cref="OptionsValidationException"/> the
/// first time <see cref="IOptions{TOptions}"/> is resolved, rather than booting
/// into a broken cluster.
/// </para>
/// </summary>
public static IServiceCollection AddClusterInfrastructure(this IServiceCollection services)
{
// Phase 0: skeleton only
services.TryAddEnumerable(
ServiceDescriptor.Singleton<IValidateOptions<ClusterOptions>, ClusterOptionsValidator>());
return services;
}
/// <summary>
/// Reserved for cluster-infrastructure actor registration. This component does
/// not register any actors — the Akka.NET bootstrap and actor wiring live in
/// <c>ScadaLink.Host</c>. The method throws rather than silently returning
/// success so that any caller assuming this component registers actors fails
/// fast with a clear cause instead of failing later, far from here.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public static IServiceCollection AddClusterInfrastructureActors(this IServiceCollection services)
{
// Phase 0: placeholder for Akka actor registration
return services;
throw new NotImplementedException(
"ScadaLink.ClusterInfrastructure registers no actors. The Akka.NET actor system " +
"bootstrap and all cluster actor registration live in ScadaLink.Host " +
"(AkkaHostedService). Do not call AddClusterInfrastructureActors().");
}
}
@@ -14,6 +14,21 @@ public class TemplateAlarm
public string? TriggerConfiguration { get; set; }
public int? OnTriggerScriptId { get; set; }
/// <summary>
/// True when this row was copied from the base template and has not been
/// overridden on the derived template. Changes to the base flow downward
/// for inherited rows; an explicit override flips this to false.
/// Always false on base (non-derived) templates.
/// </summary>
public bool IsInherited { get; set; }
/// <summary>
/// Set on a base alarm. When true, derived templates may not override the
/// alarm — the row is rendered readonly with a 🔒 in the derived UI, and
/// any attempt to update it through the API is rejected.
/// </summary>
public bool LockedInDerived { get; set; }
public TemplateAlarm(string name)
{
Name = name ?? throw new ArgumentNullException(nameof(name));
@@ -0,0 +1,13 @@
namespace ScadaLink.Commons.Messages.Deployment;
/// <summary>
/// Central→site query for the currently-applied deployment state of a single
/// instance. Issued by the Deployment Manager before a re-deploy when a prior
/// deployment record is stuck <c>InProgress</c> or <c>Failed</c> due to a
/// timeout, so the site's actual state can be reconciled against the target
/// revision before re-sending a deployment ("Deployment Identity &amp; Idempotency").
/// </summary>
/// <param name="CorrelationId">
/// Identifier for this query. <see cref="DeploymentStateQueryResponse"/> has a member of the
/// same name — presumably the site echoes it so replies can be matched (TODO confirm).
/// </param>
/// <param name="InstanceUniqueName">Unique name of the instance whose deployment state is being queried.</param>
/// <param name="Timestamp">Timestamp attached to the request — presumably its creation time (TODO confirm with the sender).</param>
public record DeploymentStateQueryRequest(
string CorrelationId,
string InstanceUniqueName,
DateTimeOffset Timestamp);
@@ -0,0 +1,15 @@
namespace ScadaLink.Commons.Messages.Deployment;
/// <summary>
/// Site→central response carrying the instance's currently-applied deployment
/// state. If <see cref="IsDeployed"/> is <c>false</c> the instance has no
/// deployed configuration at the site and <see cref="AppliedDeploymentId"/> /
/// <see cref="AppliedRevisionHash"/> are <c>null</c>.
/// </summary>
/// <param name="CorrelationId">Correlation identifier — presumably copied from the originating query (TODO confirm).</param>
/// <param name="InstanceUniqueName">Unique name of the instance this state describes.</param>
/// <param name="IsDeployed">Whether the instance has a deployed configuration at the site.</param>
/// <param name="AppliedDeploymentId">Identifier of the deployment currently applied; <c>null</c> when not deployed.</param>
/// <param name="AppliedRevisionHash">Revision hash of the applied configuration; <c>null</c> when not deployed.</param>
/// <param name="Timestamp">Timestamp attached to the response — presumably when the site produced it (TODO confirm).</param>
public record DeploymentStateQueryResponse(
string CorrelationId,
string InstanceUniqueName,
bool IsDeployed,
string? AppliedDeploymentId,
string? AppliedRevisionHash,
DateTimeOffset Timestamp);
@@ -2,21 +2,56 @@ using System.Collections.Frozen;
namespace ScadaLink.Commons.Messages.Management;
/// <summary>
/// Bidirectional name registry for management command records. The registry contains
/// exactly the non-abstract <c>*Command</c> types declared in the
/// <c>ScadaLink.Commons.Messages.Management</c> namespace; these are the commands that
/// travel over the HTTP / ClusterClient management boundary.
/// </summary>
/// <remarks>
/// <see cref="Resolve"/> and <see cref="GetCommandName"/> are symmetric:
/// <c>Resolve(GetCommandName(t))</c> returns <c>t</c> for every type
/// <see cref="GetCommandName"/> accepts. <see cref="GetCommandName"/> rejects any type
/// the registry does not contain rather than computing an unresolvable name.
/// </remarks>
public static class ManagementCommandRegistry
{
private static readonly FrozenDictionary<string, Type> Commands = BuildRegistry();
/// <summary>
/// Names keyed by command type, for the reverse lookup. Keeps
/// <see cref="GetCommandName"/> in lock-step with the forward registry.
/// </summary>
private static readonly FrozenDictionary<Type, string> NamesByType =
Commands.ToFrozenDictionary(kv => kv.Value, kv => kv.Key);
public static Type? Resolve(string commandName)
{
return Commands.GetValueOrDefault(commandName);
}
/// <summary>
/// Returns the registered wire name for a management command type.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="commandType"/> is not a registered management
/// command — i.e. not a non-abstract <c>*Command</c> type in the
/// <c>ScadaLink.Commons.Messages.Management</c> namespace. This keeps the method
/// symmetric with <see cref="Resolve"/>: it never yields a name that
/// <see cref="Resolve"/> cannot turn back into the same type.
/// </exception>
public static string GetCommandName(Type commandType)
{
var name = commandType.Name;
return name.EndsWith("Command", StringComparison.Ordinal)
? name[..^"Command".Length]
: name;
ArgumentNullException.ThrowIfNull(commandType);
if (NamesByType.TryGetValue(commandType, out var name))
return name;
throw new ArgumentException(
$"'{commandType.FullName}' is not a registered management command. " +
$"Management commands must be non-abstract '*Command' records declared in " +
$"the '{typeof(ManagementEnvelope).Namespace}' namespace.",
nameof(commandType));
}
private static FrozenDictionary<string, Type> BuildRegistry()
@@ -7,13 +7,22 @@ namespace ScadaLink.Commons.Types;
/// Wraps a JsonElement as a dynamic object for convenient property access in scripts.
/// Supports property access (obj.name), indexing (obj.items[0]), and ToString().
/// </summary>
/// <remarks>
/// The element passed to the constructor is <see cref="JsonElement.Clone()">cloned</see>
/// so the wrapper owns a self-contained copy. This decouples its lifetime from the
/// <see cref="JsonDocument"/> the element originated from: a wrapper built from an
/// element inside a <c>using</c> block remains valid for deferred (e.g. script-time)
/// access after that document has been disposed.
/// </remarks>
public class DynamicJsonElement : DynamicObject
{
private readonly JsonElement _element;
public DynamicJsonElement(JsonElement element)
{
_element = element;
// Clone detaches the element from its owning JsonDocument so accessing it
// later cannot throw ObjectDisposedException once that document is disposed.
_element = element.Clone();
}
public override bool TryGetMember(GetMemberBinder binder, out object? result)
@@ -11,5 +11,11 @@ public enum AlarmTriggerType
/// may carry its own priority; transitions between levels emit a fresh
/// AlarmStateChanged with the corresponding <see cref="AlarmLevel"/>.
/// </summary>
HiLo
HiLo,
/// <summary>
/// Read-only boolean C# expression evaluated on attribute updates. The
/// trigger fires when the expression evaluates to <c>true</c>.
/// </summary>
Expression
}
+10 -10
View File
@@ -24,7 +24,8 @@ public class ScriptParameters : IReadOnlyDictionary<string, object?>
/// Gets a parameter value with typed conversion.
/// <list type="bullet">
/// <item><c>Get&lt;int&gt;("key")</c> — throws if missing, null, or unconvertible.</item>
/// <item><c>Get&lt;int?&gt;("key")</c> — returns null if missing, null, or unconvertible.</item>
/// <item><c>Get&lt;int?&gt;("key")</c> — returns null if the parameter is missing or null;
/// throws if it is present but holds an unconvertible value.</item>
/// <item><c>Get&lt;int[]&gt;("key")</c> — converts list to typed array; throws on first bad element.</item>
/// <item><c>Get&lt;List&lt;int&gt;&gt;("key")</c> — converts list to typed List; throws on first bad element.</item>
/// </list>
@@ -71,18 +72,17 @@ public class ScriptParameters : IReadOnlyDictionary<string, object?>
private T GetNullable<T>(string key, Type underlyingType)
{
// Absent or explicitly-null parameter — the caller did not supply a value.
if (!_inner.TryGetValue(key, out var value) || value is null)
return default!; // null for Nullable<T>
try
{
var converted = ConvertScalar(value, underlyingType, key);
return (T)converted;
}
catch (ScriptParameterException)
{
return default!; // null on conversion failure for nullable
}
// A parameter that is *present but non-null* must be convertible. A value
// that cannot be converted is a caller/script bug, not "not supplied":
// throw with a descriptive message rather than silently returning null
// (which a script would misread as absent). This mirrors Get<T>() and the
// array/list element paths. See Commons-003.
var converted = ConvertScalar(value, underlyingType, key);
return (T)converted;
}
private Array ConvertToArray(string key, Type elementType)
+77 -10
View File
@@ -1,3 +1,7 @@
using System.Runtime.CompilerServices;
[assembly: InternalsVisibleTo("ScadaLink.Commons.Tests")]
namespace ScadaLink.Commons.Types;
/// <summary>
@@ -5,11 +9,29 @@ namespace ScadaLink.Commons.Types;
/// within <see cref="MaxSilence"/>, the <see cref="Stale"/> event fires.
/// Composable into any IDataConnection adapter.
/// </summary>
/// <remarks>
/// Thread-safe: <see cref="Start"/>, <see cref="OnValueReceived"/> and <see cref="Stop"/>
/// may be called from any thread and race the internal timer callback. Each call to
/// <see cref="Start"/> or <see cref="OnValueReceived"/> begins a new monitoring period
/// identified by a generation token; a timer callback only raises <see cref="Stale"/>
/// if it still belongs to the current period. A fresh value, a restart, or a
/// <see cref="Stop"/> arriving while a previous-period callback is in flight bumps the
/// generation, so that callback observes the mismatch and declines to fire — no spurious
/// staleness signal is emitted after the period it was scheduled for has ended.
/// </remarks>
public sealed class StaleTagMonitor : IDisposable
{
private readonly TimeSpan _maxSilence;
private readonly object _gate = new();
private Timer? _timer;
private volatile bool _staleFired;
/// <summary>
/// Monotonically increasing token identifying the current monitoring period.
/// Bumped on every <see cref="Start"/>, <see cref="OnValueReceived"/> and
/// <see cref="Stop"/> so that a timer callback scheduled for an earlier period
/// can detect that it is stale and decline to fire.
/// </summary>
private long _generation;
public StaleTagMonitor(TimeSpan maxSilence)
{
@@ -26,14 +48,25 @@ public sealed class StaleTagMonitor : IDisposable
public TimeSpan MaxSilence => _maxSilence;
/// <summary>
/// Test-only seam invoked by the timer callback after it has been entered but
/// before it acquires the synchronization gate. Allows a test to deterministically
/// interleave a <see cref="Stop"/> / <see cref="OnValueReceived"/> with an in-flight
/// callback to exercise the stale-fire race. Never set in production.
/// </summary>
internal Action? CallbackEnteredHook { get; set; }
/// <summary>
/// Start monitoring. The timer begins counting from now.
/// </summary>
public void Start()
{
_staleFired = false;
_timer?.Dispose();
_timer = new Timer(OnTimerElapsed, null, _maxSilence, Timeout.InfiniteTimeSpan);
lock (_gate)
{
_generation++;
_timer?.Dispose();
_timer = new Timer(OnTimerElapsed, _generation, _maxSilence, Timeout.InfiniteTimeSpan);
}
}
/// <summary>
@@ -41,8 +74,20 @@ public sealed class StaleTagMonitor : IDisposable
/// </summary>
public void OnValueReceived()
{
_staleFired = false;
_timer?.Change(_maxSilence, Timeout.InfiniteTimeSpan);
lock (_gate)
{
// No active monitoring — nothing to reset.
if (_timer is null)
return;
// Bump the generation: any timer callback for the previous period that
// has already been entered will see a generation mismatch and decline to
// raise Stale. The timer is recreated rather than re-armed with
// Change(...) so the new callback carries the new generation token.
_generation++;
_timer.Dispose();
_timer = new Timer(OnTimerElapsed, _generation, _maxSilence, Timeout.InfiniteTimeSpan);
}
}
/// <summary>
@@ -50,8 +95,14 @@ public sealed class StaleTagMonitor : IDisposable
/// </summary>
public void Stop()
{
_timer?.Dispose();
_timer = null;
lock (_gate)
{
// Bumping the generation invalidates any in-flight callback so a stopped
// monitor cannot deliver a Stale signal.
_generation++;
_timer?.Dispose();
_timer = null;
}
}
public void Dispose()
@@ -61,8 +112,24 @@ public sealed class StaleTagMonitor : IDisposable
private void OnTimerElapsed(object? state)
{
if (_staleFired) return;
_staleFired = true;
var scheduledGeneration = (long)state!;
CallbackEnteredHook?.Invoke();
// Only fire if this callback still represents the current period. The check
// and the generation bump happen under the gate, so a concurrent
// OnValueReceived / Stop / Start either completes before this guard (its
// generation bump makes this callback decline) or serializes after it.
lock (_gate)
{
if (_generation != scheduledGeneration)
return;
// Consume this period so a duplicate callback for the same generation
// cannot fire twice; the next Start/OnValueReceived issues a new token.
_generation++;
}
Stale?.Invoke();
}
}
@@ -84,6 +84,15 @@ public class CentralCommunicationActor : ReceiveActor
// Periodic refresh trigger
Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());
// Communication-006: a faulted LoadSiteAddressesFromDb task is piped here as a
// Status.Failure. Without this handler the failure was an unhandled message
// (debug-level only) and the refresh failed silently — operators could not
// distinguish "no sites configured" from "database is down". Log at Warning.
Receive<Status.Failure>(failure =>
_log.Warning(failure.Cause,
"Failed to load site addresses from the database; the site ClusterClient "
+ "cache was not refreshed and may be stale or empty"));
// Health monitoring: heartbeats and health reports from sites
Receive<HeartbeatMessage>(HandleHeartbeat);
Receive<SiteHealthReport>(HandleSiteHealthReport);
@@ -296,6 +305,25 @@ public class CentralCommunicationActor : ReceiveActor
}
}
/// <summary>
/// Supervision policy for children of CentralCommunicationActor, following the
/// CLAUDE.md rule "Resume for coordinator actors". This actor is a long-lived
/// coordinator owning the per-site ClusterClient map, so a faulting child
/// (e.g. a ClusterClient child) is resumed rather than restarted: its connection
/// state survives the fault instead of being wiped.
/// </summary>
/// <returns>
/// A one-for-one strategy (retry count -1, infinite time window) whose decider logs
/// the fault at Warning level and always answers <see cref="Directive.Resume"/>.
/// </returns>
protected override SupervisorStrategy SupervisorStrategy() =>
    new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: Decider.From(childFault =>
        {
            // Surface every child fault to operators before resuming the child.
            _log.Warning(childFault, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
            return Directive.Resume;
        }));
protected override void PreStart()
{
_log.Info("CentralCommunicationActor started");
@@ -28,7 +28,19 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
private const int MaxRetries = 3;
private const string ReconnectTimerKey = "grpc-reconnect";
private const string StabilityTimerKey = "grpc-stability";
internal static TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
/// <summary>
/// How long a freshly-opened gRPC stream must stay up before its retry budget
/// is considered "recovered" and <see cref="_retryCount"/> is reset to 0.
/// Communication-008: the retry count must NOT be reset by individual events —
/// a stream that connects, delivers one event, then fails repeatedly would
/// otherwise reconnect forever and never trip <see cref="MaxRetries"/>. Resetting
/// only after a stable interval bounds a flapping stream.
/// </summary>
internal static TimeSpan StabilityWindow { get; set; } = TimeSpan.FromSeconds(60);
private int _retryCount;
private bool _useNodeA = true;
private bool _stopped;
@@ -66,16 +78,21 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
OpenGrpcStream();
});
// Domain events arriving via Self.Tell from gRPC callback
Receive<AttributeValueChanged>(changed =>
{
_retryCount = 0; // Successful event resets retry count
_onEvent(changed);
});
Receive<AlarmStateChanged>(changed =>
// Domain events arriving via Self.Tell from gRPC callback.
// Communication-008: receiving an event must NOT reset _retryCount — a
// flapping stream that delivers a single event between failures would
// otherwise never trip MaxRetries. The retry budget is recovered only by
// GrpcStreamStable (a stream that has stayed up for StabilityWindow).
Receive<AttributeValueChanged>(changed => _onEvent(changed));
Receive<AlarmStateChanged>(changed => _onEvent(changed));
// Stream has been stably connected for StabilityWindow — recover the
// retry budget so a future transient fault gets a fresh set of retries.
Receive<GrpcStreamStable>(_ =>
{
if (_stopped) return;
_retryCount = 0;
_onEvent(changed);
_log.Debug("gRPC stream for {0} stable, retry count reset", _instanceUniqueName);
});
// gRPC stream error — attempt reconnection
@@ -151,6 +168,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
_grpcCts?.Dispose();
_grpcCts = new CancellationTokenSource();
// Arm the stability timer: if the stream stays up for StabilityWindow the
// retry budget is recovered (Communication-008). Cancelled by HandleGrpcError.
Timers.StartSingleTimer(StabilityTimerKey, new GrpcStreamStable(), StabilityWindow);
var client = _grpcFactory.GetOrCreate(_siteIdentifier, endpoint);
var self = Self;
var ct = _grpcCts.Token;
@@ -171,6 +192,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
{
if (_stopped) return;
// The stream failed before reaching the stability window — its retry
// budget is NOT recovered (Communication-008).
Timers.Cancel(StabilityTimerKey);
_retryCount++;
if (_retryCount > MaxRetries)
@@ -183,6 +208,15 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
return;
}
// Unsubscribe the failed stream on the *previous* endpoint before reconnecting.
// This cancels the local subscription CTS and -- where the channel is still
// alive -- propagates gRPC cancellation to the site so its SiteStreamGrpcServer
// stops the StreamRelayActor for this correlation ID, rather than leaving a
// zombie relay actor until TCP RST / keepalive eventually detects the loss.
var previousEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
var previousClient = _grpcFactory.GetOrCreate(_siteIdentifier, previousEndpoint);
previousClient.Unsubscribe(_correlationId);
// Flip to the other node
_useNodeA = !_useNodeA;
@@ -230,3 +264,10 @@ internal record GrpcStreamError(Exception Exception);
/// Internal message to trigger gRPC stream reconnection.
/// </summary>
internal record ReconnectGrpcStream;
/// <summary>
/// Internal message indicating the current gRPC stream has been connected long
/// enough (<see cref="DebugStreamBridgeActor.StabilityWindow"/>) to be considered
/// stable, so the reconnect retry budget can be recovered.
/// </summary>
internal record GrpcStreamStable;
@@ -76,6 +76,11 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
Receive<EnableInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
Receive<DeleteInstanceCommand>(msg => _deploymentManagerProxy.Forward(msg));
// DeploymentManager-006: query-the-site-before-redeploy — forward to
// the Deployment Manager, which owns the deployed-config store and
// answers with the instance's currently-applied deployment identity.
Receive<DeploymentStateQueryRequest>(msg => _deploymentManagerProxy.Forward(msg));
// Pattern 3: Artifact Deployment — forward to artifact handler if registered
Receive<DeployArtifactsCommand>(msg =>
{
@@ -172,6 +177,24 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
}
/// <summary>
/// Supervision policy for children of SiteCommunicationActor, following the
/// CLAUDE.md rule "Resume for coordinator actors". This actor is a long-lived
/// coordinator that routes every message pattern to local handlers, so a faulting
/// child is resumed rather than restarted: its in-memory state survives the fault
/// instead of being discarded.
/// </summary>
/// <returns>
/// A one-for-one strategy (retry count -1, infinite time window) whose decider logs
/// the fault at Warning level and always answers <see cref="Directive.Resume"/>.
/// </returns>
protected override SupervisorStrategy SupervisorStrategy() =>
    new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: Decider.From(childFault =>
        {
            // Surface every child fault to operators before resuming the child.
            _log.Warning(childFault, "Child actor of SiteCommunicationActor faulted, resuming (state preserved)");
            return Directive.Resume;
        }));
protected override void PreStart()
{
_log.Info("SiteCommunicationActor started for site {0}", _siteId);
@@ -73,6 +73,26 @@ public class CommunicationService
envelope, _options.DeploymentTimeout, cancellationToken);
}
/// <summary>
/// DeploymentManager-006: asks a site which deployment is currently applied for a
/// single instance, so the Deployment Manager can reconcile against the site's real
/// state before re-deploying. The query travels over the existing ClusterClient
/// command/control transport. There is no central buffering: when the site is
/// unreachable the Ask simply times out, and the caller is expected to fall
/// through to a normal deploy.
/// </summary>
/// <param name="siteId">Identifier of the site the query is addressed to.</param>
/// <param name="request">The state query to deliver; wrapped in a <c>SiteEnvelope</c> for routing.</param>
/// <param name="cancellationToken">Token passed to the Ask to abandon the wait early.</param>
/// <returns>The site's reply describing the instance's applied deployment state.</returns>
public async Task<DeploymentStateQueryResponse> QueryDeploymentStateAsync(
    string siteId, DeploymentStateQueryRequest request, CancellationToken cancellationToken = default)
{
    _logger.LogDebug(
        "Sending DeploymentStateQueryRequest to site {SiteId}, instance={Instance}, correlationId={CorrelationId}",
        siteId, request.InstanceUniqueName, request.CorrelationId);

    // Address the request to the target site, then ask with the query timeout.
    var siteEnvelope = new SiteEnvelope(siteId, request);
    var target = GetActor();
    var reply = await target.Ask<DeploymentStateQueryResponse>(
        siteEnvelope, _options.QueryTimeout, cancellationToken);
    return reply;
}
// ── Pattern 2: Lifecycle ──
public async Task<InstanceLifecycleResponse> DisableInstanceAsync(
@@ -127,20 +127,36 @@ public class DebugStreamService
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
timeoutCts.CancelAfter(TimeSpan.FromSeconds(30));
DebugViewSnapshot snapshot;
try
{
var snapshot = await snapshotTcs.Task.WaitAsync(timeoutCts.Token);
_logger.LogInformation("Debug stream {SessionId} started for {Instance} on site {Site}",
sessionId, instanceUniqueName, siteIdentifier);
return new DebugStreamSession(sessionId, snapshot);
snapshot = await snapshotTcs.Task.WaitAsync(timeoutCts.Token);
}
catch (OperationCanceledException)
catch (Exception ex)
{
StopStream(sessionId);
throw new TimeoutException($"Timed out waiting for debug snapshot from {instanceUniqueName} on site {siteIdentifier}.");
// Any failure before the snapshot arrives — the 30s timeout, or the stream
// terminating early (site disconnect / gRPC failure, surfaced by
// onTerminatedWrapper as an InvalidOperationException) — must deterministically
// tear down the bridge actor and its site-side subscription. Use the local
// actor reference: a racing onTerminatedWrapper may already have removed the
// session, which would make StopStream a no-op. StopDebugStream is idempotent
// (the actor may already be stopping itself).
_sessions.TryRemove(sessionId, out _);
bridgeActor.Tell(new StopDebugStream());
if (ex is OperationCanceledException)
throw new TimeoutException(
$"Timed out waiting for debug snapshot from {instanceUniqueName} on site {siteIdentifier}.");
throw new InvalidOperationException(
$"Debug stream for {instanceUniqueName} on site {siteIdentifier} terminated before a snapshot was received.",
ex);
}
_logger.LogInformation("Debug stream {SessionId} started for {Instance} on site {Site}",
sessionId, instanceUniqueName, siteIdentifier);
return new DebugStreamSession(sessionId, snapshot);
}
/// <summary>
@@ -13,21 +13,45 @@ namespace ScadaLink.Communication.Grpc;
/// SiteStreamGrpcServer. The central-side DebugStreamBridgeActor uses this
/// to open server-streaming calls for individual instances.
/// </summary>
public class SiteStreamGrpcClient : IAsyncDisposable
public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
{
private readonly GrpcChannel? _channel;
private readonly SiteStreamService.SiteStreamServiceClient? _client;
private readonly ILogger? _logger;
private readonly ConcurrentDictionary<string, CancellationTokenSource> _subscriptions = new();
/// <summary>
/// The HTTP/2 keepalive ping delay actually applied to this client's channel.
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
/// </summary>
internal TimeSpan KeepAlivePingDelay { get; }
/// <summary>
/// The HTTP/2 keepalive ping timeout actually applied to this client's channel.
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
/// </summary>
internal TimeSpan KeepAlivePingTimeout { get; }
public SiteStreamGrpcClient(string endpoint, ILogger logger)
: this(endpoint, logger, new CommunicationOptions())
{
}
/// <summary>
/// Creates a client whose HTTP/2 keepalive is taken from <see cref="CommunicationOptions"/>
/// rather than hard-coded, satisfying the design doc's "gRPC Connection Keepalive"
/// section which states these values are configurable.
/// </summary>
public SiteStreamGrpcClient(string endpoint, ILogger logger, CommunicationOptions options)
{
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay;
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout;
_channel = GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
{
HttpHandler = new SocketsHttpHandler
{
KeepAlivePingDelay = TimeSpan.FromSeconds(15),
KeepAlivePingTimeout = TimeSpan.FromSeconds(10),
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay,
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout,
KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always
}
});
@@ -57,6 +81,32 @@ public class SiteStreamGrpcClient : IAsyncDisposable
_subscriptions[correlationId] = cts;
}
/// <summary>
/// Registers a subscription's CancellationTokenSource for a correlation ID.
/// If a different CTS already owns that correlation ID (a reconnect race where two
/// <see cref="SubscribeAsync"/> calls briefly share an ID), it is atomically swapped
/// out and then cancelled and disposed so it cannot leak. Registering the instance
/// that already owns the slot is a no-op. Internal for testability.
/// </summary>
/// <param name="correlationId">Correlation ID that keys the subscription slot.</param>
/// <param name="cts">The CancellationTokenSource that owns the slot once this call returns.</param>
internal void RegisterSubscription(string correlationId, CancellationTokenSource cts)
{
    // Compare-and-swap loop: a plain "TryGetValue then indexer write" lets two
    // concurrent registrations both observe "no prior entry", after which one CTS
    // is silently overwritten without ever being cancelled or disposed.
    while (true)
    {
        if (!_subscriptions.TryGetValue(correlationId, out var prior))
        {
            if (_subscriptions.TryAdd(correlationId, cts))
                return;

            // Another caller claimed the slot in between; re-read and displace it.
            continue;
        }

        if (ReferenceEquals(prior, cts))
            return;

        // Swap only if the slot still holds the CTS we observed, so exactly the
        // displaced instance is torn down below.
        if (!_subscriptions.TryUpdate(correlationId, cts, prior))
            continue;

        try
        {
            prior.Cancel();
        }
        finally
        {
            // Dispose even when a cancellation callback throws.
            prior.Dispose();
        }

        return;
    }
}
/// <summary>
/// Drops the subscription entry for a correlation ID, but only when the slot still
/// holds exactly the supplied CTS. If a racing replacement stream has already taken
/// over the slot, nothing is removed. Internal for testability.
/// </summary>
/// <param name="correlationId">Correlation ID that keys the subscription slot.</param>
/// <param name="cts">The CancellationTokenSource the caller believes it owns.</param>
internal void RemoveSubscription(string correlationId, CancellationTokenSource cts)
{
    // Key+value removal: the dictionary removes the entry only if both match.
    var ownedEntry = new KeyValuePair<string, CancellationTokenSource>(correlationId, cts);
    _subscriptions.TryRemove(ownedEntry);
}
/// <summary>
/// Opens a server-streaming subscription for a specific instance.
/// This is a long-running async method; the caller launches it as a background task.
@@ -74,7 +124,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_subscriptions[correlationId] = cts;
RegisterSubscription(correlationId, cts);
var request = new InstanceStreamRequest
{
@@ -103,7 +153,8 @@ public class SiteStreamGrpcClient : IAsyncDisposable
}
finally
{
_subscriptions.TryRemove(correlationId, out _);
// Remove only our own entry -- a racing reconnect may already own the slot.
RemoveSubscription(correlationId, cts);
}
}
@@ -178,7 +229,13 @@ public class SiteStreamGrpcClient : IAsyncDisposable
_ => AlarmLevel.None
};
public async ValueTask DisposeAsync()
/// <summary>
/// Releases all subscription CancellationTokenSources and the underlying
/// gRPC channel. All teardown here is synchronous (CTS disposal and
/// <see cref="GrpcChannel.Dispose"/>), so a synchronous <see cref="Dispose"/>
/// can release everything without sync-over-async blocking.
/// </summary>
private void ReleaseResources()
{
foreach (var cts in _subscriptions.Values)
{
@@ -187,9 +244,22 @@ public class SiteStreamGrpcClient : IAsyncDisposable
}
_subscriptions.Clear();
if (_channel is not null)
_channel.Dispose();
_channel?.Dispose();
}
await ValueTask.CompletedTask;
/// <summary>
/// Asynchronous disposal. The teardown itself runs synchronously via
/// <c>ReleaseResources</c>, so the returned <see cref="ValueTask"/> is already complete.
/// </summary>
/// <returns>An already-completed <see cref="ValueTask"/>.</returns>
public virtual ValueTask DisposeAsync()
{
    ReleaseResources();

    // default(ValueTask) is an already-completed ValueTask.
    return default;
}
/// <summary>
/// Synchronous disposal. Every resource this client holds is released
/// synchronously, which lets callers (e.g. <see cref="SiteStreamGrpcClientFactory.Dispose"/>)
/// avoid blocking on the async disposal path.
/// </summary>
public virtual void Dispose() => ReleaseResources();
}
@@ -1,5 +1,6 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ScadaLink.Communication.Grpc;
@@ -12,22 +13,43 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
{
private readonly ConcurrentDictionary<string, SiteStreamGrpcClient> _clients = new();
private readonly ILoggerFactory _loggerFactory;
private readonly CommunicationOptions _options;
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory)
: this(loggerFactory, Options.Create(new CommunicationOptions()))
{
_loggerFactory = loggerFactory;
}
/// <summary>
/// Returns an existing client for the site or creates a new one.
/// DI constructor — flows <see cref="CommunicationOptions"/> into every created
/// <see cref="SiteStreamGrpcClient"/> so the configured gRPC keepalive settings
/// are applied rather than hard-coded defaults.
/// </summary>
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory, IOptions<CommunicationOptions> options)
{
_loggerFactory = loggerFactory;
_options = options.Value;
}
/// <summary>
/// Returns an existing client for the site or creates a new one. The new
/// client is created via <see cref="CreateClient"/> and tracked so the
/// factory's <see cref="Dispose"/> / <see cref="DisposeAsync"/> release it.
/// </summary>
public virtual SiteStreamGrpcClient GetOrCreate(string siteIdentifier, string grpcEndpoint)
{
return _clients.GetOrAdd(siteIdentifier, _ =>
{
var logger = _loggerFactory.CreateLogger<SiteStreamGrpcClient>();
return new SiteStreamGrpcClient(grpcEndpoint, logger);
});
return _clients.GetOrAdd(siteIdentifier, _ => CreateClient(grpcEndpoint));
}
/// <summary>
/// Builds one <see cref="SiteStreamGrpcClient"/> for the given endpoint, using a
/// logger from the factory's logger factory and the factory's
/// <see cref="CommunicationOptions"/>. Virtual so tests can substitute a tracking
/// client while still exercising the factory's real caching and disposal machinery.
/// </summary>
/// <param name="grpcEndpoint">Address of the site gRPC endpoint the client connects to.</param>
/// <returns>A newly constructed client.</returns>
protected virtual SiteStreamGrpcClient CreateClient(string grpcEndpoint) =>
    new SiteStreamGrpcClient(
        grpcEndpoint,
        _loggerFactory.CreateLogger<SiteStreamGrpcClient>(),
        _options);
/// <summary>
@@ -50,8 +72,19 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
_clients.Clear();
}
/// <summary>
/// Synchronous disposal. Communication-007: this used to block on
/// <c>DisposeAsync().AsTask().GetAwaiter().GetResult()</c> (sync-over-async,
/// a stall/deadlock risk during host shutdown). Each
/// <see cref="SiteStreamGrpcClient"/> releases all of its resources
/// synchronously, so we dispose them directly with no async path.
/// </summary>
public void Dispose()
{
DisposeAsync().AsTask().GetAwaiter().GetResult();
foreach (var client in _clients.Values)
{
client.Dispose();
}
_clients.Clear();
}
}
@@ -3,6 +3,7 @@ using System.Threading.Channels;
using Akka.Actor;
using Grpc.Core;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using GrpcStatus = Grpc.Core.Status;
namespace ScadaLink.Communication.Grpc;
@@ -19,17 +20,48 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
private readonly ILogger<SiteStreamGrpcServer> _logger;
private readonly ConcurrentDictionary<string, StreamEntry> _activeStreams = new();
private readonly int _maxConcurrentStreams;
private readonly TimeSpan _maxStreamLifetime;
private volatile bool _ready;
private long _actorCounter;
public SiteStreamGrpcServer(
/// <summary>
/// Test-only constructor — kept <c>internal</c> so the DI container sees a
/// single public constructor and is not faced with an ambiguous choice.
/// </summary>
/// <param name="streamSubscriber">Stream subscriber forwarded to the private constructor.</param>
/// <param name="logger">Logger forwarded to the private constructor.</param>
/// <param name="maxConcurrentStreams">Concurrent stream limit; defaults to 100.</param>
internal SiteStreamGrpcServer(
ISiteStreamSubscriber streamSubscriber,
ILogger<SiteStreamGrpcServer> logger,
int maxConcurrentStreams = 100)
// No lifetime parameter on this overload: the per-stream lifetime is fixed at 4 hours.
: this(streamSubscriber, logger, maxConcurrentStreams, TimeSpan.FromHours(4))
{
}
/// <summary>
/// DI constructor — binds <see cref="CommunicationOptions.GrpcMaxConcurrentStreams"/>
/// and <see cref="CommunicationOptions.GrpcMaxStreamLifetime"/> so the documented
/// concurrency limit and the 4-hour zombie-stream session timeout are honoured
/// rather than hard-coded.
/// </summary>
/// <param name="streamSubscriber">Stream subscriber forwarded to the private constructor.</param>
/// <param name="logger">Logger forwarded to the private constructor.</param>
/// <param name="options">
/// Options wrapper whose <c>Value</c> supplies the concurrent stream limit and the
/// per-stream lifetime.
/// </param>
public SiteStreamGrpcServer(
ISiteStreamSubscriber streamSubscriber,
ILogger<SiteStreamGrpcServer> logger,
IOptions<CommunicationOptions> options)
// Unwrap the two settings here; all field assignment happens in the private constructor.
: this(streamSubscriber, logger,
options.Value.GrpcMaxConcurrentStreams,
options.Value.GrpcMaxStreamLifetime)
{
}
// Single initialization point: the internal (test) and public (DI) constructors
// both chain here, so every field is assigned in exactly one place.
private SiteStreamGrpcServer(
    ISiteStreamSubscriber streamSubscriber,
    ILogger<SiteStreamGrpcServer> logger,
    int maxConcurrentStreams,
    TimeSpan maxStreamLifetime)
{
    (_streamSubscriber, _logger) = (streamSubscriber, logger);
    (_maxConcurrentStreams, _maxStreamLifetime) = (maxConcurrentStreams, maxStreamLifetime);
}
/// <summary>
@@ -49,6 +81,12 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
/// </summary>
public int ActiveStreamCount => _activeStreams.Count;
/// <summary>Effective max concurrent stream limit. Exposed for tests.</summary>
internal int MaxConcurrentStreams => _maxConcurrentStreams;
/// <summary>Effective per-stream session lifetime. Exposed for tests.</summary>
internal TimeSpan MaxStreamLifetime => _maxStreamLifetime;
public override async Task SubscribeInstance(
InstanceStreamRequest request,
IServerStreamWriter<SiteStreamEvent> responseStream,
@@ -69,6 +107,11 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
throw new RpcException(new GrpcStatus(StatusCode.ResourceExhausted, "Max concurrent streams reached"));
using var streamCts = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken);
// Session timeout (design doc "gRPC Connection Keepalive": 4-hour third layer
// of dead-client detection) — forces a long-lived zombie stream to terminate
// even if keepalive PINGs never detect the loss.
if (_maxStreamLifetime > TimeSpan.Zero && _maxStreamLifetime != Timeout.InfiniteTimeSpan)
streamCts.CancelAfter(_maxStreamLifetime);
var entry = new StreamEntry(streamCts);
_activeStreams[request.CorrelationId] = entry;
@@ -9,6 +9,7 @@
<ItemGroup>
<InternalsVisibleTo Include="ScadaLink.Communication.Tests" />
<InternalsVisibleTo Include="ScadaLink.IntegrationTests" />
</ItemGroup>
<ItemGroup>
@@ -16,13 +17,13 @@
</ItemGroup>
<ItemGroup>
<PackageReference Include="Akka" Version="1.5.62" />
<PackageReference Include="Akka.Remote" Version="1.5.62" />
<PackageReference Include="Akka.Cluster" Version="1.5.62" />
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.62" />
<PackageReference Include="Google.Protobuf" Version="3.29.3" />
<PackageReference Include="Grpc.Net.Client" Version="2.71.0" />
<PackageReference Include="Grpc.Tools" Version="2.71.0" PrivateAssets="All" />
<PackageReference Include="Akka" />
<PackageReference Include="Akka.Remote" />
<PackageReference Include="Akka.Cluster" />
<PackageReference Include="Akka.Cluster.Tools" />
<PackageReference Include="Google.Protobuf" />
<PackageReference Include="Grpc.Net.Client" />
<PackageReference Include="Grpc.Tools" PrivateAssets="All" />
</ItemGroup>
<ItemGroup>
@@ -22,8 +22,10 @@ public class ExternalSystemDefinitionConfiguration : IEntityTypeConfiguration<Ex
.IsRequired()
.HasMaxLength(50);
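// Stored encrypted at rest (EncryptedStringConverter). Ciphertext is longer than the
// plaintext, so the limit is raised to 8000. That exceeds SQL Server's nvarchar(4000)
// ceiling, so the EncryptSecretColumns migration creates the column as nvarchar(max).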
builder.Property(e => e.AuthConfiguration)
.HasMaxLength(4000);
.HasMaxLength(8000);
builder.HasMany<ExternalSystemMethod>()
.WithOne()
@@ -72,9 +74,11 @@ public class DatabaseConnectionDefinitionConfiguration : IEntityTypeConfiguratio
.IsRequired()
.HasMaxLength(200);
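// Stored encrypted at rest (EncryptedStringConverter). Ciphertext is longer than the
// plaintext, so the limit is raised to 8000. That exceeds SQL Server's nvarchar(4000)
// ceiling, so the EncryptSecretColumns migration creates the column as nvarchar(max).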
builder.Property(d => d.ConnectionString)
.IsRequired()
.HasMaxLength(4000);
.HasMaxLength(8000);
builder.HasIndex(d => d.Name).IsUnique();
}
@@ -53,8 +53,10 @@ public class SmtpConfigurationConfiguration : IEntityTypeConfiguration<SmtpConfi
.IsRequired()
.HasMaxLength(50);
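// Stored encrypted at rest (EncryptedStringConverter). Ciphertext is longer than the
// plaintext, so the limit is raised to 8000. That exceeds SQL Server's nvarchar(4000)
// ceiling, so the EncryptSecretColumns migration creates the column as nvarchar(max).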
builder.Property(s => s.Credentials)
.HasMaxLength(4000);
.HasMaxLength(8000);
builder.Property(s => s.TlsMode)
.HasMaxLength(50);
@@ -6,20 +6,50 @@ namespace ScadaLink.ConfigurationDatabase;
/// <summary>
/// Factory for creating DbContext instances at design time (used by dotnet ef tooling).
/// Reads connection string from Host's appsettings.Central.json.
/// Resolves the connection string from the Host's appsettings files, or — for environments
/// where those files are not present — from the
/// <c>SCADALINK_DESIGNTIME_CONNECTIONSTRING</c> environment variable.
/// </summary>
/// <remarks>
/// There is deliberately no hardcoded fallback connection string. A credential literal in
/// source is committed to version control, encourages copy-paste of <c>sa</c> /
/// <c>TrustServerCertificate=True</c> into real environments, and can silently point
/// <c>dotnet ef</c> tooling at an unintended database. If no connection string can be
/// resolved, this factory fails loudly with an actionable message.
/// </remarks>
public class DesignTimeDbContextFactory : IDesignTimeDbContextFactory<ScadaLinkDbContext>
{
private const string EnvironmentVariableName = "SCADALINK_DESIGNTIME_CONNECTIONSTRING";
private const string ConfigurationKey = "ScadaLink:Database:ConfigurationDb";
public ScadaLinkDbContext CreateDbContext(string[] args)
{
var configuration = new ConfigurationBuilder()
.SetBasePath(Path.Combine(Directory.GetCurrentDirectory(), "..", "ScadaLink.Host"))
.AddJsonFile("appsettings.json", optional: true)
.AddJsonFile("appsettings.Central.json", optional: true)
.Build();
var configurationBuilder = new ConfigurationBuilder();
var connectionString = configuration["ScadaLink:Database:ConfigurationDb"]
?? "Server=localhost,1433;Database=ScadaLink_Config;User Id=sa;Password=YourPassword;TrustServerCertificate=True";
// The Host's appsettings files are an optional source — only wire them up when the
// Host directory actually exists, otherwise SetBasePath throws DirectoryNotFoundException
// (e.g. when this factory is exercised from a test runner with no sibling Host folder).
var hostDirectory = Path.Combine(Directory.GetCurrentDirectory(), "..", "ScadaLink.Host");
if (Directory.Exists(hostDirectory))
{
configurationBuilder
.SetBasePath(hostDirectory)
.AddJsonFile("appsettings.json", optional: true)
.AddJsonFile("appsettings.Central.json", optional: true);
}
var configuration = configurationBuilder.Build();
var connectionString = configuration[ConfigurationKey]
?? Environment.GetEnvironmentVariable(EnvironmentVariableName);
if (string.IsNullOrWhiteSpace(connectionString))
{
throw new InvalidOperationException(
"No design-time database connection string was found. Set the configuration " +
$"key '{ConfigurationKey}' in the Host's appsettings file, or set the " +
$"'{EnvironmentVariableName}' environment variable, before running dotnet ef tooling.");
}
var optionsBuilder = new DbContextOptionsBuilder<ScadaLinkDbContext>();
optionsBuilder.UseSqlServer(connectionString);
@@ -0,0 +1,49 @@
using Microsoft.AspNetCore.DataProtection;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
namespace ScadaLink.ConfigurationDatabase;
/// <summary>
/// EF Core value converter that keeps a string column encrypted at rest with ASP.NET
/// Data Protection. Values are protected on their way to the database and unprotected
/// on their way back, so secret-bearing columns (SMTP credentials, external-system auth
/// config, database connection strings) are never stored verbatim.
/// </summary>
/// <remarks>
/// Every encrypted column uses the single purpose string <see cref="ProtectorPurpose"/>.
/// Data Protection keys live in the configuration database itself
/// (<see cref="ScadaLinkDbContext"/> implements <c>IDataProtectionKeyContext</c>), so
/// nodes sharing that database share one key ring.
/// </remarks>
public sealed class EncryptedStringConverter : ValueConverter<string?, string?>
{
    /// <summary>The Data Protection purpose string shared by all encrypted configuration columns.</summary>
    public const string ProtectorPurpose = "ScadaLink.ConfigurationDatabase.EncryptedColumn";

    /// <summary>
    /// Builds the converter around <paramref name="protector"/>. The two lambdas are
    /// expression trees, so they only forward to static helpers; the null handling and
    /// the try/catch live in those helpers where ordinary statements are allowed.
    /// </summary>
    /// <param name="protector">Protector used for both directions of the conversion.</param>
    public EncryptedStringConverter(IDataProtector protector)
        : base(
            plaintext => EncryptOrNull(protector, plaintext),
            ciphertext => DecryptOrNull(protector, ciphertext))
    {
    }

    /// <summary>Model-to-store direction: null stays null, anything else is protected.</summary>
    private static string? EncryptOrNull(IDataProtector protector, string? plaintext)
    {
        if (plaintext is null)
        {
            return null;
        }

        return protector.Protect(plaintext);
    }

    /// <summary>
    /// Store-to-model direction: null stays null, anything else is unprotected.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The stored value could not be unprotected; the original
    /// <see cref="System.Security.Cryptography.CryptographicException"/> is the inner exception.
    /// </exception>
    private static string? DecryptOrNull(IDataProtector protector, string? ciphertext)
    {
        if (ciphertext is null)
        {
            return null;
        }

        try
        {
            return protector.Unprotect(ciphertext);
        }
        catch (System.Security.Cryptography.CryptographicException ex)
        {
            // Raw text that was never protected (rows written before encryption was
            // introduced, hand-inserted fixtures) is not a valid payload. Translate the
            // bare crypto failure into a message that points at the likely causes.
            throw new InvalidOperationException(
                "Failed to decrypt an encrypted configuration column. The Data Protection key " +
                "ring may be unavailable, or the stored value was not written by this system.",
                ex);
        }
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,40 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace ScadaLink.ConfigurationDatabase.Migrations
{
    /// <summary>
    /// Adds the <c>IsInherited</c> and <c>LockedInDerived</c> flag columns to the
    /// <c>TemplateAlarms</c> table. Both are non-nullable <c>bit</c> columns defaulting
    /// to <c>false</c>.
    /// </summary>
    public partial class AddDerivedAlarmFields : Migration
    {
        private const string AlarmTable = "TemplateAlarms";

        // Listed in the order the columns are added (and later dropped).
        private static readonly string[] FlagColumns = { "IsInherited", "LockedInDerived" };

        /// <inheritdoc />
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            foreach (var column in FlagColumns)
            {
                migrationBuilder.AddColumn<bool>(
                    name: column,
                    table: AlarmTable,
                    type: "bit",
                    nullable: false,
                    defaultValue: false);
            }
        }

        /// <inheritdoc />
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            foreach (var column in FlagColumns)
            {
                migrationBuilder.DropColumn(
                    name: column,
                    table: AlarmTable);
            }
        }
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,82 @@
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace ScadaLink.ConfigurationDatabase.Migrations
{
    /// <summary>
    /// Widens the three secret-bearing columns from <c>nvarchar(4000)</c> to
    /// <c>nvarchar(max)</c> so they can hold ciphertext. This migration changes column
    /// types only; it does not rewrite the values already stored in existing rows.
    /// </summary>
    public partial class EncryptSecretColumns : Migration
    {
        private const string NarrowType = "nvarchar(4000)";
        private const int NarrowMaxLength = 4000;
        private const string WideType = "nvarchar(max)";
        private const int WideMaxLength = 8000;

        // Processed in this order by both Up and Down. Nullability is unchanged by the
        // migration, so the same flag is used for the new and the old column definition.
        private static readonly (string Table, string Column, bool Nullable)[] SecretColumns =
        {
            ("SmtpConfigurations", "Credentials", true),
            ("ExternalSystemDefinitions", "AuthConfiguration", true),
            ("DatabaseConnectionDefinitions", "ConnectionString", false),
        };

        /// <inheritdoc />
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            foreach (var (table, column, nullable) in SecretColumns)
            {
                migrationBuilder.AlterColumn<string>(
                    name: column,
                    table: table,
                    type: WideType,
                    maxLength: WideMaxLength,
                    nullable: nullable,
                    oldClrType: typeof(string),
                    oldType: NarrowType,
                    oldMaxLength: NarrowMaxLength,
                    oldNullable: nullable);
            }
        }

        /// <inheritdoc />
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            // NOTE(review): narrowing back to nvarchar(4000) presumably fails if any stored
            // value is longer than 4000 characters — confirm before relying on this rollback.
            foreach (var (table, column, nullable) in SecretColumns)
            {
                migrationBuilder.AlterColumn<string>(
                    name: column,
                    table: table,
                    type: NarrowType,
                    maxLength: NarrowMaxLength,
                    nullable: nullable,
                    oldClrType: typeof(string),
                    oldType: WideType,
                    oldMaxLength: WideMaxLength,
                    oldNullable: nullable);
            }
        }
    }
}
@@ -232,8 +232,8 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
b.Property<string>("ConnectionString")
.IsRequired()
.HasMaxLength(4000)
.HasColumnType("nvarchar(4000)");
.HasMaxLength(8000)
.HasColumnType("nvarchar(max)");
b.Property<int>("MaxRetries")
.HasColumnType("int");
@@ -263,8 +263,8 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
SqlServerPropertyBuilderExtensions.UseIdentityColumn(b.Property<int>("Id"));
b.Property<string>("AuthConfiguration")
.HasMaxLength(4000)
.HasColumnType("nvarchar(4000)");
.HasMaxLength(8000)
.HasColumnType("nvarchar(max)");
b.Property<string>("AuthType")
.IsRequired()
@@ -632,8 +632,8 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
.HasColumnType("int");
b.Property<string>("Credentials")
.HasMaxLength(4000)
.HasColumnType("nvarchar(4000)");
.HasMaxLength(8000)
.HasColumnType("nvarchar(max)");
b.Property<string>("FromAddress")
.IsRequired()
@@ -917,9 +917,15 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.Property<bool>("IsInherited")
.HasColumnType("bit");
b.Property<bool>("IsLocked")
.HasColumnType("bit");
b.Property<bool>("LockedInDerived")
.HasColumnType("bit");
b.Property<string>("Name")
.IsRequired()
.HasMaxLength(200)
@@ -27,17 +27,15 @@ public class TemplateEngineRepository : ITemplateEngineRepository
.FirstOrDefaultAsync(t => t.Id == id, cancellationToken);
}
/// <summary>
/// Loads a template together with its child members — Attributes, Alarms,
/// Scripts and Compositions — eager-loaded so callers get the full template
/// aggregate in a single round-trip. "Children" here refers to the template's
/// member collections, not derived/sub templates.
/// </summary>
public async Task<Template?> GetTemplateWithChildrenAsync(int id, CancellationToken cancellationToken = default)
{
var template = await GetTemplateByIdAsync(id, cancellationToken);
if (template == null) return null;
// Load all templates that have this template as parent
var children = await _context.Templates
.Where(t => t.ParentTemplateId == id)
.ToListAsync(cancellationToken);
return template;
return await GetTemplateByIdAsync(id, cancellationToken);
}
public async Task<IReadOnlyList<Template>> GetAllTemplatesAsync(CancellationToken cancellationToken = default)
@@ -8,16 +8,17 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.EntityFrameworkCore" Version="10.0.7" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="10.0.7" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.7">
<PackageReference Include="Microsoft.EntityFrameworkCore" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="10.0.7" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.7" />
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.7" />
<PackageReference Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" Version="10.0.7" />
<PackageReference Include="Microsoft.Extensions.Configuration.Json" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
<PackageReference Include="Microsoft.AspNetCore.DataProtection" />
<PackageReference Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" />
</ItemGroup>
<ItemGroup>
@@ -1,3 +1,4 @@
using Microsoft.AspNetCore.DataProtection;
using Microsoft.AspNetCore.DataProtection.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore;
using ScadaLink.Commons.Entities.Audit;
@@ -15,10 +16,24 @@ namespace ScadaLink.ConfigurationDatabase;
public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
{
private readonly IDataProtectionProvider? _dataProtectionProvider;
public ScadaLinkDbContext(DbContextOptions<ScadaLinkDbContext> options) : base(options)
{
}
/// <summary>
/// Creates a context that encrypts secret-bearing configuration columns at rest using
/// the supplied Data Protection provider. This is the overload intended for runtime
/// use; design-time tooling goes through the single-argument overload instead.
/// </summary>
/// <param name="options">EF Core options forwarded to the base <c>DbContext</c>.</param>
/// <param name="dataProtectionProvider">Provider kept for building the column protector.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="dataProtectionProvider"/> is <c>null</c>.
/// </exception>
public ScadaLinkDbContext(DbContextOptions<ScadaLinkDbContext> options, IDataProtectionProvider dataProtectionProvider)
    : base(options)
{
    ArgumentNullException.ThrowIfNull(dataProtectionProvider);
    _dataProtectionProvider = dataProtectionProvider;
}
// Templates
public DbSet<Template> Templates => Set<Template>();
public DbSet<TemplateAttribute> TemplateAttributes => Set<TemplateAttribute>();
@@ -73,5 +88,38 @@ public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
protected override void OnModelCreating(ModelBuilder modelBuilder)
{
modelBuilder.ApplyConfigurationsFromAssembly(typeof(ScadaLinkDbContext).Assembly);
ApplySecretColumnEncryption(modelBuilder);
}
/// <summary>
/// Attaches the encrypting value converter to the three columns that carry
/// authentication secrets: <c>SmtpConfiguration.Credentials</c>,
/// <c>ExternalSystemDefinition.AuthConfiguration</c> and
/// <c>DatabaseConnectionDefinition.ConnectionString</c>.
/// </summary>
/// <remarks>
/// If the context was created without a Data Protection provider (the single-argument
/// constructor, intended for design-time <c>dotnet ef</c> tooling), an
/// <see cref="EphemeralDataProtectionProvider"/> stands in so model building still succeeds.
/// NOTE(review): EF Core normally builds the model once and reuses it, so the protector
/// chosen here is presumably shared by later context instances — confirm no runtime path
/// constructs the context through the single-argument constructor first.
/// </remarks>
private void ApplySecretColumnEncryption(ModelBuilder modelBuilder)
{
    var protector = (_dataProtectionProvider ?? new EphemeralDataProtectionProvider())
        .CreateProtector(EncryptedStringConverter.ProtectorPurpose);
    var secretConverter = new EncryptedStringConverter(protector);

    var smtpCredentials = modelBuilder.Entity<SmtpConfiguration>()
        .Property(s => s.Credentials);
    smtpCredentials.HasConversion(secretConverter);

    var externalAuth = modelBuilder.Entity<ExternalSystemDefinition>()
        .Property(e => e.AuthConfiguration);
    externalAuth.HasConversion(secretConverter);

    // ConnectionString is a required (non-nullable) string, while the converter is typed
    // for string?. Passing it as the non-generic base type selects the untyped
    // HasConversion overload for this column.
    Microsoft.EntityFrameworkCore.Storage.ValueConversion.ValueConverter untypedConverter = secretConverter;
    var connectionString = modelBuilder.Entity<DatabaseConnectionDefinition>()
        .Property(d => d.ConnectionString);
    connectionString.HasConversion(untypedConverter);
}
}
@@ -15,10 +15,28 @@ public static class ServiceCollectionExtensions
/// </summary>
public static IServiceCollection AddConfigurationDatabase(this IServiceCollection services, string connectionString)
{
services.AddDbContext<ScadaLinkDbContext>(options =>
// The DbContext is constructed via the (options, IDataProtectionProvider) overload so
// secret-bearing configuration columns are encrypted at rest. AddDataProtection below
// registers IDataProtectionProvider as a singleton; resolving it here does not recurse
// because key-ring loading is lazy (first Protect/Unprotect), not triggered by
// CreateProtector during model building.
services.AddDbContext<ScadaLinkDbContext>((serviceProvider, options) =>
{
options.UseSqlServer(connectionString)
.ConfigureWarnings(w => w.Ignore(
Microsoft.EntityFrameworkCore.Diagnostics.RelationalEventId.PendingModelChangesWarning)));
Microsoft.EntityFrameworkCore.Diagnostics.RelationalEventId.PendingModelChangesWarning));
});
// AddDbContext registers ScadaLinkDbContext via EF's activator, which only injects
// DbContextOptions. Override that registration (last registration wins for resolution)
// with a factory that also supplies the IDataProtectionProvider, so the encrypting
// value converter for secret columns is always wired up at runtime.
services.AddScoped(serviceProvider =>
{
var options = serviceProvider.GetRequiredService<DbContextOptions<ScadaLinkDbContext>>();
var protectionProvider = serviceProvider.GetRequiredService<IDataProtectionProvider>();
return new ScadaLinkDbContext(options, protectionProvider);
});
services.AddScoped<ISecurityRepository, SecurityRepository>();
services.AddScoped<ICentralUiRepository, CentralUiRepository>();
@@ -38,13 +56,27 @@ public static class ServiceCollectionExtensions
}
/// <summary>
/// Registers the ScadaLinkDbContext with no connection string (for backward compatibility / Phase 0 stubs).
/// This overload is a no-op placeholder; callers should migrate to the overload that accepts a connection string.
/// Obsolete parameterless overload. This previously registered nothing, which meant a
/// central node wired up with it failed late and opaquely — the first repository
/// resolution threw a DI exception far from the actual misconfiguration. Use
/// <see cref="AddConfigurationDatabase(IServiceCollection, string)"/> and pass the
/// configured connection string.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Always thrown. The connection string is required; there is no valid no-op registration.
/// </exception>
[Obsolete(
"AddConfigurationDatabase() with no connection string registers nothing and is not a " +
"valid configuration. Call AddConfigurationDatabase(connectionString) instead.",
error: true)]
public static IServiceCollection AddConfigurationDatabase(this IServiceCollection services)
{
// Retained for backward compatibility during migration.
// Site nodes do not use the configuration database, so this is intentionally a no-op.
return services;
// Defence-in-depth: even if a caller suppresses the compile-time obsolete error,
// fail fast at wire-up time rather than silently registering nothing and surfacing
// an opaque DI resolution failure much later.
throw new InvalidOperationException(
"AddConfigurationDatabase() requires a connection string. Call " +
"AddConfigurationDatabase(connectionString) with the configured " +
"'ScadaLink:Database:ConfigurationDb' value.");
}
}
@@ -8,6 +8,19 @@ public class AuditService : IAuditService
{
private readonly ScadaLinkDbContext _context;
/// <summary>
/// Serializer options for audit <c>afterState</c> payloads. Audit writes commit in the
/// same transaction as the change they record, so a serialization exception here would
/// roll back the entire business operation. Reference cycles (common when an EF entity
/// with loaded navigations is passed in) are ignored rather than thrown, and depth is
/// bounded so a pathological graph cannot produce an unbounded payload.
/// </summary>
private static readonly JsonSerializerOptions AuditSerializerOptions = new()
{
ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles,
MaxDepth = 32
};
public AuditService(ScadaLinkDbContext context)
{
_context = context ?? throw new ArgumentNullException(nameof(context));
@@ -26,7 +39,7 @@ public class AuditService : IAuditService
{
Timestamp = DateTimeOffset.UtcNow,
AfterStateJson = afterState != null
? JsonSerializer.Serialize(afterState)
? SerializeAfterState(afterState)
: null
};
@@ -34,4 +47,27 @@ public class AuditService : IAuditService
// to ensure atomicity with the entity change.
await _context.AuditLogEntries.AddAsync(entry, cancellationToken);
}
/// <summary>
/// Serializes the caller-supplied after-state to JSON, tolerating arbitrary object shapes.
/// Reference cycles are ignored via <see cref="AuditSerializerOptions"/>. If serialization
/// still fails for any reason, the audit entry is preserved with a diagnostic placeholder
/// rather than throwing — a serialization failure must never roll back the business
/// operation the audit entry is recording.
/// </summary>
/// <param name="afterState">The non-null state object to record.</param>
/// <returns>
/// The JSON for <paramref name="afterState"/>, or a JSON object with
/// <c>AuditSerializationError</c> and <c>StateType</c> properties when it cannot be serialized.
/// </returns>
private static string SerializeAfterState(object afterState)
{
    try
    {
        return JsonSerializer.Serialize(afterState, AuditSerializerOptions);
    }
    catch (Exception ex) when (ex is not OutOfMemoryException)
    {
        // The serializer is not the only source of failures: an exception thrown by a
        // property getter on the supplied object, or an InvalidOperationException raised
        // for unusable type metadata, would otherwise escape and abort the enclosing
        // operation. Only out-of-memory is left to propagate.
        return JsonSerializer.Serialize(new
        {
            AuditSerializationError = ex.Message,
            StateType = afterState.GetType().FullName
        });
    }
}
}
@@ -55,6 +55,13 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
/// </summary>
private readonly HashSet<string> _unresolvedTags = new();
/// <summary>
/// DataConnectionLayer-010: tags whose retry SubscribeAsync is currently in flight.
/// They are excluded from the next retry tick so a slow attempt is not duplicated
/// (which would leak monitored items / subscription ids).
/// </summary>
private readonly HashSet<string> _resolutionInFlight = new();
/// <summary>
/// Subscribers: instanceUniqueName → IActorRef (the Instance Actor).
/// </summary>
@@ -80,6 +87,15 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
private int _consecutiveUnstableDisconnects;
private DateTimeOffset _lastConnectedAt;
/// <summary>
/// DataConnectionLayer-011: monotonically increasing tag that identifies the
/// current adapter instance. Subscription callbacks capture the generation in
/// effect when they were created; a <see cref="TagValueReceived"/> whose
/// generation no longer matches comes from a disposed adapter and is dropped so
/// stale pre-failover device data is never forwarded to Instance Actors.
/// </summary>
private int _adapterGeneration;
/// <summary>
/// Captured Self reference for use from non-actor threads (event handlers, callbacks).
/// Akka.NET's Self property is only valid inside the actor's message loop.
@@ -171,6 +187,11 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
case UnsubscribeTagsRequest:
Stash.Stash();
break;
case SubscribeCompleted sc:
// A subscribe started while Connected can complete after a transition;
// apply it so its state survives into the next ReSubscribeAll.
HandleSubscribeCompleted(sc);
break;
case GetHealthReport:
ReplyWithHealthReport();
break;
@@ -182,12 +203,6 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
// ── Connected State ──
/// <summary>
/// Minimum time connected before we consider the connection stable.
/// If we disconnect before this, it counts as an unstable connection toward failover.
/// </summary>
private static readonly TimeSpan StableConnectionThreshold = TimeSpan.FromSeconds(60);
private void BecomeConnected()
{
_log.Info("[{0}] Entering Connected state", _connectionName);
@@ -207,6 +222,15 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
case SubscribeTagsRequest req:
HandleSubscribe(req);
break;
case SubscribeCompleted sc:
// In Connected state, a connection-level subscribe failure must drive
// the reconnection state machine (DataConnectionLayer-004).
if (HandleSubscribeCompleted(sc))
{
_log.Warning("[{0}] Connection-level subscribe failure — entering Reconnecting", _connectionName);
BecomeReconnecting();
}
break;
case UnsubscribeTagsRequest req:
HandleUnsubscribe(req);
break;
@@ -249,7 +273,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
// If we were connected for less than the stability threshold, this counts
// as an unstable cycle (e.g., connect succeeded but heartbeat went stale).
var connectionDuration = DateTimeOffset.UtcNow - _lastConnectedAt;
if (_lastConnectedAt != default && connectionDuration < StableConnectionThreshold)
if (_lastConnectedAt != default && connectionDuration < _options.StableConnectionThreshold)
{
_consecutiveUnstableDisconnects++;
_log.Warning("[{0}] Unstable connection (lasted {1:F0}s) — consecutive unstable disconnects: {2}/{3}",
@@ -284,6 +308,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_connectionDetails = newConfig;
_adapter.Disconnected += OnAdapterDisconnected;
// DataConnectionLayer-011: new adapter — bump the generation so callbacks
// from the disposed adapter are recognised as stale and dropped.
_adapterGeneration++;
_log.Warning("[{0}] Failing over from {1} to {2} (unstable connection pattern)",
_connectionName, previousEndpoint, _activeEndpoint);
@@ -292,7 +320,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_ = _siteEventLogger.LogEventAsync(
"connection", "Warning", null, _connectionName,
$"Failover from {previousEndpoint} to {_activeEndpoint} (unstable connection)",
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {StableConnectionThreshold.TotalSeconds:F0}s");
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {_options.StableConnectionThreshold.TotalSeconds:F0}s");
}
}
@@ -338,6 +366,11 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
case TagResolutionFailed:
// Ignore — stale results from previous connection; ReSubscribeAll runs after reconnect
break;
case SubscribeCompleted sc:
// A subscribe started while Connected can complete after a transition;
// apply it so its state survives into the next ReSubscribeAll.
HandleSubscribeCompleted(sc);
break;
case GetHealthReport:
ReplyWithHealthReport();
break;
@@ -424,6 +457,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
// Wire disconnect handler on new adapter
_adapter.Disconnected += OnAdapterDisconnected;
// DataConnectionLayer-011: new adapter — bump the generation so callbacks
// from the disposed adapter are recognised as stale and dropped.
_adapterGeneration++;
_log.Warning("[{0}] Failing over from {1} to {2}",
_connectionName, previousEndpoint, _activeEndpoint);
@@ -466,18 +503,30 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
if (!_subscriptionsByInstance.ContainsKey(request.InstanceUniqueName))
_subscriptionsByInstance[request.InstanceUniqueName] = new HashSet<string>();
var instanceTags = _subscriptionsByInstance[request.InstanceUniqueName];
var self = Self;
var sender = Sender;
// DataConnectionLayer-011: capture the current adapter generation so callbacks
// from this adapter can be distinguished from a later (post-failover) adapter.
var generation = _adapterGeneration;
// Snapshot the already-subscribed tag set on the actor thread. The background
// task below must NOT read or mutate actor state — it performs only adapter
// I/O and reports results back via a SubscribeCompleted message, which is
// applied to actor state on the actor thread (see HandleSubscribeCompleted).
var alreadySubscribed = new HashSet<string>(_subscriptionIds.Keys);
Task.Run(async () =>
{
var results = new List<SubscribeTagResult>(request.TagPaths.Count);
var tagsToSeed = new List<string>();
foreach (var tagPath in request.TagPaths)
{
if (_subscriptionIds.ContainsKey(tagPath))
if (alreadySubscribed.Contains(tagPath))
{
// Already subscribed — just track for this instance
instanceTags.Add(tagPath);
// Already subscribed by another instance — just track for this one.
results.Add(new SubscribeTagResult(tagPath, AlreadySubscribed: true, Success: true, null, null));
tagsToSeed.Add(tagPath);
continue;
}
@@ -485,35 +534,35 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
{
var subId = await _adapter.SubscribeAsync(tagPath, (path, value) =>
{
self.Tell(new TagValueReceived(path, value));
self.Tell(new TagValueReceived(path, value, generation));
});
_subscriptionIds[tagPath] = subId;
instanceTags.Add(tagPath);
_totalSubscribed++;
_resolvedTags++;
results.Add(new SubscribeTagResult(tagPath, AlreadySubscribed: false, Success: true, subId, null));
tagsToSeed.Add(tagPath);
}
catch (Exception ex)
{
// WP-12: Tag path resolution failure — mark as unresolved, retry later
_unresolvedTags.Add(tagPath);
instanceTags.Add(tagPath);
_totalSubscribed++;
self.Tell(new TagResolutionFailed(tagPath, ex.Message));
// DataConnectionLayer-004: distinguish a connection-level fault
// (adapter not connected / transport down) from a genuine
// node-not-found. Connection-level faults must drive the
// reconnection state machine, not be retried as unresolved tags.
var connectionLevel = IsConnectionLevelFailure(ex);
results.Add(new SubscribeTagResult(
tagPath, AlreadySubscribed: false, Success: false, null, ex.Message,
ConnectionLevelFailure: connectionLevel));
}
}
// Initial read — seed current values for all resolved tags so the Instance Actor
// doesn't stay Uncertain until the next OPC UA data change notification
foreach (var tagPath in instanceTags)
// Initial read — seed current values for resolved tags so the Instance Actor
// doesn't stay Uncertain until the next OPC UA data change notification.
// Tell is thread-safe, so seeded values are delivered directly as messages.
foreach (var tagPath in tagsToSeed)
{
if (_unresolvedTags.Contains(tagPath)) continue;
try
{
var readResult = await _adapter.ReadAsync(tagPath);
if (readResult.Success && readResult.Value != null)
{
self.Tell(new TagValueReceived(tagPath, readResult.Value));
self.Tell(new TagValueReceived(tagPath, readResult.Value, generation));
}
}
catch
@@ -522,11 +571,77 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
}
}
return new SubscribeTagsResponse(
request.CorrelationId, request.InstanceUniqueName, true, null, DateTimeOffset.UtcNow);
}).PipeTo(sender);
return new SubscribeCompleted(request, sender, results);
}).PipeTo(self);
}
// Start tag resolution retry timer if we have unresolved tags
/// <summary>
/// Applies the result of an asynchronous subscribe on the actor thread. ALL mutation
/// of subscription state and counters happens here — never on the background task —
/// so the actor model's single-threaded state guarantee holds.
/// Returns <c>true</c> if any tag failed at connection level (DataConnectionLayer-004),
/// signalling the caller (only the Connected state) to enter Reconnecting.
/// </summary>
private bool HandleSubscribeCompleted(SubscribeCompleted msg)
{
var instanceName = msg.Request.InstanceUniqueName;
if (!_subscriptionsByInstance.TryGetValue(instanceName, out var instanceTags))
{
// The instance was unsubscribed while the subscribe I/O was in flight.
instanceTags = new HashSet<string>();
_subscriptionsByInstance[instanceName] = instanceTags;
}
// DataConnectionLayer-004: if any tag failed because the adapter is not
// connected (a connection-level fault), the subscribe needs the reconnection
// state machine, not the tag-resolution retry. Drive a disconnect and let the
// request be re-stashed/retried after reconnect via ReSubscribeAll.
var connectionLevelFailure = msg.Results.Any(r => !r.Success && r.ConnectionLevelFailure);
foreach (var result in msg.Results)
{
instanceTags.Add(result.TagPath);
// Re-check against current state: another subscribe may have resolved the
// same tag while this request's I/O was in flight.
if (result.AlreadySubscribed || _subscriptionIds.ContainsKey(result.TagPath))
continue;
if (result.Success)
{
_subscriptionIds[result.TagPath] = result.SubscriptionId!;
_totalSubscribed++;
_resolvedTags++;
}
else if (result.ConnectionLevelFailure)
{
// Connection-level fault — do not count as an unresolved tag.
// ReSubscribeAll after reconnect derives the tag from
// _subscriptionsByInstance (already updated above).
_log.Warning("[{0}] Subscribe for {1} failed at connection level: {2}",
_connectionName, result.TagPath, result.Error);
}
else
{
// WP-12: genuine tag resolution failure — mark unresolved so the
// periodic retry timer picks it up.
_unresolvedTags.Add(result.TagPath);
_totalSubscribed++;
_log.Debug("[{0}] Tag resolution failed for {1}: {2}",
_connectionName, result.TagPath, result.Error);
// DataConnectionLayer-004 / design doc Tag Path Resolution step 2:
// mark the attribute quality `bad` so the Instance Actor sees a
// signal rather than staying Uncertain indefinitely.
if (_subscribers.TryGetValue(instanceName, out var subscriber))
{
subscriber.Tell(new TagValueUpdate(
_connectionName, result.TagPath, null, QualityCode.Bad, DateTimeOffset.UtcNow));
}
}
}
// Start the tag-resolution retry timer if any tags are unresolved.
if (_unresolvedTags.Count > 0)
{
Timers.StartPeriodicTimer(
@@ -535,6 +650,30 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_options.TagResolutionRetryInterval,
_options.TagResolutionRetryInterval);
}
msg.ReplyTo.Tell(new SubscribeTagsResponse(
msg.Request.CorrelationId, instanceName, true, null, DateTimeOffset.UtcNow));
// The caller (Connected state only) decides whether to enter Reconnecting.
// In Connecting/Reconnecting the connection is not established anyway, so the
// existing reconnect cycle handles recovery without a re-trigger here.
return connectionLevelFailure;
}
/// <summary>
/// DataConnectionLayer-004: decides whether a subscribe exception represents a
/// connection-level fault (adapter not connected / transport down) rather than a
/// genuine tag-resolution failure (the node does not exist on the device).
/// Connection-level faults must drive the reconnection state machine, whereas
/// resolution failures are retried on the tag-resolution timer.
/// </summary>
/// <param name="ex">The exception raised by the subscribe attempt.</param>
/// <returns>
/// <c>true</c> when the (unwrapped) exception is an <see cref="InvalidOperationException"/>,
/// a socket exception, a <see cref="TimeoutException"/> or an I/O exception;
/// otherwise <c>false</c>.
/// </returns>
private static bool IsConnectionLevelFailure(Exception ex)
{
    // Task failures arrive wrapped in an AggregateException; classify the
    // exception that GetBaseException reports as the root cause instead.
    var rootCause = ex;
    if (ex is AggregateException aggregate)
    {
        rootCause = aggregate.GetBaseException();
    }

    return rootCause switch
    {
        InvalidOperationException => true,
        System.Net.Sockets.SocketException => true,
        TimeoutException => true,
        System.IO.IOException => true,
        _ => false,
    };
}
private void HandleUnsubscribe(UnsubscribeTagsRequest request)
@@ -558,14 +697,34 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_ = _adapter.UnsubscribeAsync(subId);
_subscriptionIds.Remove(tagPath);
_unresolvedTags.Remove(tagPath);
_resolutionInFlight.Remove(tagPath);
_totalSubscribed--;
if (!_unresolvedTags.Contains(tagPath))
_resolvedTags--;
// DataConnectionLayer-006: drop the tag's tracked quality so it is no
// longer counted by PushBadQualityForAllTags (which sets _tagsBadQuality
// from _lastTagQuality.Count). Leaving it here drifts the quality
// counters above _totalSubscribed across disconnect cycles.
if (_lastTagQuality.Remove(tagPath, out var droppedQuality))
{
switch (droppedQuality)
{
case QualityCode.Good: _tagsGoodQuality--; break;
case QualityCode.Bad: _tagsBadQuality--; break;
case QualityCode.Uncertain: _tagsUncertainQuality--; break;
}
}
}
}
_subscriptionsByInstance.Remove(request.InstanceUniqueName);
_subscribers.Remove(request.InstanceUniqueName);
// DataConnectionLayer-006: keep the reported quality counters in sync after the
// unsubscribed tags' buckets were decremented above.
_healthCollector.UpdateTagQuality(_connectionName, _tagsGoodQuality, _tagsBadQuality, _tagsUncertainQuality);
_healthCollector.UpdateTagResolution(_connectionName, _totalSubscribed, _resolvedTags);
}
// ── Write Support (WP-11) ──
@@ -575,15 +734,29 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_log.Debug("[{0}] Writing to tag {1}", _connectionName, request.TagPath);
var sender = Sender;
// DataConnectionLayer-005: bound the write with WriteTimeout. A hung device
// write (TCP black-hole) would otherwise never complete, so PipeTo never
// fires and the calling script gets no DCL-level error. The CancellationToken
// is passed to the adapter; on timeout we translate cancellation into a
// failed WriteTagResponse so the failure is returned synchronously (WP-11).
var cts = new CancellationTokenSource(_options.WriteTimeout);
// WP-11: Write through DCL to device, failure returned synchronously
_adapter.WriteAsync(request.TagPath, request.Value).ContinueWith(t =>
_adapter.WriteAsync(request.TagPath, request.Value, cts.Token).ContinueWith(t =>
{
cts.Dispose();
if (t.IsCompletedSuccessfully)
{
var result = t.Result;
return new WriteTagResponse(
request.CorrelationId, result.Success, result.ErrorMessage, DateTimeOffset.UtcNow);
}
if (t.IsCanceled || t.Exception?.GetBaseException() is OperationCanceledException)
{
return new WriteTagResponse(
request.CorrelationId, false,
$"Write timeout after {_options.WriteTimeout.TotalSeconds:F0}s", DateTimeOffset.UtcNow);
}
return new WriteTagResponse(
request.CorrelationId, false, t.Exception?.GetBaseException().Message, DateTimeOffset.UtcNow);
}).PipeTo(sender);
@@ -599,16 +772,29 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
return;
}
_log.Debug("[{0}] Retrying resolution for {1} unresolved tags", _connectionName, _unresolvedTags.Count);
var self = Self;
var toResolve = _unresolvedTags.ToList();
// DataConnectionLayer-010: only dispatch retries for tags that do not already
// have an attempt in flight. A slow SubscribeAsync overlapping the next tick
// would otherwise produce duplicate concurrent subscribes for the same tag.
var toResolve = _unresolvedTags.Where(t => !_resolutionInFlight.Contains(t)).ToList();
if (toResolve.Count == 0)
{
_log.Debug("[{0}] Tag-resolution retry skipped — {1} attempt(s) still in flight",
_connectionName, _resolutionInFlight.Count);
return;
}
_log.Debug("[{0}] Retrying resolution for {1} unresolved tags", _connectionName, toResolve.Count);
var generation = _adapterGeneration;
foreach (var tagPath in toResolve)
{
_resolutionInFlight.Add(tagPath);
_adapter.SubscribeAsync(tagPath, (path, value) =>
{
self.Tell(new TagValueReceived(path, value));
self.Tell(new TagValueReceived(path, value, generation));
}).ContinueWith(t =>
{
if (t.IsCompletedSuccessfully)
@@ -656,13 +842,25 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
var self = Self;
_subscriptionIds.Clear();
_unresolvedTags.Clear();
_resolutionInFlight.Clear();
_resolvedTags = 0;
// DataConnectionLayer-006: reset the quality tracking too. Otherwise tags
// resolved for the first time after reconnect (never in _lastTagQuality) only
// increment their bucket and the totals drift above _totalSubscribed. They are
// repopulated from fresh TagValueReceived messages once subscriptions activate.
_lastTagQuality.Clear();
_tagsGoodQuality = 0;
_tagsBadQuality = 0;
_tagsUncertainQuality = 0;
_healthCollector.UpdateTagQuality(_connectionName, _tagsGoodQuality, _tagsBadQuality, _tagsUncertainQuality);
var generation = _adapterGeneration;
foreach (var tagPath in allTags)
{
_adapter.SubscribeAsync(tagPath, (path, value) =>
{
self.Tell(new TagValueReceived(path, value));
self.Tell(new TagValueReceived(path, value, generation));
}).ContinueWith(t =>
{
if (t.IsCompletedSuccessfully)
@@ -688,6 +886,9 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
private void HandleTagResolutionSucceeded(TagResolutionSucceeded msg)
{
// DataConnectionLayer-010: the retry attempt for this tag has completed.
_resolutionInFlight.Remove(msg.TagPath);
if (_unresolvedTags.Remove(msg.TagPath))
{
_subscriptionIds[msg.TagPath] = msg.SubscriptionId;
@@ -707,6 +908,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
_log.Debug("[{0}] Tag resolution still failing for {1}: {2}",
_connectionName, msg.TagPath, msg.Error);
// DataConnectionLayer-010: the retry attempt for this tag has completed —
// it is eligible for the next retry tick again.
_resolutionInFlight.Remove(msg.TagPath);
// Track as unresolved so periodic retry picks it up
if (_unresolvedTags.Add(msg.TagPath))
{
@@ -720,6 +925,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
private void HandleTagValueReceived(TagValueReceived msg)
{
// DataConnectionLayer-011: drop values delivered by a disposed adapter. After a
// failover the old adapter's OPC UA SDK threads may still fire callbacks; those
// carry a stale generation and must not be forwarded to Instance Actors.
if (msg.AdapterGeneration != _adapterGeneration)
{
_log.Debug("[{0}] Dropping stale tag value for {1} from adapter generation {2} (current {3})",
_connectionName, msg.TagPath, msg.AdapterGeneration, _adapterGeneration);
return;
}
// Fan out to all subscribed instances
foreach (var (instanceName, tags) in _subscriptionsByInstance)
{
@@ -760,9 +975,14 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
internal record AttemptConnect;
internal record ConnectResult(bool Success, string? Error);
internal record AdapterDisconnected;
internal record TagValueReceived(string TagPath, TagValue Value);
internal record TagValueReceived(string TagPath, TagValue Value, int AdapterGeneration);
internal record TagResolutionFailed(string TagPath, string Error);
internal record TagResolutionSucceeded(string TagPath, string SubscriptionId);
internal record RetryTagResolution;
/// <summary>
/// Per-tag outcome of an asynchronous subscribe, carried inside
/// <see cref="SubscribeCompleted"/> and applied by HandleSubscribeCompleted.
/// Results flagged <c>AlreadySubscribed</c> are skipped there; a successful result
/// has its <c>SubscriptionId</c> recorded; a failure flagged
/// <c>ConnectionLevelFailure</c> is logged and not counted as unresolved; any other
/// failure marks the tag unresolved. <c>Error</c> is the text used in those log entries.
/// </summary>
internal record SubscribeTagResult(
string TagPath, bool AlreadySubscribed, bool Success, string? SubscriptionId, string? Error,
bool ConnectionLevelFailure = false);
/// <summary>
/// Message handled by HandleSubscribeCompleted to apply the results of an asynchronous
/// subscribe. <c>Request</c> is the originating subscribe request, <c>ReplyTo</c> is the
/// actor that receives the SubscribeTagsResponse, and <c>Results</c> holds one
/// <see cref="SubscribeTagResult"/> per tag that was processed.
/// </summary>
internal record SubscribeCompleted(
SubscribeTagsRequest Request, IActorRef ReplyTo, IReadOnlyList<SubscribeTagResult> Results);
public record GetHealthReport;
}
@@ -125,8 +125,20 @@ public class DataConnectionManagerActor : ReceiveActor
}
/// <summary>
/// OneForOneStrategy with Restart for connection actors — a failed connection
/// should restart and attempt reconnection.
/// OneForOneStrategy with Resume for connection actors.
///
/// DataConnectionLayer-002: a DataConnectionActor is a long-lived, stateful
/// coordinator — its in-memory subscription registry (_subscriptionsByInstance,
/// _subscriptionIds, _subscribers) is the only record of which Instance Actors
/// subscribed to which tags, and there is no durable store to rebuild it from.
/// Restart would create a fresh instance and silently discard that registry,
/// breaking the design doc's "transparent re-subscribe" guarantee (WP-10):
/// subscribers would never be re-subscribed and would sit at stale quality with
/// no error. Resume keeps the actor instance and its state intact, so a transient
/// exception in a message handler does not lose subscription state. The actor's
/// own Become/Stash reconnect state machine already recovers connection-level
/// faults, so it does not need a restart to re-establish the connection.
/// This matches the ScadaLink convention of Resume for coordinator actors.
/// </summary>
protected override SupervisorStrategy SupervisorStrategy()
{
@@ -135,8 +147,8 @@ public class DataConnectionManagerActor : ReceiveActor
withinTimeRange: TimeSpan.FromMinutes(1),
decider: Decider.From(ex =>
{
_log.Warning(ex, "DataConnectionActor threw exception, restarting");
return Directive.Restart;
_log.Warning(ex, "DataConnectionActor threw exception, resuming (subscription state preserved)");
return Directive.Resume;
}));
}
}

Some files were not shown because too many files have changed in this diff Show More