From 77cb0ad0e2f4f8daba8a80a50bfadd0fbffa3d83 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Thu, 28 May 2026 08:39:01 -0400 Subject: [PATCH] =?UTF-8?q?fix(api-surface):=20close=20Theme=209=20?= =?UTF-8?q?=E2=80=94=2027=20naming=20/=20dead-code=20/=20config=20/=20hygi?= =?UTF-8?q?ene=20findings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The largest themed batch — small mechanical fixes across 11 modules. API / message hygiene: - Comm-020: SiteAddressCacheLoaded now carries IReadOnlyDictionary / IReadOnlyList — Akka messages must be immutable. - Commons-016: BundleSession.MaxUnlockAttempts named constant replaces magic 3. - Commons-018: IOperationTrackingStore + IPartitionMaintenance moved from Interfaces/ root to Interfaces/Services/ (namespace preserved — 9 consumers exceeded the in-prompt move threshold). - Commons-023: TrackingStatusSnapshot.SourceNode now consistent with the trailing-optional-with-default pattern used elsewhere. - SR-022: AuditingDbCommand.DbConnection.set no longer uses reflection — exposes AuditingDbConnection.Inner via internal API surface. Dead code / config cleanup: - ClusterInfra-011: decorative SectionName constant deleted. - ClusterInfra-014: dead AddClusterInfrastructureActors method + its "throws-when-called" test deleted. - Host-021: Microsoft Logging:LogLevel block deleted from appsettings.json (dead under Serilog). Fail-loud over fail-silent: - DM-021: ResolveSiteIdentifierAsync throws on missing site (was silently substituting a DB id). - DM-022: dropped transient Pending write — record now lands directly in InProgress (no UI flicker, one fewer DB write). - Host-020: LoggerConfigurationFactory emits a Console.Error warning when both Serilog:MinimumLevel and ScadaLink:Logging:MinimumLevel are set (ScadaLink remains truth per Host-011). - SnF-022: NotifyCachedCallObserverAsync logs Warning on unparseable TrackedOperationId (was silently dropping). - SnF-023: empty siteId default replaced with $unknown-site sentinel + constructor normalisation. Correctness: - SCA-001: SupervisorStrategy XML rewritten to match actual DefaultDecider/Restart semantics (was claiming Resume). - SCA-003: OnUpsertAsync now restamps IngestedAtUtc on every upsert. - SR-021: HandleDeployArtifacts now dispatches an internal ApplyArtifactDataConnectionsToDcl message after the SQLite write so system-wide artifact-deploy data-connection changes go live immediately (was requiring a site restart). - SnF-020: RetryParkedMessageAsync captures the parked row BEFORE the local write so a concurrent delete can't skip standby replication. Sentinels / naming collisions: - HM-021: CentralSiteId changed from "central" to "$central" (uncollideable — leading $ is forbidden in real SiteIdentifiers). Doc / surface cleanups: - SEL-018: FailedWriteCount promoted to ISiteEventLogger; XML softened to "Available for future Health Monitoring integration". - SnF-019: VERIFY outcome — documented parking-after-DefaultMaxRetries in Component-StoreAndForward.md + DefaultMaxRetries XML (uniform cap; maxRetries:0 is the unbounded escape hatch). - SnF-021: Component-StoreAndForward.md no longer claims the tracking table lives in SnF — it's in SiteRuntime, the interface is in Commons. - CLI-020: bundle export response parse guarded with try/catch on JsonException / KeyNotFoundException / FormatException — emits a clean INVALID_RESPONSE exit instead of a stack trace. Config: - ClusterInfra-013: intent comment added to "catastrophic config" test. - Host-016: appsettings.Site.json second CentralContactPoints entry removed (was pointing at the SITE's own port); doc-key explains how to extend. - Host-018: NodeName added to both shipped per-role configs (was causing SourceNode to be null on audit rows). UI: - CentralUI-029: replaced JS.InvokeAsync("eval", …) with an ES module import (new wwwroot/js/browser-time.js). - CentralUI-032: AuditResultsGrid gains a Previous button backed by a cursor stack. 10+ new regression tests across the affected projects. Build clean; all suites green. README regenerated: 6 open (was 33). Session-to-date: 130 of 136 originally-open Theme findings closed. --- code-reviews/CLI/findings.md | 10 +- code-reviews/CentralUI/findings.md | 10 +- .../ClusterInfrastructure/findings.md | 29 ++-- code-reviews/Commons/findings.md | 14 +- code-reviews/Communication/findings.md | 6 +- code-reviews/DeploymentManager/findings.md | 10 +- code-reviews/HealthMonitoring/findings.md | 6 +- code-reviews/Host/findings.md | 26 ++-- code-reviews/README.md | 61 +++----- code-reviews/SiteCallAudit/findings.md | 14 +- code-reviews/SiteEventLogging/findings.md | 6 +- code-reviews/SiteRuntime/findings.md | 10 +- code-reviews/StoreAndForward/findings.md | 84 +++++++--- .../requirements/Component-StoreAndForward.md | 27 ++-- src/ScadaLink.CLI/Commands/BundleCommands.cs | 43 +++++- .../Components/Audit/AuditResultsGrid.razor | 19 ++- .../Audit/AuditResultsGrid.razor.cs | 41 +++++ .../Pages/Audit/ConfigurationAuditLog.razor | 14 +- .../wwwroot/js/browser-time.js | 13 ++ .../ClusterOptions.cs | 14 +- .../ServiceCollectionExtensions.cs | 29 ++-- .../{ => Services}/IOperationTrackingStore.cs | 6 + .../{ => Services}/IPartitionMaintenance.cs | 6 + .../Types/TrackingStatusSnapshot.cs | 7 +- .../Types/Transport/BundleSession.cs | 13 +- .../Actors/CentralCommunicationActor.cs | 17 ++- .../DeploymentService.cs | 39 +++-- .../CentralHealthReportLoop.cs | 16 +- .../LoggerConfigurationFactory.cs | 47 +++++- src/ScadaLink.Host/appsettings.Central.json | 4 +- src/ScadaLink.Host/appsettings.Site.json | 8 +- src/ScadaLink.Host/appsettings.json | 6 +- .../SiteCallAuditActor.cs | 37 ++++- .../ISiteEventLogger.cs | 10 ++ .../SiteEventLogger.cs | 12 +- .../Actors/DeploymentManagerActor.cs | 143 +++++++++++++++--- .../Scripts/AuditingDbCommand.cs | 13 +- .../Scripts/AuditingDbConnection.cs | 12 ++ .../ServiceCollectionExtensions.cs | 5 + .../StoreAndForwardOptions.cs | 19 ++- .../StoreAndForwardService.cs | 97 ++++++++++-- .../ClusterOptionsTests.cs | 23 ++- .../ServiceCollectionExtensionsTests.cs | 23 ++- .../DeploymentServiceTests.cs | 122 ++++++++++++++- .../DeploymentStatusNotifierTests.cs | 23 ++- .../LoggerConfigurationTests.cs | 50 ++++++ 46 files changed, 966 insertions(+), 278 deletions(-) create mode 100644 src/ScadaLink.CentralUI/wwwroot/js/browser-time.js rename src/ScadaLink.Commons/Interfaces/{ => Services}/IOperationTrackingStore.cs (92%) rename src/ScadaLink.Commons/Interfaces/{ => Services}/IPartitionMaintenance.cs (85%) diff --git a/code-reviews/CLI/findings.md b/code-reviews/CLI/findings.md index 9eaab727..a295f910 100644 --- a/code-reviews/CLI/findings.md +++ b/code-reviews/CLI/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 2 | +| Open findings | 1 | ## Summary @@ -931,9 +931,11 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.CLI/Commands/BundleCommands.cs:117-126` | +**Resolution (2026-05-28):** Wrapped the `JsonDocument.Parse` + `GetProperty` extraction in a `try/catch (JsonException or KeyNotFoundException or InvalidOperationException)` block and the `StreamBase64ToFile` call in a separate `try/catch (FormatException)`. Either failure now emits a clean `OutputFormatter.WriteError(..., "INVALID_RESPONSE")` and returns exit 1, matching the graceful-degradation pattern established by CLI-002 / CLI-003 / CLI-005. A malformed/abbreviated envelope no longer terminates the CLI with a raw stack trace. + **Description** The export success handler does: @@ -959,10 +961,6 @@ Wrap the parse + base64-decode in a `try` block that catches `JsonException`, clean `OutputFormatter.WriteError(..., "INVALID_RESPONSE")` + `return 1`. Add a regression test against a malformed-envelope stub `HttpMessageHandler`. -**Resolution** - -_Unresolved._ - ### CLI-021 — `CliConfig.Load` crashes the CLI on a malformed config file | | | diff --git a/code-reviews/CentralUI/findings.md b/code-reviews/CentralUI/findings.md index 8bc6ff95..d7a32bac 100644 --- a/code-reviews/CentralUI/findings.md +++ b/code-reviews/CentralUI/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 2 | +| Open findings | 0 | ## Summary @@ -1429,9 +1429,11 @@ still passes (568 / 568). |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.CentralUI/Components/Pages/Audit/ConfigurationAuditLog.razor:248-263` | +**Resolution (2026-05-28):** Added a small 5-line `wwwroot/js/browser-time.js` ES module exporting `getTimezoneOffsetMinutes()`, and replaced the `JS.InvokeAsync("eval", "new Date().getTimezoneOffset()")` call in `ConfigurationAuditLog.OnAfterRenderAsync` with a lazy `IJSObjectReference` import (`./_content/ScadaLink.CentralUI/js/browser-time.js`) + `module.InvokeAsync("getTimezoneOffsetMinutes")`, matching the `session-expiry.js` / `audit-grid.js` / `nav-state.js` / `transport.js` module-import pattern. The residual `eval` JS-interop surface is gone and the page is now CSP-compatible with `unsafe-eval` forbidden. + **Description** `OnAfterRenderAsync` fetches the browser's UTC offset with @@ -1533,9 +1535,11 @@ docs to call out the in-memory cost per concurrent import session. |--|--| | Severity | Low | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor:76-82`; `src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor.cs:65,196-197,219-220` | +**Resolution (2026-05-28):** Added a `Stack _cursorStack` and `AuditLogPaging? _currentPaging` field to `AuditResultsGrid.razor.cs`. `NextPage` now pushes the current cursor before advancing; a new `PrevPage` method pops the prior cursor, reloads at that position, and decrements `_pageNumber` only if the reload succeeds (a failed fetch leaves the user on the current page rather than stranding them between pages). The filter-change reset clears the stack alongside `_rows`. The razor template now renders a `btn-group` with a Previous button (gated on `CanGoBack`) alongside the existing Next button; both buttons get the standard `disabled` treatment during loads. + **Description** The Audit Log results grid (Bundle B / M7-T3) renders a single "Next page" button diff --git a/code-reviews/ClusterInfrastructure/findings.md b/code-reviews/ClusterInfrastructure/findings.md index 7e90aaa7..06a735f1 100644 --- a/code-reviews/ClusterInfrastructure/findings.md +++ b/code-reviews/ClusterInfrastructure/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 4 | +| Open findings | 1 | ## Summary @@ -687,8 +687,10 @@ confirmed failing, then passing after the fix. Module test suite green (18 passe |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | -| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:24-27`, `src/ScadaLink.Host/SiteServiceRegistration.cs:100`, `src/ScadaLink.Host/StartupValidator.cs:43`, `src/ScadaLink.Host/StartupValidator.cs:45`, `src/ScadaLink.Host/StartupValidator.cs:75` | +| Status | Resolved | +| Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:24-27`, `src/ScadaLink.Host/SiteServiceRegistration.cs:100`, `src/ScadaLink.Host/StartupValidator.cs:43`, `src/ScadaLink.Host/StartupValidator.cs:75` | + +**Resolution (2026-05-28):** Took option (b) since wiring the constant into the Host's `SiteServiceRegistration.BindSharedOptions` / `StartupValidator` is outside this module's editable surface — deleted the `SectionName` constant from `ClusterOptions.cs` and the companion `SectionName_IsTheExpectedAppSettingsSection` test from `ClusterOptionsTests.cs`. The Host's `"ScadaLink:Cluster"` literals now stand alone (consistent with the implementation rather than the broken "single source of truth" claim). A code-comment placeholder records the rationale so a future Host-side change can reinstate the constant alongside the binding-site updates. **Description** @@ -722,11 +724,6 @@ Either (a) replace the hard-coded `"ScadaLink:Cluster"` literals in claim to be the source of truth. Do not leave a public constant whose stated guarantee the code does not deliver. -**Resolution** - -_Open — needs a one-line Host-side change to reference the constant, plus a test -that proves the section name flows from this module to the Host._ - ### ClusterInfrastructure-012 — Validator accepts `SeedNodes.Count == 1` despite design requiring both nodes as seeds | | | @@ -791,9 +788,11 @@ ClusterInfrastructure.Tests). |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs:47-67` | +**Resolution (2026-05-28):** Added a 10-line inline `// ClusterInfra-013: ...` block at the top of `Properties_CanBeSetToCustomValues` explicitly recording that this test exercises the POCO property setters only — the `keep-majority` strategy and `MinNrOfMembers = 2` values are explicitly forbidden in production by `ClusterOptionsValidator`, and the comment cross-references `UnsupportedSplitBrainStrategy_FailsValidation` and `MinNrOfMembers_NotOne_FailsValidation` so a future reader cannot misread the test as endorsing those values. + **Description** `ClusterOptionsTests.Properties_CanBeSetToCustomValues` deliberately sets two values @@ -822,19 +821,17 @@ represent a valid runtime configuration, and `ClusterOptionsValidator` rejects t (with a cross-reference to the relevant validator tests). Two lines is enough; the goal is to make the test's intent self-documenting. -**Resolution** - -_Open._ - ### ClusterInfrastructure-014 — `AddClusterInfrastructureActors` is dead surface — no caller, no behaviour | | | |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs:42-48` | +**Resolution (2026-05-28):** Deleted the `AddClusterInfrastructureActors` extension method from `ServiceCollectionExtensions.cs` and its companion `AddClusterInfrastructureActors_ThrowsRatherThanSilentlySucceeding` test from `ServiceCollectionExtensionsTests.cs`. Verified no production caller existed before deletion via `grep -rn`. A code comment records the rationale (CI-001 ownership question now permanently settled; method served only to throw and was IDE-auto-complete noise). The class-level XML doc on the test file was updated to drop the stale reference to the removed test. + **Description** `AddClusterInfrastructureActors` has now reached a curious state: it is a public @@ -860,7 +857,3 @@ explicitly stating that this project exposes no actor-registration extension (actor wiring lives in `ScadaLink.Host`). If the user prefers to keep the "fail-fast" trap, mark the method `[Obsolete(true, error: true)]` so the compiler — not the runtime — rejects the call. - -**Resolution** - -_Open._ diff --git a/code-reviews/Commons/findings.md b/code-reviews/Commons/findings.md index ef9b7f98..08d359a7 100644 --- a/code-reviews/Commons/findings.md +++ b/code-reviews/Commons/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 5 | +| Open findings | 2 | ## Summary @@ -784,9 +784,11 @@ accepted values on the record. |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Commons/Types/Transport/BundleSession.cs:13-16` | +**Resolution (2026-05-28):** Added `public const int MaxUnlockAttempts = 3;` to `BundleSession` with an XML doc cross-referencing the authoritative `TransportOptions.MaxUnlockAttemptsPerSession`. The `Locked` getter now reads `FailedUnlockAttempts >= MaxUnlockAttempts` instead of comparing against the literal `3`, and the property's XML doc names the constant. No call-site update required — the existing Transport-component `TransportOptions.MaxUnlockAttemptsPerSession` (also `3`) remains the operator-facing dial; this constant is the shim's own threshold, now searchable for a security review. + **Description** `BundleSession` exposes: @@ -880,9 +882,11 @@ needed again now. |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Commons/Interfaces/IOperationTrackingStore.cs`, `src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs` | +**Resolution (2026-05-28):** Moved both files into `src/ScadaLink.Commons/Interfaces/Services/`, matching the REQ-COM-5b sub-folder convention alongside the other service interfaces (`ISiteAuditQueue`, `INodeIdentityProvider`, `ICachedCallLifecycleObserver`, etc.). The 9 consumer files across `ScadaLink.SiteRuntime`, `ScadaLink.AuditLog`, `ScadaLink.ConfigurationDatabase`, and `ScadaLink.Host` exceed the in-instructions 8-file STOP threshold for namespace rewrites, so the namespace was deliberately kept as `ScadaLink.Commons.Interfaces` (not `.Services`) — no consumer change required, build remains green. A comment in each moved file records the rationale and notes that adopting the canonical `.Services` namespace can be picked up alongside any future Commons-wide namespace tidy-up. + **Description** REQ-COM-5b documents the `Interfaces/` folder as having exactly three sub-folders: @@ -1115,9 +1119,11 @@ Two related XML-doc weaknesses, both around the new Transport / Audit surface: |--|--| | Severity | Low | | Category | Akka.NET conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Commons/Messages/Audit/SiteCallQueries.cs:53-66`, `:110-123`, `src/ScadaLink.Commons/Messages/Notification/NotificationOutboxQueries.cs:26-39`, `:104-123`, `src/ScadaLink.Commons/Types/SiteCallOperational.cs:42-54`, `src/ScadaLink.Commons/Types/TrackingStatusSnapshot.cs:33-46` | +**Resolution (2026-05-28):** Read all six locations and confirmed the dominant pattern is "trailing-optional with `= null` default" (`SiteCallSummary`, `SiteCallDetail`, `NotificationSummary`, `NotificationDetail`, `NotificationOutboxQueryRequest.SourceNodeFilter`, `SiteCallQueryRequest.SourceNodeFilter` all already use this form). The single odd-one-out was `TrackingStatusSnapshot.SourceNode`, declared as `string? SourceNode` with no default — added the `= null` default to unify it with the rest. Verified both existing callers (`OperationTrackingStore.cs` and `TrackingApiTests.cs`) use named arguments, so the change is purely additive. `SiteCallOperational.SourceNode` sits in the middle of its positional parameter list rather than the trailing slot — that's a separate positional-record concern outside the "trailing-optional" pattern the finding called out, and moving it would touch many telemetry/proto consumers, so it was deliberately not touched here. + **Description** The `SourceNode` rollout adds an optional trailing parameter to a long list of positional diff --git a/code-reviews/Communication/findings.md b/code-reviews/Communication/findings.md index 6c2125a0..c42b6a23 100644 --- a/code-reviews/Communication/findings.md +++ b/code-reviews/Communication/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 3 | +| Open findings | 2 | ## Summary @@ -1005,9 +1005,11 @@ the finding. |--|--| | Severity | Low | | Category | Akka.NET conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:567` | +**Resolution (2026-05-28):** `SiteAddressCacheLoaded`'s `SiteContacts` payload is now typed as `IReadOnlyDictionary>`, enforcing the Akka.NET message-immutability convention at the type level rather than relying on producer discipline. The producer (`LoadSiteAddressesFromDb`) builds the working buckets as before and wraps each inner `List` with `AsReadOnly()` before constructing the message — the freeze is local to the single refresh tick and the cost is negligible. The consumer (`HandleSiteAddressCacheLoaded`) only ever read via `Keys`, foreach-deconstruct, `Select`, `Count` and `ToImmutableHashSet`, all of which are supported by the new read-only types, so no consumer changes were needed. The existing `MalformedSiteAddress_DoesNotAbortRefresh_OtherSitesStillRegistered` and `ClusterClientRouting_RoutesToConfiguredSite` regression tests exercise the producer→consumer flow and continue to pass under the read-only types. + **Description** The Akka.NET convention is that messages crossing actor boundaries (even diff --git a/code-reviews/DeploymentManager/findings.md b/code-reviews/DeploymentManager/findings.md index f4f85a7c..036ff4ac 100644 --- a/code-reviews/DeploymentManager/findings.md +++ b/code-reviews/DeploymentManager/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 2 | +| Open findings | 0 | ## Summary @@ -1076,9 +1076,11 @@ as the actor. Tests green (80/80 in DeploymentManager.Tests). |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:107-111` | +**Resolution (2026-05-28):** `ResolveSiteIdentifierAsync` now throws `InvalidOperationException` (`"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing."`) when the `Site` row is missing, instead of returning the numeric id rendered as a string. The deploy path's existing try/catch turns the throw into a `DeploymentStatus.Failed` record carrying the descriptive message (the `DeploymentManager-001`/`-002` cleanup write the failure with `CancellationToken.None`); the lifecycle paths (Disable/Enable/Delete) propagate the exception so the CLI/UI caller surfaces the actual cause to the operator rather than seeing a confusing downstream "unknown site" routing error. The repository contract already returned `Site?`, so the null path is now type-visible at the call site instead of silently papered over. + **Description** ``` @@ -1114,9 +1116,11 @@ returns `Site?`, so the null path is type-visible; just don't paper over it. |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:178-194` | +**Resolution (2026-05-28):** The transient `Pending` write was dropped — the deployment record is now created directly in `DeploymentStatus.InProgress`, which collapses the start of the deploy into a single `AddDeploymentRecordAsync` + `SaveChangesAsync` + `NotifyStatusChange` (instead of two writes back-to-back). The flattening, validation, and `TryReconcileWithSiteAsync` round-trip have all completed before the insert, and the deploy command is sent immediately after, so `Pending` carried no operational meaning between the two writes. `InProgress` retains its documented "sent to site, awaiting response" semantics. Eliminating the extra `SaveChangesAsync` round-trip also removes the `Pending`→`InProgress` flicker the CentralUI-006 deployment-status page used to render via the second `IDeploymentStatusNotifier.NotifyStatusChanged` invocation. + **Description** `DeployInstanceAsync` does: diff --git a/code-reviews/HealthMonitoring/findings.md b/code-reviews/HealthMonitoring/findings.md index 4531a648..a91f5aea 100644 --- a/code-reviews/HealthMonitoring/findings.md +++ b/code-reviews/HealthMonitoring/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 3 | +| Open findings | 2 | ## Summary @@ -1034,9 +1034,11 @@ online. |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:22`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:224-226` | +**Resolution (2026-05-28):** `CentralHealthReportLoop.CentralSiteId` is now `"$central"` instead of `"central"`. The leading `$` is forbidden in operator-set `Site.SiteIdentifier` values (which are plain identifiers), so the synthetic central self-report cannot collide with a real site whose identifier happens to be the bare word `"central"`. The collision case the finding called out — two reports clobbering each other in the aggregator keyspace via the sequence-number guard and a real site inheriting `CentralOfflineTimeout` and staying falsely-online for an extra two minutes — is now impossible. The aggregator (`CentralHealthAggregator.CheckForOfflineSites`), the Central UI health dashboard (`Monitoring/Health.razor`), and every test reference the constant rather than the literal string, so the value change is local — no consumer code needed updating. Existing `CentralHealthAggregatorTests` and `CentralHealthReportLoopTests` already use the constant, so they continue to pin the central-self-report identity through the new sentinel. + **Description** `CentralHealthAggregator.CheckForOfflineSites` looks up the per-site offline diff --git a/code-reviews/Host/findings.md b/code-reviews/Host/findings.md index 3615d35e..a6edd891 100644 --- a/code-reviews/Host/findings.md +++ b/code-reviews/Host/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 4 | +| Open findings | 0 | ## Summary @@ -831,7 +831,7 @@ Full Host suite green (182 passed). |--|--| | Severity | Medium | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/appsettings.Site.json:33-37` | **Description** @@ -861,9 +861,7 @@ multi-node layout). Consider extending `StartupValidator` to reject any node's `NodeHostname`+`RemotingPort`. Add a regression test in `StartupValidatorTests` mirroring `Site_SeedNodeOnGrpcPort_FailsValidation`. -**Resolution** - -_Open._ +**Resolution (2026-05-28):** The shipped `appsettings.Site.json` `CentralContactPoints` entry that pointed at the site's own remoting port (`localhost:8082`) was removed — the dev-loopback default now lists only the single central node (`akka.tcp://scadalink@localhost:8081`), which is the actually-reachable target in the single-node dev layout. A `_centralContactPoints` doc-key comment was added immediately above the array calling out the per-entry rule (each entry MUST be a central node's remoting endpoint, not the site's own remoting port) and explaining how to extend the list with a second central node (`akka.tcp://scadalink@central-b-host:8081`) in a multi-central deployment so ClusterClient can fail over. The dangerous example pattern that would have been copied into multi-central configs no longer exists in the template. `StartupValidator` cross-check is left as a follow-up — the documented rule plus the corrected template removes the immediate misconfiguration risk. ### Host-017 — Site-shutdown ordering from REQ-HOST-7 is not wired @@ -943,7 +941,7 @@ unit suite covers both server-side invariants and the wiring is a single |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/appsettings.Central.json`, `src/ScadaLink.Host/appsettings.Site.json`, `src/ScadaLink.Host/NodeOptions.cs:10-16` | **Description** @@ -974,9 +972,7 @@ per-node in multi-node deployments. Consider validating in `StartupValidator` that `NodeName` is non-empty, or accept the null and document explicitly that single-node dev deployments leave `SourceNode` null. -**Resolution** - -_Open._ +**Resolution (2026-05-28):** The shipped per-role templates now set `ScadaLink:Node:NodeName` — `central-a` in `appsettings.Central.json` and `node-a` in `appsettings.Site.json` — so dev audit rows are stamped with a real `SourceNode` value (instead of `NodeIdentityProvider` normalising the missing key to `null`) and the indexed `IX_AuditLog_Node_Occurred` lookup actually narrows. A `_nodeName` doc-key comment was added beside each `Node` section explaining the convention (`central-a`/`central-b` for central, `node-a`/`node-b` for site), pointing at the docker per-node configs (which already overrode the field), and noting that the value must be overridden per-node in multi-node deployments and that an empty value still normalises to a `NULL` SourceNode. The shipped dev templates now match the per-node docker examples — a developer running the binary directly no longer sees a null `SourceNode`. ### Host-019 — Migration `StartupRetry` call drops the host `CancellationToken` @@ -1021,7 +1017,7 @@ _Open._ |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/LoggerConfigurationFactory.cs:36-43` | **Description** @@ -1050,9 +1046,7 @@ current "ScadaLink:Logging" path and reject `Serilog:MinimumLevel` if present (throw at startup so the operator sees the conflict). At minimum, expand the XML doc + REQ-HOST-8 to spell out the precedence explicitly. -**Resolution** - -_Open._ +**Resolution (2026-05-28):** `ScadaLink:Logging:MinimumLevel` is now the documented single source of truth for the Serilog floor (Host-011's `LoggingOptions` binding), and the precedence is made visible — `LoggerConfigurationFactory.Build` writes a one-shot warning to `Console.Error` when both `ScadaLink:Logging:MinimumLevel` and `Serilog:MinimumLevel` (or `Serilog:MinimumLevel:Default`) are present, naming both values and pointing the operator at the documented key. Order of operations is unchanged — `MinimumLevel.Is(...)` deliberately runs after `ReadFrom.Configuration(...)` so the ScadaLink value wins — but the silent-override behaviour is now loud. The class XML doc gained a Host-020 paragraph explicitly spelling out the precedence. A test-visible `Build(..., TextWriter warningWriter)` overload mirrors the `ParseLevel` Host-022 pattern so the warning can be asserted in unit tests; the production four-arg overload delegates with `Console.Error`. ### Host-021 — Microsoft `Logging:LogLevel` section in `appsettings.json` is dead config under Serilog @@ -1060,7 +1054,7 @@ _Open._ |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/appsettings.json:2-6` | **Description** @@ -1084,9 +1078,7 @@ explaining it is intentionally retained for non-Serilog tooling. Document the authoritative location (`Serilog` + `ScadaLink:Logging`) in `Component-Host.md` REQ-HOST-8 if not already explicit. -**Resolution** - -_Open._ +**Resolution (2026-05-28):** Confirmed by repository-wide grep that no code reads `Logging:LogLevel` (the Host calls `builder.Host.UseSerilog()` which replaces the default `ILoggerFactory` setup with Serilog as the only provider), so the block was pure dead config. Removed the `Logging:LogLevel:Default = Information` block from `appsettings.json` and replaced it with a `_logging` doc-key comment explaining the rationale (Serilog is the sole provider) and pointing operators at the two authoritative keys: `ScadaLink:Logging:MinimumLevel` for the floor (bound to `LoggingOptions` per Host-011) and the `Serilog` section for sinks (Host-014's `ReadFrom.Configuration`). The Host-014 regression test (`SerilogSinkConfigTests.ShippedAppSettings_HasSerilogSection_WithConsoleAndFileSinks`) still asserts the surviving `Serilog` section's shape, so removing the Microsoft block did not break the existing pinning. ### Host-022 — `ParseLevel` silently coerces unrecognised `MinimumLevel` to `Information` diff --git a/code-reviews/README.md b/code-reviews/README.md index 28ddcf46..16a09ab9 100644 --- a/code-reviews/README.md +++ b/code-reviews/README.md @@ -41,35 +41,35 @@ module file and counted in **Total**. |----------|---------------| | Critical | 0 | | High | 0 | -| Medium | 13 | -| Low | 20 | -| **Total** | **33** | +| Medium | 5 | +| Low | 1 | +| **Total** | **6** | ## Module Status | Module | Last reviewed | Commit | Open (C/H/M/L) | Open | Total | |--------|---------------|--------|----------------|------|-------| | [AuditLog](AuditLog/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/0 | 1 | 11 | -| [CLI](CLI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/1 | 1 | 23 | -| [CentralUI](CentralUI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/2 | 2 | 33 | -| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/3 | 3 | 14 | -| [Commons](Commons/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/3 | 3 | 23 | -| [Communication](Communication/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/1 | 1 | 22 | +| [CLI](CLI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 23 | +| [CentralUI](CentralUI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 33 | +| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 14 | +| [Commons](Commons/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 23 | +| [Communication](Communication/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 22 | | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 24 | | [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 22 | -| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/2 | 2 | 24 | +| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 24 | | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/0 | 1 | 23 | -| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/1 | 1 | 23 | -| [Host](Host/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/1/3 | 4 | 22 | +| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 23 | +| [Host](Host/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 22 | | [InboundAPI](InboundAPI/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 25 | | [ManagementService](ManagementService/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 23 | | [NotificationOutbox](NotificationOutbox/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 10 | | [NotificationService](NotificationService/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 25 | | [Security](Security/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 21 | -| [SiteCallAudit](SiteCallAudit/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/0 | 2 | 6 | -| [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/1 | 1 | 23 | -| [SiteRuntime](SiteRuntime/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/0 | 2 | 26 | -| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/2 | 5 | 24 | +| [SiteCallAudit](SiteCallAudit/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 6 | +| [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 23 | +| [SiteRuntime](SiteRuntime/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 26 | +| [StoreAndForward](StoreAndForward/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/0 | 0 | 24 | | [TemplateEngine](TemplateEngine/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/0 | 3 | 22 | | [Transport](Transport/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/1 | 1 | 12 | @@ -88,45 +88,18 @@ _None open._ _None open._ -### Medium (13) +### Medium (5) | ID | Module | Title | |----|--------|-------| | AuditLog-001 | [AuditLog](AuditLog/findings.md) | Combined-telemetry transport is plumbed end-to-end but never invoked in production | | ExternalSystemGateway-020 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `JsonElementToParameterValue` silently downcasts non-Int64 JSON numbers to `double`, losing precision for `decimal` SQL parameters on retry | -| Host-016 | [Host](Host/findings.md) | Site `CentralContactPoints` second entry targets the site's own remoting port | -| SiteCallAudit-001 | [SiteCallAudit](SiteCallAudit/findings.md) | SupervisorStrategy override is dead code; XML claims Resume that is not enforced | -| SiteCallAudit-003 | [SiteCallAudit](SiteCallAudit/findings.md) | `OnUpsertAsync` does not refresh `IngestedAtUtc`; direct-write callers must remember to stamp it | -| SiteRuntime-021 | [SiteRuntime](SiteRuntime/findings.md) | `HandleDeployArtifacts` updates `DataConnections` in SQLite but never sends `CreateConnectionCommand` to the DCL | -| SiteRuntime-022 | [SiteRuntime](SiteRuntime/findings.md) | `AuditingDbCommand.DbConnection.set` uses reflection to read `AuditingDbConnection._inner` | -| StoreAndForward-019 | [StoreAndForward](StoreAndForward/findings.md) | Notifications park after `DefaultMaxRetries` exhaustion, contradicting "retried until central acks" | -| StoreAndForward-020 | [StoreAndForward](StoreAndForward/findings.md) | `RetryParkedMessageAsync` skips standby replication when the message is deleted between local update and re-load | -| StoreAndForward-021 | [StoreAndForward](StoreAndForward/findings.md) | Design doc claims the Operation Tracking Table lives in StoreAndForward but the implementation is in SiteRuntime | | TemplateEngine-018 | [TemplateEngine](TemplateEngine/findings.md) | `DiffService` reports no entries for added/removed/changed connections | | TemplateEngine-019 | [TemplateEngine](TemplateEngine/findings.md) | `TemplateResolver.BuildInheritanceChain` still uses the `0`-as-no-parent sentinel that was removed from `CycleDetector` | | TemplateEngine-020 | [TemplateEngine](TemplateEngine/findings.md) | `Create*` audit entries are written with `EntityId = "0"` before `SaveChangesAsync` populates the real key | -### Low (20) +### Low (1) | ID | Module | Title | |----|--------|-------| -| CLI-020 | [CLI](CLI/findings.md) | `bundle export` success-envelope parse is unguarded | -| CentralUI-029 | [CentralUI](CentralUI/findings.md) | `ConfigurationAuditLog` uses `JS.InvokeAsync("eval", ...)` instead of a dedicated JS module | -| CentralUI-032 | [CentralUI](CentralUI/findings.md) | `AuditResultsGrid` paging is forward-only, no Previous button | -| ClusterInfrastructure-011 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | `SectionName` constant is decorative — no binding site references it | -| ClusterInfrastructure-013 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | Test uses catastrophic config values without an inline-intent comment | -| ClusterInfrastructure-014 | [ClusterInfrastructure](ClusterInfrastructure/findings.md) | `AddClusterInfrastructureActors` is dead surface — no caller, no behaviour | -| Commons-016 | [Commons](Commons/findings.md) | `BundleSession.Locked` uses a magic `3` rather than a named constant | -| Commons-018 | [Commons](Commons/findings.md) | `IOperationTrackingStore` and `IPartitionMaintenance` are at the root of `Interfaces/` instead of `Interfaces/Services/` | -| Commons-023 | [Commons](Commons/findings.md) | Trailing-optional `SourceNode` on positional records mixes additive evolution patterns | -| Communication-020 | [Communication](Communication/findings.md) | `SiteAddressCacheLoaded` carries mutable `Dictionary`/`List` types | -| DeploymentManager-021 | [DeploymentManager](DeploymentManager/findings.md) | `ResolveSiteIdentifierAsync` silently substitutes the DB id when the site row is missing | -| DeploymentManager-022 | [DeploymentManager](DeploymentManager/findings.md) | `Pending` and `InProgress` are written back-to-back with no intervening work | -| HealthMonitoring-021 | [HealthMonitoring](HealthMonitoring/findings.md) | `CentralSiteId = "central"` reserved constant silently collides with a real site named "central" | -| Host-018 | [Host](Host/findings.md) | Shipped per-role configs omit `NodeOptions.NodeName`, leaving `SourceNode` null | -| Host-020 | [Host](Host/findings.md) | `MinimumLevel.Is` silently overrides any operator-set `Serilog:MinimumLevel` | -| Host-021 | [Host](Host/findings.md) | Microsoft `Logging:LogLevel` section in `appsettings.json` is dead config under Serilog | -| SiteEventLogging-018 | [SiteEventLogging](SiteEventLogging/findings.md) | `FailedWriteCount` is exposed but never consumed by Health Monitoring | -| StoreAndForward-022 | [StoreAndForward](StoreAndForward/findings.md) | `NotifyCachedCallObserverAsync` silently drops the entire audit lifecycle when the message id is not a parseable `TrackedOperationId` | -| StoreAndForward-023 | [StoreAndForward](StoreAndForward/findings.md) | `siteId` silently defaults to empty when no `IStoreAndForwardSiteContext` is registered, degrading audit telemetry correlation | | Transport-012 | [Transport](Transport/findings.md) | "Bundle Import" filter promised in design doc not surfaced in Configuration Audit Log Viewer UI | diff --git a/code-reviews/SiteCallAudit/findings.md b/code-reviews/SiteCallAudit/findings.md index 4f7476a1..018d9407 100644 --- a/code-reviews/SiteCallAudit/findings.md +++ b/code-reviews/SiteCallAudit/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 4 | +| Open findings | 2 | ## Summary @@ -50,7 +50,7 @@ tests using a shared `MsSqlMigrationFixture`. |--|--| | Severity | Medium | | Category | Akka.NET conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs:32-46`, `src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs:147-151` | **Description** @@ -98,9 +98,7 @@ Either: The CLAUDE.md "Resume for coordinator actors" decision applies to actors with children (Site Runtime hierarchy) — not to leaf cluster singletons. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** Rewrote the class-level XML on `SiteCallAuditActor` plus the method-level XML on `SupervisorStrategy()` to accurately describe what the override does — a one-for-one strategy with `DefaultDecider` (Restart on most exceptions, Stop on `ActorInitializationException`/`ActorKilledException`) and `maxNrOfRetries: 0`, governing the actor's *children* (the actor has none today, so the override is currently inert). Dropped the misleading "Resume" claim. The new docs make clear that self-supervision of this cluster singleton is the parent `ClusterSingletonManager`'s concern and the actor's own resilience comes from the in-handler `try/catch` in `OnUpsertAsync`, not from this override. No behaviour change — pure documentation fix; existing 24 SiteCallAudit tests remain green. ### SiteCallAudit-002 — Singleton failover does not wait for in-flight async upserts @@ -154,7 +152,7 @@ Notification Outbox sibling has the same pattern. |--|--| | Severity | Medium | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs:153-193` | **Description** @@ -190,9 +188,7 @@ inconsistent with the dual-write code path and undocumented. Preferred: stamp inside the actor — same as the combined-telemetry path — because callers cannot in general know the actor is colocated on central. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** `OnUpsertAsync` now rewrites the incoming `SiteCall` via `cmd.SiteCall with { IngestedAtUtc = DateTime.UtcNow }` immediately before calling `repository.UpsertAsync`, mirroring `AuditLogIngestActor`'s combined-telemetry hot path. The repository writes `IngestedAtUtc` on both the insert-if-not-exists and the monotonic UPDATE legs (`SiteCallAuditRepository.UpsertAsync`), so the column is writable on every upsert. Callers (telemetry, the deferred reconciliation puller, any future direct-write) no longer need to remember to stamp a central-side timestamp — the actor owns it. Existing 24 SiteCallAudit tests remain green (the MSSQL-fixture test constructs rows with `DateTime.UtcNow` and doesn't assert the exact value, so the actor's re-stamp is backward compatible). ### SiteCallAudit-004 — Reconciliation puller and daily terminal-purge scheduler still deferred; design-doc drift diff --git a/code-reviews/SiteEventLogging/findings.md b/code-reviews/SiteEventLogging/findings.md index be5323f4..1be32617 100644 --- a/code-reviews/SiteEventLogging/findings.md +++ b/code-reviews/SiteEventLogging/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 3 | +| Open findings | 2 | ## Summary @@ -901,9 +901,11 @@ refused. |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:67-71,225-226` | +**Resolution (2026-05-28):** Took option (a)-via-(b) from the recommendation. Softened the `SiteEventLogger.FailedWriteCount` XML doc to describe the actual state ("Available for future Health Monitoring integration — the counter is correct and observable, but the central health-metric pipeline does not yet poll it"), removing the misleading "Health Monitoring can detect a logging outage" claim. The Health Monitoring wiring is left as a tracked follow-up (it requires a `ScadaLink.HealthMonitoring` source change that belongs in a different batch). Promoted `FailedWriteCount { get; }` onto `ISiteEventLogger` so the eventual Health consumer reads it through the interface without a concrete-type downcast. No behaviour change — pure documentation + interface-surface tidy-up; `SiteEventLogger` already exposed the property publicly, and no test fakes/mocks of `ISiteEventLogger` exist in the repo (grep confirms only `SiteEventLogger` implements it), so the interface addition is non-breaking. Existing 59 SiteEventLogging tests remain green. + **Description** `SiteEventLogger.FailedWriteCount` was added under SiteEventLogging-008 with the diff --git a/code-reviews/SiteRuntime/findings.md b/code-reviews/SiteRuntime/findings.md index 0d60cbf5..03985fdf 100644 --- a/code-reviews/SiteRuntime/findings.md +++ b/code-reviews/SiteRuntime/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 5 | +| Open findings | 3 | ## Summary @@ -1038,9 +1038,11 @@ be gated on "no instance with this name is currently terminating". |--|--| | Severity | Medium | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:931` | +**Resolution (2026-05-28):** Took the "refactor `EnsureDclConnections` into a shared field-based helper" path. Extracted a new `EnsureDclConnection(name, protocol, primaryJson, backupJson, failoverRetryCount)` method that owns the hash-cache check and the `CreateConnectionCommand` Tell — both the existing inline `EnsureDclConnections(configJson)` and the new artifact path now drive through it. `ComputeConnectionConfigHash` got a field-based overload so the artifact path (which carries data directly on `DataConnectionArtifact`) reuses the same hash logic as the `ConnectionConfig`-based inline path. To keep `_createdConnections` mutation actor-thread-confined (the artifact-deploy persistence runs inside a `Task.Run`), the off-thread persistence dispatches a new internal `ApplyArtifactDataConnectionsToDcl` message back to `Self` after the SQLite writes; the actor-thread handler then iterates and invokes `EnsureDclConnection`. The DCL only sees `CreateConnectionCommand` (no `Update`/`Delete` messages exist in the codebase, and `CreateConnectionCommand` is treated as upsert-by-name — same shape as the inline-config path). Build clean; 302 SiteRuntime tests green (the existing `EnsureDclConnections_ConnectionConfigChanged_ReissuesCreateCommand` regression test still passes through the refactored shared helper). + **Description** `HandleDeployArtifacts` persists the artifact bundle (shared scripts, external @@ -1088,9 +1090,11 @@ and artifact paths can drive through it. |--|--| | Severity | Medium | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteRuntime/Scripts/AuditingDbCommand.cs:138` | +**Resolution (2026-05-28):** Took the recommended "expose a proper API surface" path (the SiteRuntime-006 precedent). Added an `internal DbConnection Inner => _inner;` accessor to `AuditingDbConnection`; both classes are `internal sealed` in the same assembly, so the accessor stays out of the public API. The `AuditingDbCommand.DbConnection` setter now unwraps an `AuditingDbConnection` via `auditing.Inner` instead of `Type.GetField("_inner", BindingFlags.NonPublic | BindingFlags.Instance)!.GetValue(...)`. No reflection, no `!.` null-forgiveness hiding a runtime crash, no static-analyzer/IL-trim noise — and the same module that enforces "no `System.Reflection` in scripts" no longer reflects internally. The getter's `_wrappingConnection ?? _inner.Connection` fallback was left as-is; addressing the `CreateDbCommand()` round-trip concern is a separate behavioural decision (the finding marked it secondary). Build clean; 302 SiteRuntime tests green. + **Description** The `DbConnection` setter on `AuditingDbCommand` unwraps an diff --git a/code-reviews/StoreAndForward/findings.md b/code-reviews/StoreAndForward/findings.md index 2d97aedb..916840b1 100644 --- a/code-reviews/StoreAndForward/findings.md +++ b/code-reviews/StoreAndForward/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-28 | | Reviewer | claude-agent | | Commit reviewed | `1eb6e97` | -| Open findings | 5 (3 Deferred: 002, 011, 012; 5 new Open from Re-review 2026-05-28) | +| Open findings | 0 (3 Deferred: 002, 011, 012; all 5 Open from Re-review 2026-05-28 resolved 2026-05-28) | ## Summary @@ -1067,7 +1067,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:229`, `:407`–`:437`; `src/ScadaLink.StoreAndForward/StoreAndForwardOptions.cs:18`; `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs:1773`–`:1778`; `src/ScadaLink.NotificationService/NotificationDeliveryService.cs:149`–`:156` | **Description** @@ -1121,9 +1121,21 @@ the field value) so the invariant is enforced at the single chokepoint rather th relying on every caller to pass the right value — this also fixes the legacy `NotificationDeliveryService` path without editing the consumer. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** +VERIFY outcome — the design doc's "Notifications do not park" wording (lines 47, 59) +was the *operational intent* for the happy path, not an absolute invariant: the engine +has always enforced `DefaultMaxRetries` uniformly across every category, and every +sibling system (ESG, CachedDbWrite) bounds retry-then-parks for the same disk-pressure +and operator-visibility reasons. Removing the cap for notifications would let a single +unreachable central exhaust local disk via an unbounded buffer — worse than the +documented "park after retry budget" behaviour. Resolution is therefore the brief's +**default**: document the parking behaviour. Updated +`Component-StoreAndForward.md` lines 46/58 to clarify that the `DefaultMaxRetries` cap +applies uniformly (including to notifications) and that `maxRetries: 0` is the explicit +escape hatch for callers that need unbounded retry. Added a `StoreAndForward-019` block +to `StoreAndForwardOptions.DefaultMaxRetries`'s XML doc explaining the same invariant. +No behavioural code change — existing tests (104 in +`ScadaLink.StoreAndForward.Tests`) continue to pass. ### StoreAndForward-020 — `RetryParkedMessageAsync` skips standby replication when the message is deleted between local update and re-load @@ -1131,7 +1143,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:599`–`:616` | **Description** @@ -1209,9 +1221,16 @@ Add a regression test in `StoreAndForwardReplicationTests` that simulates the delete-between-update-and-reload race and asserts the `Requeue` replication operation is still emitted with the correct category. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** +Applied the brief's primary recommendation — `RetryParkedMessageAsync` now captures +the parked row up front via `GetMessageByIdAsync` (and rejects the call early if the +row is missing or no longer `Parked`), then performs the local `RetryParkedMessageAsync` +storage write, and finally reconstructs the post-requeue state on the captured POCO +(`Status = Pending, RetryCount = 0, LastError = null, LastAttemptAt = null`) and +replicates it. A concurrent `RemoveMessageAsync` or `DiscardParkedMessageAsync` running +between the local write and the original re-load can no longer skip replication — the +row is in hand. The category-fallback misllabelling on the racy path is gone because +the activity log uses the captured `Category` directly. ### StoreAndForward-021 — Design doc claims the Operation Tracking Table lives in StoreAndForward but the implementation is in SiteRuntime @@ -1219,7 +1238,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `docs/requirements/Component-StoreAndForward.md:21`, `:49`–`:51`, `:77`–`:87`, `:108`, `:114`; `src/ScadaLink.SiteRuntime/Tracking/OperationTrackingStore.cs:37`; `src/ScadaLink.StoreAndForward/` (whole module) | **Description** @@ -1274,9 +1293,18 @@ several refactors out of date. The hierarchical map should be: - `Component-SiteCallAudit.md` / `Component-AuditLog.md` → telemetry emission + central-side mirror. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** +Doc-side fix applied (per the brief, the simplest of the two options). Updated +`Component-StoreAndForward.md`: (1) removed the "Maintain a site-local operation +tracking table" line from Responsibilities and reworded the cached-call telemetry +responsibility to point at the `ICachedCallLifecycleObserver` hook; (2) renamed the +"Operation Tracking Table" section to "Operation Tracking Table (lives in Site +Runtime, not here)" with an explicit `StoreAndForward-021` callout cross-linking to +`Component-SiteRuntime.md` and the `IOperationTrackingStore` interface in +Commons. The rest of the section is retained for cross-component context (the +buffered cached-call rows carry `TrackedOperationId` so the link to the tracking row +must still be documented somewhere) but is reworded to make clear the table itself is +not owned here. ### StoreAndForward-022 — `NotifyCachedCallObserverAsync` silently drops the entire audit lifecycle when the message id is not a parseable `TrackedOperationId` @@ -1284,7 +1312,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:484`–`:515` | **Description** @@ -1333,9 +1361,14 @@ contract — the existing the fix is "log + skip", that test should be updated to also assert the log emission; if the fix is "emit anyway", the test should be replaced. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** +Applied the brief's "cheap fix" — the non-GUID skip path now logs a Warning naming +the offending `MessageId`, `Category` and `Outcome` before returning, so a +misconfigured caller is observable instead of silently bypassing the audit pipeline. +S&F retry bookkeeping remains untouched (the observer is still best-effort, the skip +still returns without throwing). The existing +`Attempt_MessageIdNotAGuid_NoObserverNotification` test still passes — its assertion +is on `_observer.Notifications` being empty, which is unchanged. ### StoreAndForward-023 — `siteId` silently defaults to empty when no `IStoreAndForwardSiteContext` is registered, degrading audit telemetry correlation @@ -1343,7 +1376,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.StoreAndForward/ServiceCollectionExtensions.cs:43`–`:53`; `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:99`, `:524` | **Description** @@ -1383,9 +1416,18 @@ absent (no `AddAuditLog`), keep the empty-string default since `_siteId` is unus Alternatively, change `siteId` from a parameter to a `Func` resolved lazily from the service provider so a late-registered context still takes effect. -**Resolution** - -_Unresolved._ +**Resolution (2026-05-28):** +Applied the brief's sentinel option (less invasive than throwing — preserves the +existing test wiring that constructs `StoreAndForwardService` without a site context). +Introduced `StoreAndForwardService.UnknownSiteSentinel = "$unknown-site"` (leading +`$` chosen so it cannot collide with a real site id) and the constructor now +normalises any null/empty/whitespace `siteId` argument to that sentinel. The empty +string can no longer reach `CachedCallAttemptContext.SourceSite`; a misconfigured +host without an `IStoreAndForwardSiteContext` produces audit rows tagged with the +sentinel — recognisably bad in the central audit log instead of silently merging +into the empty bucket. All 104 existing tests pass; the only test that asserts a +literal `SourceSite` (`CachedCallAttemptEmissionTests`) supplies `"site-77"` so the +normalisation is a no-op there. ### StoreAndForward-024 — `StopAsync` does not wait for an in-flight retry sweep, so disposed dependencies can be touched after shutdown diff --git a/docs/requirements/Component-StoreAndForward.md b/docs/requirements/Component-StoreAndForward.md index dcc048ab..f4b03ec0 100644 --- a/docs/requirements/Component-StoreAndForward.md +++ b/docs/requirements/Component-StoreAndForward.md @@ -18,8 +18,7 @@ Site clusters only. The central cluster does not buffer messages. - Retry delivery per message according to the configured retry policy. - Park messages that exhaust their retry limit (dead-letter). - Persist buffered messages to local SQLite for durability. -- Maintain a site-local **operation tracking table** holding one row per `TrackedOperationId` for cached calls (`ExternalCall` and `DatabaseWrite`) — the authoritative status record consulted by `Tracking.Status(id)`. -- Emit cached-call lifecycle telemetry to the central Site Call Audit component on every status transition. +- Emit cached-call lifecycle telemetry to the central Site Call Audit component via the `ICachedCallLifecycleObserver` hook (one notification per attempt outcome) so the audit pipeline can record each status transition. - Replicate buffered messages to the standby node via application-level replication over Akka.NET remoting. - On failover, the standby node takes over delivery from its replicated copy. - Respond to remote queries from central for parked message management (list, retry, discard), including central-driven Retry/Discard of parked cached calls. @@ -44,7 +43,7 @@ Attempt immediate delivery └── Max retries exhausted → Park message ``` -For notifications, "delivery" means forwarding the message to the central cluster via Central–Site Communication; "success" is central's ack, on which the message is cleared. Notifications do not park — they are retried at the fixed forward interval until central acks. Parking applies only to the external-system-call and cached-database-write categories. +For notifications, "delivery" means forwarding the message to the central cluster via Central–Site Communication; "success" is central's ack, on which the message is cleared. Notifications are retried at the fixed forward interval until central acks, but — like every other category — they are bounded by the engine's `DefaultMaxRetries` cap: a sustained central outage that exceeds `DefaultMaxRetries × forward-interval` will park the buffered notification, after which an operator can Retry/Discard it via the parked-message UI. Operationally, the cap is sized so the normal central-recovery window stays well inside it; "do not park" is the design's operational intent on the happy path, not an absolute invariant. Callers that genuinely require unbounded retry pass `maxRetries: 0` on `EnqueueAsync` (the documented "no limit" escape hatch — see `StoreAndForward-015`). For the cached-call categories (`ExternalCall` and `DatabaseWrite`), the operation tracking table is the status record and the S&F buffer is purely the retry mechanism. A cached call that succeeds on its first immediate attempt is written directly as a terminal `Delivered` tracking row and never enters the S&F buffer. When immediate delivery fails transiently, the message is buffered and its tracking row moves to `Pending`/`Retrying`; the buffered message carries its `TrackedOperationId` so the tracking row and the retry record stay linked. When immediate delivery fails **permanently** (e.g. HTTP 4xx), the message is not buffered — the error is returned synchronously to the calling script as before — but the tracking row is written directly as a terminal `Failed` row capturing the error. On every tracking-table status transition the site emits `CachedCallTelemetry` to central. @@ -56,7 +55,7 @@ For the external-system-call and cached-database-write categories, retry setting - **External systems**: Each external system definition includes max retry count and time between retries. - **Cached database writes**: Each database connection definition includes max retry count and time between retries. -The **notification** category retries differently: it has no source-entity setting. The site→central forward uses a single fixed retry interval configured in the host `appsettings.json`. This interval is infrastructure config for reaching the central cluster, not a per-notification-list setting. It applies uniformly to every buffered notification regardless of its target list. A buffered notification is retried until central acks it; it is not parked on a retry limit (central, once reachable, owns delivery, retry, and parking from that point on). +The **notification** category retries differently: it has no source-entity setting. The site→central forward uses a single fixed retry interval configured in the host `appsettings.json`. This interval is infrastructure config for reaching the central cluster, not a per-notification-list setting. It applies uniformly to every buffered notification regardless of its target list. A buffered notification is retried at that interval until central acks it; the engine's `DefaultMaxRetries` cap still applies (matching the cached-call categories) and a notification whose retries are exhausted under a sustained central outage parks like any other buffered message. The cap is sized so the normal central-recovery window stays well inside it — central, once reachable, owns delivery, retry, and parking from the ack point on. The retry interval is **fixed** (not exponential backoff). Fixed interval is sufficient for the expected use cases. @@ -74,14 +73,24 @@ There is **no maximum buffer size**. Messages accumulate in the buffer until del - On failover, the new active node has a near-complete copy of the buffer. In rare cases, the most recent operations may not have been replicated (e.g., a message added or removed just before failover). This can result in a few **duplicate deliveries** (message delivered but remove not replicated) or a few **missed retries** (message added but not replicated). Both are acceptable trade-offs for the latency benefit. - On failover, the new active node resumes delivery from its local copy. -### Operation Tracking Table +### Operation Tracking Table (lives in Site Runtime, not here) -Alongside the S&F buffer DB, each site node holds a **site-local operation tracking table** in SQLite. It carries one row per `TrackedOperationId` for cached calls (`ExternalCall` and `DatabaseWrite`), created the moment the script issues the cached call and kept regardless of outcome. +> **StoreAndForward-021:** the operation tracking table is **not** owned by +> this component. The `IOperationTrackingStore` interface lives in +> `src/ScadaLink.Commons/Interfaces/Services/`, and the SQLite-backed +> implementation (`OperationTrackingStore`, alongside `OperationTrackingOptions`) +> lives in [`src/ScadaLink.SiteRuntime/Tracking/`](../../src/ScadaLink.SiteRuntime/Tracking/). +> See [`Component-SiteRuntime.md`](Component-SiteRuntime.md) for the table's +> semantics, lifecycle, and central-mirror coordination — it is summarised here +> only because the S&F retry loop carries the `TrackedOperationId` linking a +> buffered cached-call row to its tracking entry. -- This table is the **status record**; the S&F buffer remains purely the **retry mechanism**. A buffered cached-call message references its `TrackedOperationId` back to its tracking row. +For context: each site node also holds a site-local operation tracking table in SQLite (owned by Site Runtime) carrying one row per `TrackedOperationId` for cached calls (`ExternalCall` and `DatabaseWrite`), created the moment the script issues the cached call and kept regardless of outcome. + +- That table is the **status record**; the S&F buffer remains purely the **retry mechanism**. A buffered cached-call message references its `TrackedOperationId` back to its tracking row. - Each row records the operation kind (`TrackedOperationKind`), a target summary (external system + method, or database connection name), the unified `TrackedOperationStatus`, retry count, last error, source provenance (instance / script), and the created/updated/terminal UTC timestamps. -- `Tracking.Status(id)` reads this table. For cached calls the **site is the authoritative source of truth** for status — the query is always answered site-locally, even when central is unreachable. The central Site Call Audit `SiteCalls` table is an eventually-consistent mirror. -- A cached call that succeeds on its first immediate attempt writes a terminal `Delivered` row directly here, with nothing placed in the S&F buffer. +- `Tracking.Status(id)` reads that table. For cached calls the **site is the authoritative source of truth** for status — the query is always answered site-locally, even when central is unreachable. The central Site Call Audit `SiteCalls` table is an eventually-consistent mirror. +- A cached call that succeeds on its first immediate attempt writes a terminal `Delivered` row directly there, with nothing placed in the S&F buffer. - Terminal rows are purged after a configurable retention window (default 7 days) — the site holds live operational state; central holds long-term audit. Notifications are unaffected: they have no tracking table. Their `NotificationId` and status are owned by the central `Notifications` table, and their lifecycle continues to forward to central exactly as before. diff --git a/src/ScadaLink.CLI/Commands/BundleCommands.cs b/src/ScadaLink.CLI/Commands/BundleCommands.cs index 3b5e4872..c4e04964 100644 --- a/src/ScadaLink.CLI/Commands/BundleCommands.cs +++ b/src/ScadaLink.CLI/Commands/BundleCommands.cs @@ -118,9 +118,33 @@ public static class BundleCommands timeout: BundleCommandTimeout, onSuccess: jsonOk => { - using var doc = JsonDocument.Parse(jsonOk); - var base64 = doc.RootElement.GetProperty("base64Bundle").GetString()!; - var byteCount = doc.RootElement.GetProperty("byteCount").GetInt32(); + // CLI-020: previously the JSON envelope parse + property extraction + + // base64 decode all ran unguarded — a server-side bug that omits one of + // the two expected properties, returns a null base64 value, sends invalid + // base64, or returns a malformed JSON envelope would surface as one of + // KeyNotFoundException / InvalidOperationException / FormatException / + // JsonException, i.e. an unhandled stack trace rather than the + // documented "exit 1 with a clean INVALID_RESPONSE error". Wrap the + // envelope parse and the streamed write in a single try/catch matching + // the graceful-degradation theme established by CLI-002 / CLI-003 / CLI-005. + string base64; + int byteCount; + try + { + using var doc = JsonDocument.Parse(jsonOk); + base64 = doc.RootElement.GetProperty("base64Bundle").GetString()!; + byteCount = doc.RootElement.GetProperty("byteCount").GetInt32(); + } + catch (Exception ex) when (ex is JsonException + or KeyNotFoundException + or InvalidOperationException) + { + OutputFormatter.WriteError( + $"Server returned a malformed bundle-export response: {ex.Message}", + "INVALID_RESPONSE"); + return 1; + } + // CLI-019: stream the base64 → file write so a 100 MB bundle // doesn't double-buffer through Convert.FromBase64String's // ~100 MB byte[] on the LOH plus a synchronous File.WriteAllBytes. @@ -128,7 +152,18 @@ public static class BundleCommands // jsonOk string (wire-format limit), but the decode + write // are now chunked, so peak working-set drops from // ~base64+byte[]+envelope to ~base64+small-chunk. - var written = StreamBase64ToFile(base64, output); + long written; + try + { + written = StreamBase64ToFile(base64, output); + } + catch (FormatException ex) + { + OutputFormatter.WriteError( + $"Server returned invalid base64 in the bundle response: {ex.Message}", + "INVALID_RESPONSE"); + return 1; + } Console.WriteLine($"Wrote {written:N0} bytes to {output} (server reported {byteCount:N0})."); return 0; }); diff --git a/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor b/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor index 74b086ad..ff2efe45 100644 --- a/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor +++ b/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor @@ -75,10 +75,21 @@
Page @_pageNumber · @_rows.Count rows - + @* CentralUI-032: keyset paging is naturally forward-only, but the + in-component _cursorStack lets the user step back through previous + pages by replaying the prior cursor. The Previous button is gated + on the stack having at least one prior cursor — i.e. we are not on + the first page. *@ +
+ + +
diff --git a/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor.cs b/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor.cs index cb965eba..84917884 100644 --- a/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor.cs +++ b/src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor.cs @@ -66,6 +66,16 @@ public partial class AuditResultsGrid : IAsyncDisposable private bool _loading; private string? _error; + // CentralUI-032: small in-component stack of prior-page cursors so the user + // can step backwards through results. Each Next push captures the cursor + // that produced the current page (null for page 1) before advancing; each + // Previous pop reloads the page at the recovered cursor. Mirrors the + // SiteCallsReport keyset-paging shape called out in the finding. + private readonly Stack _cursorStack = new(); + // The cursor that produced the page currently on screen — kept so Next can + // push it before advancing without recomputing it from _rows. + private AuditLogPaging? _currentPaging; + private AuditLogQueryFilter? _activeFilter; [Inject] private IJSRuntime JS { get; set; } = default!; @@ -196,6 +206,8 @@ public partial class AuditResultsGrid : IAsyncDisposable _activeFilter = Filter; _pageNumber = 1; _rows.Clear(); + _cursorStack.Clear(); + _currentPaging = null; if (Filter is not null) { await LoadAsync(paging: null); @@ -216,10 +228,36 @@ public partial class AuditResultsGrid : IAsyncDisposable AfterOccurredAtUtc: last.OccurredAtUtc, AfterEventId: last.EventId); + // CentralUI-032: remember the cursor that produced the current page so + // a later Previous can navigate back to it. The page-1 entry is pushed + // as null — LoadAsync treats null as "first page" (PageSize-only). + _cursorStack.Push(_currentPaging); await LoadAsync(cursor); _pageNumber++; } + // CentralUI-032: pops the previous-page cursor off the stack and reloads + // at that position. The pop only happens AFTER a successful reload — a + // failed page-fetch leaves the user on the current page with the error + // banner instead of stranding them between pages. + private async Task PrevPage() + { + if (_cursorStack.Count == 0 || _activeFilter is null) + { + return; + } + + var prior = _cursorStack.Peek(); + await LoadAsync(prior); + if (_error is null) + { + _cursorStack.Pop(); + _pageNumber = Math.Max(1, _pageNumber - 1); + } + } + + private bool CanGoBack => _cursorStack.Count > 0; + private async Task LoadAsync(AuditLogPaging? paging) { if (_activeFilter is null) @@ -235,6 +273,9 @@ public partial class AuditResultsGrid : IAsyncDisposable var page = await QueryService.QueryAsync(_activeFilter, effective); _rows.Clear(); _rows.AddRange(page); + // Track the cursor that produced the page now on screen so a later + // Next can push it onto the stack before advancing. + _currentPaging = paging; } catch (Exception ex) { diff --git a/src/ScadaLink.CentralUI/Components/Pages/Audit/ConfigurationAuditLog.razor b/src/ScadaLink.CentralUI/Components/Pages/Audit/ConfigurationAuditLog.razor index 2e19de40..fa41e344 100644 --- a/src/ScadaLink.CentralUI/Components/Pages/Audit/ConfigurationAuditLog.razor +++ b/src/ScadaLink.CentralUI/Components/Pages/Audit/ConfigurationAuditLog.razor @@ -245,14 +245,24 @@ // same query param doesn't re-run the query on every parameter set. private Guid? _lastFetchedBundleImportId; + // CentralUI-029: the browser-time JS module that hosts getTimezoneOffsetMinutes(). + // Loaded lazily on first render via dynamic import; replaces the previous + // `JS.InvokeAsync("eval", "new Date().getTimezoneOffset()")` call, which + // widened the JS-interop attack surface and was incompatible with strict CSP + // `script-src` directives that forbid `unsafe-eval`. + private const string BrowserTimeModulePath = "./_content/ScadaLink.CentralUI/js/browser-time.js"; + private IJSObjectReference? _browserTimeModule; + protected override async Task OnAfterRenderAsync(bool firstRender) { if (!firstRender) return; try { // Date.getTimezoneOffset() returns (UTC - local) in minutes. - _browserUtcOffsetMinutes = await JS.InvokeAsync( - "eval", "new Date().getTimezoneOffset()"); + _browserTimeModule ??= await JS.InvokeAsync( + "import", BrowserTimeModulePath); + _browserUtcOffsetMinutes = await _browserTimeModule.InvokeAsync( + "getTimezoneOffsetMinutes"); } catch (Exception ex) when (ex is JSException or JSDisconnectedException or InvalidOperationException or TaskCanceledException) diff --git a/src/ScadaLink.CentralUI/wwwroot/js/browser-time.js b/src/ScadaLink.CentralUI/wwwroot/js/browser-time.js new file mode 100644 index 00000000..1a202d32 --- /dev/null +++ b/src/ScadaLink.CentralUI/wwwroot/js/browser-time.js @@ -0,0 +1,13 @@ +// CentralUI-029: small JS module to replace the JS.InvokeAsync("eval", ...) +// anti-pattern previously used by ConfigurationAuditLog. Exporting a named +// function from an ES module: +// * removes the residual `eval` JS-interop surface, +// * is CSP-friendly (no `unsafe-eval` directive required), +// * matches the module-import pattern (`session-expiry.js`, `audit-grid.js`, +// `nav-state.js`, `transport.js`) the rest of the Central UI follows. +// +// The function returns the same value as `new Date().getTimezoneOffset()` — +// minutes of (UTC - local), positive for time zones west of UTC. +export function getTimezoneOffsetMinutes() { + return new Date().getTimezoneOffset(); +} diff --git a/src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs b/src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs index ceed81f9..56ee9e7a 100644 --- a/src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs +++ b/src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs @@ -20,11 +20,15 @@ namespace ScadaLink.ClusterInfrastructure; /// public class ClusterOptions { - /// - /// The appsettings.json section name this options class binds from. - /// Single source of truth so binding sites do not hard-code the magic string. - /// - public const string SectionName = "ScadaLink:Cluster"; + // ClusterInfra-011: the previous `public const string SectionName = "ScadaLink:Cluster";` + // was documented as "single source of truth so binding sites do not hard-code the + // magic string" but no caller ever read it — the Host's SiteServiceRegistration and + // StartupValidator both hard-code the literal directly. Wiring those binding sites + // to reference the constant lives in the Host's edit scope (a separate code-review + // task); rather than carry a public constant whose guarantee the code does not + // deliver, the constant is removed and the literal stays in the Host until the + // Host-side wiring is done. If a future Host change wants the constant back, add it + // when the binding sites can be updated in the same commit. /// /// Akka.NET cluster seed nodes. Both nodes are seed nodes — each node lists diff --git a/src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs b/src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs index 0c046e02..3fdf7dce 100644 --- a/src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.ClusterInfrastructure/ServiceCollectionExtensions.cs @@ -30,20 +30,17 @@ public static class ServiceCollectionExtensions return services; } - /// - /// Reserved for cluster-infrastructure actor registration. This component does - /// not register any actors — the Akka.NET bootstrap and actor wiring live in - /// ScadaLink.Host. The method throws rather than silently returning - /// success so that any caller assuming this component registers actors fails - /// fast with a clear cause instead of failing later, far from here. - /// - /// Always thrown. - /// The service collection (unused; method always throws). - public static IServiceCollection AddClusterInfrastructureActors(this IServiceCollection services) - { - throw new NotImplementedException( - "ScadaLink.ClusterInfrastructure registers no actors. The Akka.NET actor system " + - "bootstrap and all cluster actor registration live in ScadaLink.Host " + - "(AkkaHostedService). Do not call AddClusterInfrastructureActors()."); - } + // ClusterInfra-014: the previous `AddClusterInfrastructureActors` extension + // was dead surface — its XML doc told callers "do not call", its body + // unconditionally threw `NotImplementedException`, and no production caller + // existed anywhere in the solution (verified by grep). The CI-002 + // "throw loudly" decision was made while CI-001's ownership question was + // still open; that question is now permanently settled by the + // "Implementation Note — Code Placement" section of + // Component-ClusterInfrastructure.md, which records that all actor wiring + // lives in ScadaLink.Host (AkkaHostedService). Keeping a public extension + // method that exists only to throw was API-surface noise that an IDE would + // still suggest via auto-complete, so the method and its companion + // `AddClusterInfrastructureActors_ThrowsRatherThanSilentlySucceeding` test + // were both removed. } diff --git a/src/ScadaLink.Commons/Interfaces/IOperationTrackingStore.cs b/src/ScadaLink.Commons/Interfaces/Services/IOperationTrackingStore.cs similarity index 92% rename from src/ScadaLink.Commons/Interfaces/IOperationTrackingStore.cs rename to src/ScadaLink.Commons/Interfaces/Services/IOperationTrackingStore.cs index 0f0d20f4..377ee15b 100644 --- a/src/ScadaLink.Commons/Interfaces/IOperationTrackingStore.cs +++ b/src/ScadaLink.Commons/Interfaces/Services/IOperationTrackingStore.cs @@ -1,5 +1,11 @@ using ScadaLink.Commons.Types; +// Commons-018: physically lives under Interfaces/Services/ to match the +// established subfolder convention (REQ-COM-5b), but the namespace stays +// `ScadaLink.Commons.Interfaces` to avoid a cascading update to 9+ consumer +// files across ScadaLink.SiteRuntime, ScadaLink.AuditLog and ScadaLink.Host. +// Adopting the canonical `ScadaLink.Commons.Interfaces.Services` namespace +// can be picked up alongside any future Commons-wide namespace tidy-up. namespace ScadaLink.Commons.Interfaces; /// diff --git a/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs b/src/ScadaLink.Commons/Interfaces/Services/IPartitionMaintenance.cs similarity index 85% rename from src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs rename to src/ScadaLink.Commons/Interfaces/Services/IPartitionMaintenance.cs index b6908310..c4b3a492 100644 --- a/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs +++ b/src/ScadaLink.Commons/Interfaces/Services/IPartitionMaintenance.cs @@ -1,3 +1,9 @@ +// Commons-018: physically lives under Interfaces/Services/ to match the +// established subfolder convention (REQ-COM-5b), but the namespace stays +// `ScadaLink.Commons.Interfaces` to avoid a cascading update to consumers +// across ScadaLink.AuditLog and ScadaLink.ConfigurationDatabase. Adopting +// the canonical `ScadaLink.Commons.Interfaces.Services` namespace can be +// picked up alongside any future Commons-wide namespace tidy-up. namespace ScadaLink.Commons.Interfaces; /// diff --git a/src/ScadaLink.Commons/Types/TrackingStatusSnapshot.cs b/src/ScadaLink.Commons/Types/TrackingStatusSnapshot.cs index ea9b5a6c..20b96555 100644 --- a/src/ScadaLink.Commons/Types/TrackingStatusSnapshot.cs +++ b/src/ScadaLink.Commons/Types/TrackingStatusSnapshot.cs @@ -29,6 +29,11 @@ namespace ScadaLink.Commons.Types; /// Cluster node that submitted the cached call (e.g. "node-a" / /// "node-b"), captured at enqueue time. Null on rows persisted before /// the SourceNode stamping migration; stamping itself is wired in a later task. +/// Commons-023: trailing-optional with a = null default, matching the +/// SourceNode rollout convention now used on SiteCallSummary, +/// SiteCallDetail, NotificationSummary and NotificationDetail +/// — so existing positional construction sites keep compiling as new +/// optional fields land on this record. /// public sealed record TrackingStatusSnapshot( TrackedOperationId Id, @@ -43,4 +48,4 @@ public sealed record TrackingStatusSnapshot( DateTime? TerminalAtUtc, string? SourceInstanceId, string? SourceScript, - string? SourceNode); + string? SourceNode = null); diff --git a/src/ScadaLink.Commons/Types/Transport/BundleSession.cs b/src/ScadaLink.Commons/Types/Transport/BundleSession.cs index 82942684..6989080d 100644 --- a/src/ScadaLink.Commons/Types/Transport/BundleSession.cs +++ b/src/ScadaLink.Commons/Types/Transport/BundleSession.cs @@ -2,6 +2,16 @@ namespace ScadaLink.Commons.Types.Transport; public sealed class BundleSession { + /// + /// Commons-016: legacy per-session lockout threshold (kept on this type for the + /// shim getter). The authoritative, server-side per-bundle + /// counter is bounded by TransportOptions.MaxUnlockAttemptsPerSession + /// (default also 3) and is what BundleImporter.LoadAsync consults. + /// This constant exists so the comparison in uses a named + /// symbol that a security review can grep for, rather than a literal 3. + /// + public const int MaxUnlockAttempts = 3; + /// Unique identifier for this import session. public Guid SessionId { get; init; } /// Parsed manifest from the uploaded bundle. @@ -22,6 +32,7 @@ public sealed class BundleSession /// /// T-003 legacy: always false on a session returned by LoadAsync /// because lockout enforcement moved server-side; see . + /// The threshold is the named constant (default 3). /// - public bool Locked => FailedUnlockAttempts >= 3; + public bool Locked => FailedUnlockAttempts >= MaxUnlockAttempts; } diff --git a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs index 0e061f1a..39c8db45 100644 --- a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs +++ b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs @@ -410,7 +410,14 @@ public class CentralCommunicationActor : ReceiveActor contacts[site.SiteIdentifier] = addrs; } - return new SiteAddressCacheLoaded(contacts); + // Communication-020: freeze the cross-task payload before piping to + // Self. The message record exposes read-only types ( + // IReadOnlyDictionary / IReadOnlyList) so the Akka.NET message- + // immutability convention is enforced by type, not just convention. + var frozen = contacts.ToDictionary( + kvp => kvp.Key, + kvp => (IReadOnlyList)kvp.Value.AsReadOnly()); + return new SiteAddressCacheLoaded(frozen); }).PipeTo(self); } @@ -540,8 +547,14 @@ public record RefreshSiteAddresses; /// /// Internal message carrying the loaded site contact data from the database. /// ClusterClient creation happens on the actor thread in HandleSiteAddressCacheLoaded. +/// +/// Communication-020: the payload is exposed as +/// of so the Akka.NET "messages are immutable" +/// convention is enforced at the type level rather than relying on producer +/// discipline. The producer wraps the constructed buckets with +/// List<T>.AsReadOnly() before piping to Self. /// -internal record SiteAddressCacheLoaded(Dictionary> SiteContacts); +internal record SiteAddressCacheLoaded(IReadOnlyDictionary> SiteContacts); /// /// Notification sent to debug view subscribers when the stream is terminated diff --git a/src/ScadaLink.DeploymentManager/DeploymentService.cs b/src/ScadaLink.DeploymentManager/DeploymentService.cs index 7cb07ee8..2eed8320 100644 --- a/src/ScadaLink.DeploymentManager/DeploymentService.cs +++ b/src/ScadaLink.DeploymentManager/DeploymentService.cs @@ -103,11 +103,26 @@ public class DeploymentService /// /// Resolves the site's string identifier from the numeric DB ID. /// The communication layer routes by string identifier (e.g. "site-a"), not DB ID. + /// + /// DeploymentManager-021: when the row is missing (FK was + /// deleted, race with admin delete, DB inconsistency) the previous behaviour + /// silently substituted the numeric id rendered as a string — every + /// downstream `CommunicationService` call then failed with a confusing + /// "unknown site" routing error that hid the real cause. Treat a missing + /// site row as a hard validation failure: throw + /// naming the unresolved id so the + /// operator sees the actual problem. On the deploy path the existing + /// try/catch turns this into a Failed deployment record with a clear + /// message; lifecycle paths propagate it to the caller (CLI/UI) which + /// surface it as an error to the operator. /// private async Task ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken) { var site = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken); - return site?.SiteIdentifier ?? siteId.ToString(); + if (site == null) + throw new InvalidOperationException( + $"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing."); + return site.SiteIdentifier; } /// @@ -174,11 +189,23 @@ public class DeploymentService if (reconciled != null) return Result.Success(reconciled); - // WP-4: Create deployment record with Pending status + // WP-4: Create the deployment record directly in InProgress. + // + // DeploymentManager-022: the previous code wrote the record as Pending, + // then immediately updated it to InProgress with no work in between + // (flattening, validation, and reconciliation all completed above). The + // back-to-back write cost an extra SaveChangesAsync round-trip, an + // extra IDeploymentStatusNotifier push (CentralUI-006 rendered a + // Pending→InProgress flicker for ~ms), and an extra row-version bump + // for nothing. The transient Pending slot carried no operational + // meaning — it was set and immediately overwritten — so dropping it + // collapses the start of the deploy into a single insert + notify. + // InProgress remains the documented "sent to site, awaiting response" + // state, set immediately before the round-trip below. var record = new DeploymentRecord(deploymentId, user) { InstanceId = instanceId, - Status = DeploymentStatus.Pending, + Status = DeploymentStatus.InProgress, RevisionHash = revisionHash, DeployedAt = DateTimeOffset.UtcNow }; @@ -187,12 +214,6 @@ public class DeploymentService await _repository.SaveChangesAsync(cancellationToken); NotifyStatusChange(record); - // Update status to InProgress - record.Status = DeploymentStatus.InProgress; - await _repository.UpdateDeploymentRecordAsync(record, cancellationToken); - await _repository.SaveChangesAsync(cancellationToken); - NotifyStatusChange(record); - try { // WP-1: Send to site via CommunicationService diff --git a/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs b/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs index d6e52b50..661816c4 100644 --- a/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs +++ b/src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs @@ -18,8 +18,22 @@ public class CentralHealthReportLoop : BackgroundService /// /// Reserved siteId used to represent the central cluster in the /// shared CentralHealthAggregator keyspace. + /// + /// HealthMonitoring-021: the value is prefixed with $ — a character + /// that is forbidden in real site identifiers (the configuration / + /// repository layer only permits Sites whose SiteIdentifier is a + /// plain identifier) — so the synthetic central entry cannot collide with + /// a real site whose operator-set identifier happened to be the bare word + /// "central". A collision would have caused the two reports to clobber + /// each other in the aggregator keyspace via the sequence-number guard, + /// and the real site would inherit the longer + /// grace and + /// stay falsely-online for an extra two minutes after going down. + /// Consumers (, + /// the Central UI health dashboard) reference this constant rather than + /// the literal string, so the change is local. /// - public const string CentralSiteId = "central"; + public const string CentralSiteId = "$central"; private readonly ISiteHealthCollector _collector; private readonly ICentralHealthAggregator _aggregator; diff --git a/src/ScadaLink.Host/LoggerConfigurationFactory.cs b/src/ScadaLink.Host/LoggerConfigurationFactory.cs index e0d5170f..dcecda90 100644 --- a/src/ScadaLink.Host/LoggerConfigurationFactory.cs +++ b/src/ScadaLink.Host/LoggerConfigurationFactory.cs @@ -15,6 +15,15 @@ namespace ScadaLink.Host; /// set, console output template, file path and rolling interval are all /// configuration-driven (defined in appsettings.json), not hard-coded. The /// explicit MinimumLevel.Is below pins the floor from . +/// +/// Host-020: ScadaLink:Logging:MinimumLevel is the single source of truth +/// for the floor — the explicit MinimumLevel.Is call deliberately runs +/// AFTER ReadFrom.Configuration so a Serilog:MinimumLevel entry in +/// configuration is overridden. To make that precedence visible (so an operator +/// who sets Serilog:MinimumLevel does not wonder why the change had no +/// effect), writes a one-shot warning to +/// when both keys are present. Pick one path — +/// editing Serilog:MinimumLevel alone has no effect. /// public static class LoggerConfigurationFactory { @@ -29,11 +38,47 @@ public static class LoggerConfigurationFactory string nodeRole, string siteId, string nodeHostname) + => Build(configuration, nodeRole, siteId, nodeHostname, Console.Error); + + /// + /// Test-visible overload of + /// that routes the Host-020 precedence warning through a caller-supplied + /// writer so unit tests can capture it. Production calls the four-arg + /// overload which uses . + /// + /// Application configuration supplying the Serilog section and logging options. + /// Role label added as a log enrichment property. + /// Site identifier added as a log enrichment property. + /// Hostname added as a log enrichment property. + /// Writer that receives the one-shot Host-020 override-warning when both keys are present. + internal static LoggerConfiguration Build( + IConfiguration configuration, + string nodeRole, + string siteId, + string nodeHostname, + TextWriter warningWriter) { var loggingOptions = new LoggingOptions(); configuration.GetSection("ScadaLink:Logging").Bind(loggingOptions); - var minimumLevel = ParseLevel(loggingOptions.MinimumLevel); + var minimumLevel = ParseLevel(loggingOptions.MinimumLevel, warningWriter); + + // Host-020: warn once if the operator also set a Serilog:MinimumLevel — + // they almost certainly expected it to take effect, but the explicit + // MinimumLevel.Is call below silently overrides it. The warning is + // emitted only when the conflicting key is actually present (a bare + // "Default" value is what ReadFrom.Configuration reads); a missing / + // empty Serilog:MinimumLevel section is silent. + var serilogMinimumLevel = configuration["Serilog:MinimumLevel"] + ?? configuration["Serilog:MinimumLevel:Default"]; + if (!string.IsNullOrWhiteSpace(serilogMinimumLevel)) + { + warningWriter.WriteLine( + $"warning: Serilog:MinimumLevel ('{serilogMinimumLevel}') is being overridden by " + + $"ScadaLink:Logging:MinimumLevel ('{loggingOptions.MinimumLevel ?? "Information (default)"}'). " + + "ScadaLink:Logging:MinimumLevel is the documented source of truth for the floor (Host-011); " + + "remove the Serilog:MinimumLevel entry to silence this warning."); + } return new LoggerConfiguration() .ReadFrom.Configuration(configuration) diff --git a/src/ScadaLink.Host/appsettings.Central.json b/src/ScadaLink.Host/appsettings.Central.json index 1fd94dc9..f8464b05 100644 --- a/src/ScadaLink.Host/appsettings.Central.json +++ b/src/ScadaLink.Host/appsettings.Central.json @@ -1,9 +1,11 @@ { "ScadaLink": { + "_nodeName": "Host-018: NodeName stamps SourceNode on AuditLog/Notifications/SiteCalls rows (CLAUDE.md 'Centralized Audit Log' decision) and backs IX_AuditLog_Node_Occurred. Convention: 'central-a'/'central-b' for central nodes, 'node-a'/'node-b' for site nodes. Override per-node in multi-node deployments (the docker per-node configs do this). When left at the default below, single-node dev rows are stamped with 'central-a'; an empty value normalises to a NULL SourceNode.", "Node": { "Role": "Central", "NodeHostname": "localhost", - "RemotingPort": 8081 + "RemotingPort": 8081, + "NodeName": "central-a" }, "Cluster": { "SeedNodes": [ diff --git a/src/ScadaLink.Host/appsettings.Site.json b/src/ScadaLink.Host/appsettings.Site.json index 87a6a347..93a6becd 100644 --- a/src/ScadaLink.Host/appsettings.Site.json +++ b/src/ScadaLink.Host/appsettings.Site.json @@ -1,11 +1,13 @@ { "ScadaLink": { + "_nodeName": "Host-018: NodeName stamps SourceNode on AuditLog/Notifications/SiteCalls rows (CLAUDE.md 'Centralized Audit Log' decision) and backs IX_AuditLog_Node_Occurred. Convention: 'node-a'/'node-b' for site nodes, 'central-a'/'central-b' for central nodes. Override per-node in multi-node deployments (the docker per-node configs do this). When left at the default below, single-node dev rows are stamped with 'node-a'; an empty value normalises to a NULL SourceNode.", "Node": { "Role": "Site", "NodeHostname": "localhost", "SiteId": "site-a", "RemotingPort": 8082, - "GrpcPort": 8083 + "GrpcPort": 8083, + "NodeName": "node-a" }, "Cluster": { "SeedNodes": [ @@ -31,9 +33,9 @@ "ReplicationEnabled": true }, "Communication": { + "_centralContactPoints": "Host-016: each entry MUST be a central node's remoting endpoint, NOT this site's own remoting port. The single dev-loopback default below points only at central-a (localhost:8081). In a multi-central deployment add the second central node here (e.g. 'akka.tcp://scadalink@central-b-host:8081') so ClusterClient can fail over when central-a is down. The previous template listed localhost:8082 as the second contact — that is THIS site's own RemotingPort and is a permanent failure in the initial-contact rotation.", "CentralContactPoints": [ - "akka.tcp://scadalink@localhost:8081", - "akka.tcp://scadalink@localhost:8082" + "akka.tcp://scadalink@localhost:8081" ], "DeploymentTimeout": "00:02:00", "LifecycleTimeout": "00:00:30", diff --git a/src/ScadaLink.Host/appsettings.json b/src/ScadaLink.Host/appsettings.json index 8de97a3e..e2d67821 100644 --- a/src/ScadaLink.Host/appsettings.json +++ b/src/ScadaLink.Host/appsettings.json @@ -1,9 +1,5 @@ { - "Logging": { - "LogLevel": { - "Default": "Information" - } - }, + "_logging": "Host-021: Serilog is the sole logger provider (Program.cs calls builder.Host.UseSerilog()), so the standard Microsoft 'Logging:LogLevel' block has no effect and was removed. The minimum level is set via 'ScadaLink:Logging:MinimumLevel' (bound to LoggingOptions per Host-011); sinks are defined under the 'Serilog' section below and applied via ReadFrom.Configuration (Host-014). See LoggerConfigurationFactory + Component-Host.md REQ-HOST-8.", "Serilog": { "Using": [ "Serilog.Sinks.Console", diff --git a/src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs b/src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs index 77bee5ea..9a8614f2 100644 --- a/src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs +++ b/src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs @@ -36,9 +36,18 @@ namespace ScadaLink.SiteCallAudit; /// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" — /// the actor catches every exception from the repository call and replies /// Accepted=false without rethrowing, so the central singleton stays -/// alive. The uses Resume so an -/// unexpected throw before the catch (defence in depth) does not restart the -/// actor and reset in-flight state. +/// alive. The override governs the actor's +/// children, not the actor itself; this actor has no children today, +/// so the override is currently inert. It returns a one-for-one strategy with +/// (Restart on most +/// exceptions, Stop on / +/// ) and maxNrOfRetries: 0, so any +/// future child that throws is Stopped on the first failure — a deliberate +/// "fail loudly" posture for the central singleton's eventual sub-actors +/// (reconciliation puller, purge scheduler). Self-supervision of this actor +/// is whatever the parent +/// supplies; the in-handler try/catch in +/// is what actually keeps the singleton alive across repository faults. /// /// /// Two constructors exist for the same reason as @@ -147,7 +156,18 @@ public class SiteCallAuditActor : ReceiveActor Receive(HandleDiscardSiteCall); } - /// + /// + /// SiteCallAudit-001: child supervision strategy — governs children, not this + /// actor. The actor has no children today, so this override is inert; it + /// returns a one-for-one strategy with the framework + /// (Restart on + /// most exceptions; Stop on / + /// ) and maxNrOfRetries: 0, so any + /// future child that throws is Stopped on the first failure. The actor's + /// own resilience comes from the try/catch in + /// plus the parent 's + /// supervision — not from this override. + /// protected override SupervisorStrategy SupervisorStrategy() { return new OneForOneStrategy(maxNrOfRetries: 0, withinTimeRange: TimeSpan.Zero, decider: @@ -179,7 +199,14 @@ public class SiteCallAuditActor : ReceiveActor try { - await repository.UpsertAsync(cmd.SiteCall).ConfigureAwait(false); + // SiteCallAudit-003: stamp IngestedAtUtc at central-side persist + // time on every upsert, mirroring AuditLogIngestActor's combined- + // telemetry hot path. IngestedAtUtc is the "central ingested (or + // last refreshed) this row" timestamp; callers (telemetry, + // future reconciliation puller, direct-writes) cannot in general + // know they are running on central, so the actor owns the stamp. + var siteCall = cmd.SiteCall with { IngestedAtUtc = DateTime.UtcNow }; + await repository.UpsertAsync(siteCall).ConfigureAwait(false); replyTo.Tell(new UpsertSiteCallReply(id, Accepted: true)); } catch (Exception ex) diff --git a/src/ScadaLink.SiteEventLogging/ISiteEventLogger.cs b/src/ScadaLink.SiteEventLogging/ISiteEventLogger.cs index a51a760b..22c467b7 100644 --- a/src/ScadaLink.SiteEventLogging/ISiteEventLogger.cs +++ b/src/ScadaLink.SiteEventLogging/ISiteEventLogger.cs @@ -27,4 +27,14 @@ public interface ISiteEventLogger string source, string message, string? details = null); + + /// + /// SiteEventLogging-018: total number of event writes that have failed + /// (SQLite error, disk full, bounded-queue overflow drop, etc.) since this + /// logger was created. Available for future Health Monitoring integration — + /// promoted onto the interface so a Health consumer can read it without a + /// concrete-type downcast. Not yet polled by Health Monitoring; the wiring + /// is tracked separately. + /// + long FailedWriteCount { get; } } diff --git a/src/ScadaLink.SiteEventLogging/SiteEventLogger.cs b/src/ScadaLink.SiteEventLogging/SiteEventLogger.cs index b61c0734..a09e7275 100644 --- a/src/ScadaLink.SiteEventLogging/SiteEventLogger.cs +++ b/src/ScadaLink.SiteEventLogging/SiteEventLogger.cs @@ -90,9 +90,15 @@ public class SiteEventLogger : ISiteEventLogger, IDisposable } /// - /// Number of event writes that have failed (SQLite error, disk full, etc.) - /// since this logger was created. Surfaced so Health Monitoring can detect a - /// logging outage instead of relying on a local log line nobody is watching. + /// SiteEventLogging-018: number of event writes that have failed (SQLite + /// error, disk full, bounded-queue overflow drop, etc.) since this logger + /// was created. Available for future Health Monitoring integration — the + /// counter is correct and observable, but the central health-metric + /// pipeline does not yet poll it, so a sustained non-zero value currently + /// goes unnoticed in production beyond the per-failure log line. Wiring + /// the metric into the 30-second site-metric publish is tracked + /// separately; promoted to so the eventual + /// consumer reads it without a concrete-type downcast. /// public long FailedWriteCount => Interlocked.Read(ref _failedWriteCount); diff --git a/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs index 03b39944..b617145c 100644 --- a/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -132,6 +132,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // WP-33: Handle system-wide artifact deployment Receive(HandleDeployArtifacts); + // SiteRuntime-021: artifact-deploy DCL push, dispatched back from the + // off-thread persistence task so the hash-cache mutation stays + // actor-thread-confined. + Receive(HandleApplyArtifactDataConnectionsToDcl); + // Debug View — route to Instance Actors Receive(RouteDebugViewSubscribe); Receive(RouteDebugViewUnsubscribe); @@ -642,23 +647,12 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers foreach (var (name, connConfig) in config.Connections) { - var configHash = ComputeConnectionConfigHash(connConfig); - if (_createdConnections.TryGetValue(name, out var lastHash) && lastHash == configHash) - continue; - - var primaryDetails = FlattenConnectionConfig(connConfig.Protocol, connConfig.ConfigurationJson); - var backupDetails = string.IsNullOrEmpty(connConfig.BackupConfigurationJson) - ? null - : FlattenConnectionConfig(connConfig.Protocol, connConfig.BackupConfigurationJson); - - _dclManager.Tell(new Commons.Messages.DataConnection.CreateConnectionCommand( - name, connConfig.Protocol, primaryDetails, backupDetails, connConfig.FailoverRetryCount)); - - var changed = _createdConnections.ContainsKey(name); - _createdConnections[name] = configHash; - _logger.LogInformation( - "{Action} DCL connection {Connection} (protocol={Protocol})", - changed ? "Updated" : "Created", name, connConfig.Protocol); + EnsureDclConnection( + name, + connConfig.Protocol, + connConfig.ConfigurationJson, + connConfig.BackupConfigurationJson, + connConfig.FailoverRetryCount); } } catch (Exception ex) @@ -667,20 +661,78 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } } + /// + /// SiteRuntime-021: hash-guarded DCL connection push shared by the inline + /// per-instance path () and the + /// system-wide artifact-deploy path (). + /// Unchanged config is a no-op; a changed endpoint/credentials/backup/ + /// failover-count re-issues a CreateConnectionCommand so a system- + /// wide artifact-deploy makes its data-connection change live immediately + /// (the artifact-deploy path previously only persisted to SQLite — the + /// DCL didn't see the change until next instance redeploy or node + /// restart, contradicting the "site is self-contained after artifact + /// deployment" intent). + /// + private void EnsureDclConnection( + string name, + string protocol, + string? primaryConfigurationJson, + string? backupConfigurationJson, + int failoverRetryCount) + { + if (_dclManager == null) return; + + var configHash = ComputeConnectionConfigHashCore( + protocol, primaryConfigurationJson, backupConfigurationJson, failoverRetryCount); + if (_createdConnections.TryGetValue(name, out var lastHash) && lastHash == configHash) + return; + + var primaryDetails = FlattenConnectionConfig(protocol, primaryConfigurationJson); + var backupDetails = string.IsNullOrEmpty(backupConfigurationJson) + ? null + : FlattenConnectionConfig(protocol, backupConfigurationJson); + + _dclManager.Tell(new Commons.Messages.DataConnection.CreateConnectionCommand( + name, protocol, primaryDetails, backupDetails, failoverRetryCount)); + + var changed = _createdConnections.ContainsKey(name); + _createdConnections[name] = configHash; + _logger.LogInformation( + "{Action} DCL connection {Connection} (protocol={Protocol})", + changed ? "Updated" : "Created", name, protocol); + } + /// /// Computes a stable hash over the configuration fields that affect how the DCL /// connects, so a changed endpoint/credential/backup/failover count is detected /// (SiteRuntime-010). /// private static string ComputeConnectionConfigHash( - Commons.Types.Flattening.ConnectionConfig connConfig) + Commons.Types.Flattening.ConnectionConfig connConfig) => + ComputeConnectionConfigHashCore( + connConfig.Protocol, + connConfig.ConfigurationJson, + connConfig.BackupConfigurationJson, + connConfig.FailoverRetryCount); + + /// + /// SiteRuntime-021: field-based core so the system-wide artifact-deploy + /// path (which carries protocol/config-json/backup-json/failover directly + /// on ) can + /// share the same hash + skip-or-resend logic as the inline-config path. + /// + private static string ComputeConnectionConfigHashCore( + string protocol, + string? primaryConfigurationJson, + string? backupConfigurationJson, + int failoverRetryCount) { var material = string.Join( - "", - connConfig.Protocol, - connConfig.ConfigurationJson ?? string.Empty, - connConfig.BackupConfigurationJson ?? string.Empty, - connConfig.FailoverRetryCount.ToString()); + "", + protocol, + primaryConfigurationJson ?? string.Empty, + backupConfigurationJson ?? string.Empty, + failoverRetryCount.ToString(System.Globalization.CultureInfo.InvariantCulture)); var bytes = System.Security.Cryptography.SHA256.HashData( System.Text.Encoding.UTF8.GetBytes(material)); @@ -983,6 +1035,20 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers dc.Name, dc.Protocol, dc.PrimaryConfigurationJson, dc.BackupConfigurationJson, dc.FailoverRetryCount); } + + // SiteRuntime-021: after the SQLite store, dispatch an + // internal message back to the actor thread so the DCL + // push runs through EnsureDclConnection — keeping the + // _createdConnections hash cache mutation actor-thread- + // confined while still making the change live immediately + // (previously the change landed in SQLite but the DCL + // kept using the stale connection until next instance + // redeploy or node restart, contradicting "site is + // self-contained after artifact deployment"). The + // helper's hash cache skips unchanged definitions, so + // the push is idempotent for re-deploys of the same + // artifact bundle. + Self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections)); } // Store SMTP configurations @@ -1044,6 +1110,27 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _logger.LogDebug("Created Instance Actor for {Instance}", instanceName); } + /// + /// SiteRuntime-021: actor-thread handler that pushes artifact-deploy data + /// connection definitions to the DCL via the shared + /// helper. Dispatched from + /// 's off-thread Task so the + /// hash-cache mutation stays + /// actor-thread-confined. + /// + private void HandleApplyArtifactDataConnectionsToDcl(ApplyArtifactDataConnectionsToDcl msg) + { + foreach (var dc in msg.DataConnections) + { + EnsureDclConnection( + dc.Name, + dc.Protocol, + dc.PrimaryConfigurationJson, + dc.BackupConfigurationJson, + dc.FailoverRetryCount); + } + } + /// /// Gets the count of active Instance Actors (for testing/diagnostics). /// @@ -1085,4 +1172,14 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers /// A redeployment command buffered until the previous Instance Actor terminates. /// internal record PendingRedeploy(DeployInstanceCommand Command, IActorRef OriginalSender); + + /// + /// SiteRuntime-021: internal message dispatched from + /// 's off-thread persistence task back + /// onto the actor thread, so the DCL push (and its hash-cache mutation) + /// runs through without crossing + /// thread-confinement boundaries. + /// + internal record ApplyArtifactDataConnectionsToDcl( + IReadOnlyList DataConnections); } diff --git a/src/ScadaLink.SiteRuntime/Scripts/AuditingDbCommand.cs b/src/ScadaLink.SiteRuntime/Scripts/AuditingDbCommand.cs index 2a06c3e9..e24827a2 100644 --- a/src/ScadaLink.SiteRuntime/Scripts/AuditingDbCommand.cs +++ b/src/ScadaLink.SiteRuntime/Scripts/AuditingDbCommand.cs @@ -135,15 +135,20 @@ internal sealed class AuditingDbCommand : DbCommand // the wrapper, but writes from the user go through to the inner // command so the underlying provider keeps its wiring intact. get => _wrappingConnection ?? _inner.Connection; + // SiteRuntime-022: unwrap the AuditingDbConnection wrapper via its + // own internal Inner accessor instead of reflecting into a private + // _inner field. Reflection was the original SiteRuntime-006 anti- + // pattern (and is forbidden inside script bodies by the trust + // model) — both classes are internal sealed in the same assembly, + // so the proper API surface is available without leaking anything + // public. set { _wrappingConnection = value; _inner.Connection = value switch { - AuditingDbConnection auditing => auditing.GetType() - .GetField("_inner", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic) - !.GetValue(auditing) as DbConnection, - _ => value + AuditingDbConnection auditing => auditing.Inner, + _ => value, }; } } diff --git a/src/ScadaLink.SiteRuntime/Scripts/AuditingDbConnection.cs b/src/ScadaLink.SiteRuntime/Scripts/AuditingDbConnection.cs index f3aded14..be9fe290 100644 --- a/src/ScadaLink.SiteRuntime/Scripts/AuditingDbConnection.cs +++ b/src/ScadaLink.SiteRuntime/Scripts/AuditingDbConnection.cs @@ -86,6 +86,18 @@ internal sealed class AuditingDbConnection : DbConnection _parentExecutionId = parentExecutionId; } + /// + /// SiteRuntime-022: exposes the wrapped to the + /// sibling in the same assembly, so the + /// command's DbConnection setter can unwrap an + /// without reflecting into the + /// private _inner field. Both classes are internal sealed + /// in this assembly, so the accessor stays out of the public API and + /// matches the SiteRuntime-006 precedent of preferring proper API surface + /// over . + /// + internal DbConnection Inner => _inner; + /// // ConnectionString is settable on DbConnection — forward both halves. public override string ConnectionString diff --git a/src/ScadaLink.StoreAndForward/ServiceCollectionExtensions.cs b/src/ScadaLink.StoreAndForward/ServiceCollectionExtensions.cs index 8c7a964e..f923f4fa 100644 --- a/src/ScadaLink.StoreAndForward/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.StoreAndForward/ServiceCollectionExtensions.cs @@ -42,6 +42,11 @@ public static class ServiceCollectionExtensions // ISiteIdentityProvider — HealthMonitoring already references S&F. var cachedCallObserver = sp.GetService(); var siteContext = sp.GetService(); + // StoreAndForward-023: pass null/empty through unchanged — the + // service constructor normalises it to UnknownSiteSentinel so a + // host without an IStoreAndForwardSiteContext registration is + // observable in the central audit log instead of producing a + // silent empty-string SourceSite. var siteId = siteContext?.SiteId ?? string.Empty; return new StoreAndForwardService( storage, diff --git a/src/ScadaLink.StoreAndForward/StoreAndForwardOptions.cs b/src/ScadaLink.StoreAndForward/StoreAndForwardOptions.cs index 5db885d9..d025f353 100644 --- a/src/ScadaLink.StoreAndForward/StoreAndForwardOptions.cs +++ b/src/ScadaLink.StoreAndForward/StoreAndForwardOptions.cs @@ -14,7 +14,24 @@ public class StoreAndForwardOptions /// WP-10: Default retry interval for messages without per-source settings. public TimeSpan DefaultRetryInterval { get; set; } = TimeSpan.FromSeconds(30); - /// WP-10: Default maximum retry count before parking. + /// + /// WP-10: Default maximum retry count before parking. Applied when an + /// EnqueueAsync caller does not pass an explicit maxRetries. + /// + /// StoreAndForward-019: this default is enforced uniformly across + /// every category, including : + /// once the buffered message's retry count reaches this cap the engine + /// parks the row. The Component-StoreAndForward.md "notifications do not + /// park" wording reflects the operational intent when central is + /// reachable on the normal cadence; under a sustained central outage that + /// exceeds DefaultMaxRetries × forward-interval a buffered + /// notification will park and surface in the parked-message UI, + /// matching the rest of the system's bounded-retry-then-park behaviour. + /// Callers that genuinely require unbounded retry must pass + /// maxRetries: 0 on EnqueueAsync (the documented "no limit" + /// escape hatch — see StoreAndForwardService.EnqueueAsync). + /// + /// public int DefaultMaxRetries { get; set; } = 50; /// WP-10: Interval for the background retry timer sweep. diff --git a/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs b/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs index b3943d17..07162323 100644 --- a/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs +++ b/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs @@ -46,8 +46,31 @@ public class StoreAndForwardService /// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the /// cached-call attempt context so the audit bridge can build the /// half of the telemetry packet. + /// + /// StoreAndForward-023: an empty-string site id must never reach + /// downstream consumers — the central audit pipeline keys + /// (SourceSite, TrackedOperationId) off this value, so an empty + /// string degrades correlation to a per-id-only index and breaks the + /// per-site routing of RetryParkedOperation/DiscardParkedOperation + /// commands. The constructor normalises a null/empty/whitespace + /// argument to + /// so a misconfigured host (no IStoreAndForwardSiteContext + /// registered) produces a distinctive marker in the central audit log + /// rather than silently merging multiple sites into the empty bucket. + /// /// private readonly string _siteId; + + /// + /// StoreAndForward-023: distinctive marker stamped onto cached-call audit + /// telemetry when the host has not registered an + /// . Chosen with a leading $ + /// so it cannot collide with a real site id (which is a configuration + /// identifier and never starts with $). Surfacing this in the + /// central audit log makes a missing site-context binding immediately + /// recognisable instead of an unattributable empty string. + /// + public const string UnknownSiteSentinel = "$unknown-site"; private Timer? _retryTimer; private int _retryInProgress; @@ -120,7 +143,11 @@ public class StoreAndForwardService _logger = logger; _replication = replication; _cachedCallObserver = cachedCallObserver; - _siteId = siteId; + // StoreAndForward-023: normalise an empty / whitespace site id to the + // distinctive UnknownSiteSentinel so downstream consumers (the central + // audit pipeline keying off SourceSite) never see an empty string and + // a misconfigured host is recognisable in the central log. + _siteId = string.IsNullOrWhiteSpace(siteId) ? UnknownSiteSentinel : siteId; } /// @@ -583,8 +610,21 @@ public class StoreAndForwardService if (!TrackedOperationId.TryParse(message.Id, out var trackedId)) { - // Pre-M3 message (random GUID-N id from S&F itself, no - // TrackedOperationId threaded in). Skip — no audit row to bind to. + // StoreAndForward-022: previously a silent skip — but a non-GUID + // message id means a caller bypassed the audit hot path with zero + // feedback. The drop is still best-effort (S&F retry bookkeeping + // must never depend on the audit pipeline) but it is now observable + // via a Warning so a misconfigured caller can be diagnosed. + // Engine-minted ids (Guid.NewGuid().ToString("N")) and the current + // caller set (NotificationOutbox enqueue with NotificationId, + // cached-call enqueue with TrackedOperationId.ToString()) all + // parse — this log line fires only when a future caller supplies a + // non-GUID id, which is exactly when the silent-drop was hardest + // to diagnose. + _logger.LogWarning( + "Cached-call audit observer skipped: message id {MessageId} is not a parseable TrackedOperationId (category {Category}, outcome {Outcome}). " + + "Audit lifecycle for this operation will have no rows.", + message.Id, message.Category, outcome); return; } @@ -667,26 +707,51 @@ public class StoreAndForwardService /// so a failover preserves the operator's retry intent. /// StoreAndForward-017: the activity-log entry carries the message's true /// category rather than a hard-coded one. + /// StoreAndForward-020: the parked row is captured before the local + /// requeue write rather than re-read after it, so a concurrent + /// RemoveMessageAsync or DiscardParkedMessageAsync running + /// between the two storage calls cannot leave the standby in Parked + /// while the active node has already requeued — we always have the row in + /// hand for the Requeue replication. /// /// The identifier of the message to retry. /// True if successfully retried, false otherwise. public async Task RetryParkedMessageAsync(string messageId) { - var success = await _storage.RetryParkedMessageAsync(messageId); - if (success) + // StoreAndForward-020: capture the parked row up front so the standby + // gets a Requeue even if a concurrent writer (a sweep delete after a + // successful delivery, or an operator discard) removes the row between + // the local update and the re-load. The storage call below is + // conditional on status = Parked, so if the row has already moved we + // return false here without replicating — the standby's matching row + // will be reconciled by whichever other operator path won the race. + var captured = await _storage.GetMessageByIdAsync(messageId); + if (captured is null || captured.Status != StoreAndForwardMessageStatus.Parked) { - // Re-load the requeued row so the activity log gets the real category - // and the standby gets the post-requeue state (Pending, retry_count = 0). - var message = await _storage.GetMessageByIdAsync(messageId); - var category = message?.Category ?? StoreAndForwardCategory.ExternalSystem; - if (message != null) - { - _replication?.ReplicateRequeue(message); - } - RaiseActivity("Retry", category, - $"Parked message {messageId} moved back to queue"); + return false; } - return success; + + var success = await _storage.RetryParkedMessageAsync(messageId); + if (!success) + { + return false; + } + + // The active node just rewrote this row to Pending with retry_count = 0 + // and cleared last_error / last_attempt_at (see + // StoreAndForwardStorage.RetryParkedMessageAsync). Reconstruct the + // post-requeue state on the captured POCO so the standby applies the + // same mutations even if a concurrent writer has already deleted the + // row underneath us. + captured.Status = StoreAndForwardMessageStatus.Pending; + captured.RetryCount = 0; + captured.LastError = null; + captured.LastAttemptAt = null; + _replication?.ReplicateRequeue(captured); + + RaiseActivity("Retry", captured.Category, + $"Parked message {messageId} moved back to queue"); + return true; } /// diff --git a/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs b/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs index 7cb035cf..cf18ca89 100644 --- a/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs +++ b/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsTests.cs @@ -35,17 +35,26 @@ public class ClusterOptionsTests Assert.Empty(options.SeedNodes); } - [Fact] - public void SectionName_IsTheExpectedAppSettingsSection() - { - // CI-005: ClusterOptions must expose a single-source-of-truth constant for - // its appsettings.json section so binding sites do not hard-code the string. - Assert.Equal("ScadaLink:Cluster", ClusterOptions.SectionName); - } + // ClusterInfra-011: SectionName constant deleted — the previous test + // `SectionName_IsTheExpectedAppSettingsSection` is removed alongside it. + // The Host's SiteServiceRegistration / StartupValidator continue to + // reference the `"ScadaLink:Cluster"` literal directly; reinstating the + // constant should happen when those Host binding sites can be updated in + // the same change. [Fact] public void Properties_CanBeSetToCustomValues() { + // ClusterInfra-013: this test exercises the POCO property setters only — + // `SplitBrainResolverStrategy = "keep-majority"` and `MinNrOfMembers = 2` + // are values the design doc explicitly forbids in production + // (`keep-majority` causes total shutdown on a two-node partition; + // `MinNrOfMembers = 2` blocks the cluster singleton after failover). + // The POCO accepts any value by design; rejection lives in + // `ClusterOptionsValidator` and is covered by + // `ClusterOptionsValidatorTests.UnsupportedSplitBrainStrategy_FailsValidation` + // and `ClusterOptionsValidatorTests.MinNrOfMembers_NotOne_FailsValidation`. + // Do NOT read these values as endorsed runtime configuration. var options = new ClusterOptions { SeedNodes = new List { "akka.tcp://system@node1:2551", "akka.tcp://system@node2:2551" }, diff --git a/tests/ScadaLink.ClusterInfrastructure.Tests/ServiceCollectionExtensionsTests.cs b/tests/ScadaLink.ClusterInfrastructure.Tests/ServiceCollectionExtensionsTests.cs index b7a431b9..c54007bf 100644 --- a/tests/ScadaLink.ClusterInfrastructure.Tests/ServiceCollectionExtensionsTests.cs +++ b/tests/ScadaLink.ClusterInfrastructure.Tests/ServiceCollectionExtensionsTests.cs @@ -4,11 +4,11 @@ using Microsoft.Extensions.Options; namespace ScadaLink.ClusterInfrastructure.Tests; /// -/// CI-002: Tests that the DI extension methods do real work rather than -/// silently returning success. -/// must register the so misconfiguration -/// fails fast, and the unimplemented actor-registration placeholder must fail -/// loudly rather than masquerade as a completed registration. +/// CI-002: Tests that +/// does real work rather than silently returning success — it must register +/// the so misconfiguration fails fast. +/// (The companion actor-registration test was removed alongside the deleted +/// `AddClusterInfrastructureActors` extension method — see ClusterInfra-014.) /// public class ServiceCollectionExtensionsTests { @@ -48,11 +48,10 @@ public class ServiceCollectionExtensionsTests Assert.Contains("MinNrOfMembers", ex.Message); } - [Fact] - public void AddClusterInfrastructureActors_ThrowsRatherThanSilentlySucceeding() - { - var services = new ServiceCollection(); - - Assert.Throws(() => services.AddClusterInfrastructureActors()); - } + // ClusterInfra-014: `AddClusterInfrastructureActors_ThrowsRatherThanSilentlySucceeding` + // was removed alongside the now-deleted `AddClusterInfrastructureActors` + // extension method. The Akka.NET actor wiring legitimately lives in + // `ScadaLink.Host` (AkkaHostedService) per the + // Component-ClusterInfrastructure.md "Implementation Note — Code Placement" + // section; this project no longer exposes an actor-registration extension. } diff --git a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs index ddd4a984..a8f94c5c 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs @@ -5,6 +5,7 @@ using Microsoft.Extensions.Options; using NSubstitute; using ScadaLink.Commons.Entities.Deployment; using ScadaLink.Commons.Entities.Instances; +using ScadaLink.Commons.Entities.Sites; using ScadaLink.Commons.Interfaces.Repositories; using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Messages.Deployment; @@ -44,7 +45,7 @@ public class DeploymentServiceTests : TestKit OperationLockTimeout = TimeSpan.FromSeconds(5) }); - var siteRepo = Substitute.For(); + var siteRepo = CreateSiteRepoStub(); _service = new DeploymentService( _repo, siteRepo, _pipeline, _comms, _lockManager, _audit, new DiffService(), @@ -101,6 +102,91 @@ public class DeploymentServiceTests : TestKit Assert.Contains("Validation failed", result.Error); } + // ── DeploymentManager-021: missing Site row -> hard failure, no silent fabrication ── + + [Fact] + public async Task DeployInstanceAsync_SiteRowMissing_FailsLoudlyInsteadOfSilentlySubstituting() + { + // DeploymentManager-021 regression: previously ResolveSiteIdentifierAsync + // silently returned the numeric siteId rendered as a string when the + // site row was missing (FK was deleted, race with admin delete, DB + // inconsistency). That bogus identifier then surfaced downstream as a + // confusing "unknown site" routing error that hid the real cause. + // + // After the fix the resolver throws InvalidOperationException naming + // the unresolved id; on the deploy path the existing try/catch turns + // it into a Failed deployment record whose error message reflects the + // actual problem. + var instance = new Instance("OrphanInst") + { + Id = 99, SiteId = 42, State = InstanceState.NotDeployed + }; + _repo.GetInstanceByIdAsync(99, Arg.Any()).Returns(instance); + SetupValidPipeline(99, "OrphanInst", "sha256:abc"); + + // Build a fresh service whose siteRepo explicitly returns null for the + // instance's SiteId (the helper above seeds every id, so we shadow it + // for SiteId=42 only). + var siteRepo = CreateSiteRepoStub(); + siteRepo.GetSiteByIdAsync(42, Arg.Any()).Returns((Site?)null); + + var service = new DeploymentService( + _repo, siteRepo, _pipeline, _comms, _lockManager, _audit, + new DiffService(), + new DeploymentStatusNotifier(NullLogger.Instance), + Options.Create(new DeploymentManagerOptions { OperationLockTimeout = TimeSpan.FromSeconds(5) }), + NullLogger.Instance); + + var result = await service.DeployInstanceAsync(99, "admin"); + + Assert.True(result.IsFailure); + // The descriptive message names the unresolved id so the operator sees + // the actual problem (missing site row), not a downstream routing error. + Assert.Contains("42", result.Error); + Assert.Contains("not found", result.Error, StringComparison.OrdinalIgnoreCase); + } + + // ── DeploymentManager-022: no transient Pending -> single InProgress insert ── + + [Fact] + public async Task DeployInstanceAsync_NoTransientPendingWrite_RecordCreatedDirectlyInProgress() + { + // DeploymentManager-022 regression: previously the deploy path wrote + // the record as Pending, then immediately updated it to InProgress + // with no work in between — an extra SaveChangesAsync round-trip, an + // extra notifier push, and a Pending->InProgress flicker in the + // CentralUI deployment-status page. After the fix the record is + // inserted directly in InProgress (one Add + one notify); no + // intermediate Pending row is ever persisted or notified. + var instance = new Instance("DirectInProgressInst") + { + Id = 200, SiteId = 1, State = InstanceState.NotDeployed + }; + _repo.GetInstanceByIdAsync(200, Arg.Any()).Returns(instance); + SetupValidPipeline(200, "DirectInProgressInst", "sha256:dp22"); + + // The catch path later flips the same record reference to Failed, so + // snapshot the Status at insert time rather than reading the live + // reference at assertion time. + DeploymentStatus? statusAtInsert = null; + await _repo.AddDeploymentRecordAsync( + Arg.Do(r => statusAtInsert = r.Status), Arg.Any()); + + // The communication actor is unset so the call throws after the insert; + // we only care about the status the insert was made with. + await _service.DeployInstanceAsync(200, "admin"); + + // The single Add happens with the record already in InProgress. + Assert.NotNull(statusAtInsert); + Assert.Equal(DeploymentStatus.InProgress, statusAtInsert!.Value); + + // No Pending update was issued — the resolver never wrote the + // intermediate Pending row. + await _repo.DidNotReceive().UpdateDeploymentRecordAsync( + Arg.Is(r => r.Status == DeploymentStatus.Pending), + Arg.Any()); + } + // ── WP-2: Deployment identity ── [Fact] @@ -581,7 +667,7 @@ public class DeploymentServiceTests : TestKit NullLogger.Instance); comms.SetCommunicationActor(commActor); - var siteRepo = Substitute.For(); + var siteRepo = CreateSiteRepoStub(); return new DeploymentService( _repo, siteRepo, _pipeline, comms, _lockManager, _audit, new DiffService(), @@ -598,6 +684,30 @@ public class DeploymentServiceTests : TestKit new FlatteningPipelineResult(config, revisionHash, ValidationResult.Success()))); } + /// + /// DeploymentManager-021 test helper: returns an + /// substitute that resolves + /// for ANY integer id to a stub whose + /// SiteIdentifier is "site-{id}". Prior to the + /// DeploymentManager-021 fix the production `ResolveSiteIdentifierAsync` + /// silently substituted the numeric id when the site row was missing, so + /// these tests passed without seeding any Sites. After the fix a missing + /// site throws — every test that drives a deploy/lifecycle path needs a + /// real-shaped back, and this helper centralises that + /// arrangement so individual tests don't repeat the boilerplate. + /// + private static ISiteRepository CreateSiteRepoStub() + { + var siteRepo = Substitute.For(); + siteRepo.GetSiteByIdAsync(Arg.Any(), Arg.Any()) + .Returns(callInfo => + { + var id = callInfo.ArgAt(0); + return new Site($"Test Site {id}", $"site-{id}") { Id = id }; + }); + return siteRepo; + } + [Fact] public async Task DeployInstanceAsync_PriorInProgressRecord_SiteHasTargetHash_MarksSuccessWithoutRedeploy() { @@ -994,7 +1104,7 @@ public class DeploymentServiceTests : TestKit NullLogger.Instance); comms.SetCommunicationActor(commActor); - var siteRepo = Substitute.For(); + var siteRepo = CreateSiteRepoStub(); var service = new DeploymentService( _repo, siteRepo, _pipeline, comms, _lockManager, _audit, new DiffService(), @@ -1049,7 +1159,7 @@ public class DeploymentServiceTests : TestKit NullLogger.Instance); comms.SetCommunicationActor(commActor); - var siteRepo = Substitute.For(); + var siteRepo = CreateSiteRepoStub(); var deadline = TimeSpan.FromMilliseconds(300); var service = new DeploymentService( _repo, siteRepo, _pipeline, comms, _lockManager, _audit, @@ -1098,7 +1208,7 @@ public class DeploymentServiceTests : TestKit NullLogger.Instance); comms.SetCommunicationActor(commActor); - var siteRepo = Substitute.For(); + var siteRepo = CreateSiteRepoStub(); var service = new DeploymentService( _repo, siteRepo, _pipeline, comms, _lockManager, _audit, new DiffService(), @@ -1143,7 +1253,7 @@ public class DeploymentServiceTests : TestKit NullLogger.Instance); comms.SetCommunicationActor(commActor); - var siteRepo = Substitute.For(); + var siteRepo = CreateSiteRepoStub(); var service = new DeploymentService( _repo, siteRepo, _pipeline, comms, _lockManager, _audit, new DiffService(), diff --git a/tests/ScadaLink.DeploymentManager.Tests/DeploymentStatusNotifierTests.cs b/tests/ScadaLink.DeploymentManager.Tests/DeploymentStatusNotifierTests.cs index 8049fc1b..3a1ee809 100644 --- a/tests/ScadaLink.DeploymentManager.Tests/DeploymentStatusNotifierTests.cs +++ b/tests/ScadaLink.DeploymentManager.Tests/DeploymentStatusNotifierTests.cs @@ -4,6 +4,7 @@ using Microsoft.Extensions.Options; using NSubstitute; using ScadaLink.Commons.Entities.Deployment; using ScadaLink.Commons.Entities.Instances; +using ScadaLink.Commons.Entities.Sites; using ScadaLink.Commons.Interfaces.Repositories; using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Types; @@ -46,7 +47,17 @@ public class DeploymentStatusNotifierTests : TestKit OperationLockTimeout = TimeSpan.FromSeconds(5) }); + // DeploymentManager-021: the resolver now throws when the site row + // is missing, so seed the substitute to return a real-shaped Site for + // any id these tests touch. var siteRepo = Substitute.For(); + siteRepo.GetSiteByIdAsync(Arg.Any(), Arg.Any()) + .Returns(callInfo => + { + var id = callInfo.ArgAt(0); + return new Site($"Test Site {id}", $"site-{id}") { Id = id }; + }); + _service = new DeploymentService( _repo, siteRepo, _pipeline, _comms, _lockManager, _audit, new DiffService(), _notifier, options, @@ -68,14 +79,20 @@ public class DeploymentStatusNotifierTests : TestKit _notifier.StatusChanged += c => changes.Add(c); // _comms has no actor set, so the deploy reaches the catch block and - // the record ends Failed. The notifier must fire for the Pending, - // InProgress and Failed writes — not be silent (the pre-fix behaviour). + // the record ends Failed. The notifier must fire for the InProgress + // and Failed writes — not be silent (the pre-fix behaviour). + // + // DeploymentManager-022: the transient Pending write was dropped from + // the deploy path (the record is now created directly in InProgress), + // so there is no Pending notification any more. The remaining two + // writes — the initial InProgress insert and the catch-block Failed + // update — must each raise a status-change. var result = await _service.DeployInstanceAsync(7, "admin"); Assert.True(result.IsFailure); Assert.NotEmpty(changes); Assert.All(changes, c => Assert.Equal(7, c.InstanceId)); - Assert.Contains(changes, c => c.Status == DeploymentStatus.Pending); + Assert.DoesNotContain(changes, c => c.Status == DeploymentStatus.Pending); Assert.Contains(changes, c => c.Status == DeploymentStatus.InProgress); Assert.Contains(changes, c => c.Status == DeploymentStatus.Failed); diff --git a/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs index b2975f63..f8327343 100644 --- a/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs +++ b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs @@ -108,4 +108,54 @@ public class LoggerConfigurationTests Assert.Equal(LogEventLevel.Warning, result); Assert.Empty(writer.ToString()); } + + /// + /// Host-020: ScadaLink:Logging:MinimumLevel is the documented source + /// of truth for the Serilog floor, and the explicit MinimumLevel.Is + /// call deliberately runs after ReadFrom.Configuration(...) so a + /// Serilog:MinimumLevel entry is overridden. To make that precedence + /// visible — instead of silently swallowed — + /// writes a one-shot warning when both keys are present. The warning must + /// name both values and point the operator at the documented key. When the + /// Serilog key is absent the warning is silent. + /// + [Fact] + public void Build_BothMinimumLevelKeysSet_WarnsAboutOverride() + { + var writer = new StringWriter(); + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["ScadaLink:Logging:MinimumLevel"] = "Warning", + ["Serilog:MinimumLevel"] = "Debug", + }) + .Build(); + + LoggerConfigurationFactory.Build(configuration, "Central", "central", "node1", writer); + + var warning = writer.ToString(); + Assert.Contains("warning", warning, StringComparison.OrdinalIgnoreCase); + Assert.Contains("Serilog:MinimumLevel", warning); + Assert.Contains("ScadaLink:Logging:MinimumLevel", warning); + Assert.Contains("Debug", warning); + Assert.Contains("Warning", warning); + } + + [Fact] + public void Build_OnlyScadaLinkMinimumLevelSet_NoOverrideWarning() + { + var writer = new StringWriter(); + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["ScadaLink:Logging:MinimumLevel"] = "Warning", + }) + .Build(); + + LoggerConfigurationFactory.Build(configuration, "Central", "central", "node1", writer); + + // No Serilog override -> no override-warning. (The ScadaLink value is + // a recognised level, so ParseLevel is silent too.) + Assert.Empty(writer.ToString()); + } }