From fd618cf1dc111d7b213b78e2f3756e514981afb7 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 20 Jun 2026 17:55:12 -0400 Subject: [PATCH] =?UTF-8?q?fix(review):=20full=20code-review=20remediation?= =?UTF-8?q?=20=E2=80=94=205=20High=20+=20Medium/Low=20across=2016=20module?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remediation from the full per-module code review at 4307c381 (findings recorded separately in code-reviews/). Highs fixed: - DeploymentManager-025/SiteRuntime-031: stop broadcasting notification lists + SMTP configs (incl. credentials) to sites; site purges already-persisted rows on apply (enforces the central-only delivery design; clears plaintext SMTP creds at rest). - DataConnectionLayer-023: guard the native-alarm subscribe path against the mid-flight-unsubscribe adapter-feed leak (mirrors the DCL-021 tag-path fix). - SiteEventLogging-024: normalize From/To query bounds to UTC (the -016 fix the audit trail claimed but never committed). - KpiHistory-001: add an in-flight guard to the recorder sample tick. - ScriptAnalysis-001: harden the trust analyzer's TPA-absent fallback (resolve forbidden anchors in the minimal reference set; warn on degraded mode) — anchors added to validation references only, never the compile gate. (InboundAPI-026 left to the feat/ipsen-movein effort per owner decision.) Medium/Low: DM-026 deterministic deploy-status tiebreaker; SR-027/028/029/030 native-alarm leak/phantom-active/delete-during-redeploy fixes; AL-013/014/016; TE-024 (folder-mutation audit rows now persisted)/025; SF-025 gauge-provider clear-on-stop; ESG-025/026; SEC-023/024/025; SCA-007/008/009; plus doc/test accuracy COM-023/024, HOST-025/026, HM-024/025, NS-027/028. Full-solution build 0 warnings; ~3560 tests across 18 touched suites green. --- docs/requirements/Component-AuditLog.md | 13 +- docs/requirements/Component-Communication.md | 34 ++- .../Component-HealthMonitoring.md | 2 +- docs/requirements/Component-Host.md | 2 + docs/requirements/Component-ScriptAnalysis.md | 14 +- docs/requirements/Component-SiteCallAudit.md | 10 +- .../Central/AuditLogIngestActor.cs | 107 +++++---- .../Central/AuditLogPurgeOptions.cs | 10 + .../Observability/ScadaBridgeTelemetry.cs | 25 ++ .../Types/Kpi/KpiSeriesBucketer.cs | 16 +- .../DeploymentManagerRepository.cs | 8 + .../Actors/DataConnectionActor.cs | 36 ++- .../Adapters/MxGatewayDataConnection.cs | 13 ++ .../ArtifactDeploymentService.cs | 85 ++++--- .../DatabaseGateway.cs | 20 +- .../ZB.MOM.WW.ScadaBridge.Host.csproj | 1 + .../KpiHistoryRecorderActor.cs | 42 +++- .../ManagementActor.cs | 19 ++ .../ScriptTrustPolicy.cs | 133 ++++++++++- .../ScriptTrustValidator.cs | 39 +++- .../SiteCallAuditActor.cs | 220 ++++++++++++++---- .../EventLogQueryService.cs | 13 +- .../Actors/DeploymentManagerActor.cs | 89 +++++-- .../Actors/InstanceActor.cs | 26 +++ .../Actors/NativeAlarmActor.cs | 41 +++- .../Actors/SiteReplicationActor.cs | 14 +- .../Messages/NativeAlarmDropped.cs | 23 ++ .../Persistence/SiteStorageService.cs | 24 ++ .../StoreAndForwardService.cs | 85 +++++-- .../Services/TemplateFolderService.cs | 12 + .../Validation/SemanticValidator.cs | 37 --- .../AuditLogOptionsBindingTests.cs | 51 ++++ .../Kpi/KpiSeriesBucketerTests.cs | 56 +++++ .../RepositoryCoverageTests.cs | 25 ++ .../DataConnectionActorAlarmTests.cs | 75 ++++++ .../ArtifactDeploymentServiceTests.cs | 30 ++- .../DatabaseGatewayTests.cs | 71 +++++- .../KpiHistoryRecorderActorTests.cs | 91 +++++++- .../RoleMappingValidationTests.cs | 117 ++++++++++ .../NotificationOutboxKpiSampleSourceTests.cs | 4 +- .../SmtpErrorClassifierTests.cs | 4 +- .../ScriptTrustValidatorTests.cs | 128 ++++++++++ .../SecurityTests.cs | 91 +++++++- .../SiteCallAuditPurgeTests.cs | 45 ++++ .../SiteCallAuditReconciliationTests.cs | 135 +++++++++++ .../EventLogQueryServiceTests.cs | 36 +++ .../Actors/DeploymentManagerRedeployTests.cs | 56 +++++ .../Actors/NativeAlarmActorTests.cs | 75 +++++- .../Persistence/ArtifactStorageTests.cs | 37 +++ .../QueueDepthGaugeTests.cs | 78 +++++++ .../Services/TemplateFolderServiceTests.cs | 116 ++++++++- .../Validation/SemanticValidatorTests.cs | 18 -- 52 files changed, 2239 insertions(+), 313 deletions(-) create mode 100644 src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Messages/NativeAlarmDropped.cs create mode 100644 tests/ZB.MOM.WW.ScadaBridge.ManagementService.Tests/RoleMappingValidationTests.cs rename tests/{ZB.MOM.WW.ScadaBridge.NotificationService.Tests => ZB.MOM.WW.ScadaBridge.NotificationOutbox.Tests}/Kpi/NotificationOutboxKpiSampleSourceTests.cs (98%) diff --git a/docs/requirements/Component-AuditLog.md b/docs/requirements/Component-AuditLog.md index feb26170..650b24ea 100644 --- a/docs/requirements/Component-AuditLog.md +++ b/docs/requirements/Component-AuditLog.md @@ -404,7 +404,7 @@ MS SQL for direct-write events). Unredacted secrets never persist. silently skipped (the global switch already covers them). The DELETE runs under `scadabridge_audit_purger` (the maintenance role); the append-only writer role is unaffected. Batch size is configurable via - `AuditLogPurge:ChannelPurgeBatchSize` (default 5000). Each channel override + `AuditLog:Purge:ChannelPurgeBatchSize` (default 5000). Each channel override runs in its own try/catch, mirroring the per-boundary error-isolation of the partition switch-out loop. Values are validated to be in `[30, RetentionDays]`; keys that are not a recognized `AuditChannel` enum name @@ -498,12 +498,15 @@ the global window; `PerChannelRetentionDays` specifies per-channel windows that are strictly shorter — any channel whose override equals or exceeds the global value is silently ignored (the global partition switch-out already governs it). -`AuditLogPurge` section controls the purge actor cadence and batch size: +The nested `AuditLog:Purge` section controls the purge actor cadence and batch +size: ```jsonc -"AuditLogPurge": { - "IntervalHours": 24, - "ChannelPurgeBatchSize": 5000 +"AuditLog": { + "Purge": { + "IntervalHours": 24, + "ChannelPurgeBatchSize": 5000 + } } ``` diff --git a/docs/requirements/Component-Communication.md b/docs/requirements/Component-Communication.md index 69d85d95..783327ab 100644 --- a/docs/requirements/Component-Communication.md +++ b/docs/requirements/Component-Communication.md @@ -82,14 +82,19 @@ Both central and site clusters. Each side has communication actors that handle m The streaming protocol is defined in `sitestream.proto` (`src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto`): -- **Service**: `SiteStreamService` with a single RPC `SubscribeInstance(InstanceStreamRequest) returns (stream SiteStreamEvent)`. -- **Messages**: `InstanceStreamRequest` (correlation_id, instance_unique_name), `SiteStreamEvent` (correlation_id, oneof event: `AttributeValueUpdate`, `AlarmStateUpdate`). +- **Service**: `SiteStreamService` — hosted on each site node by `SiteStreamGrpcServer` — exposes five RPCs. One is the original real-time **server-streaming** subscription; the other four are **unary request/response** calls added by the Audit Log (#23) and Site Call Audit (#22) components. A unary call is request/response and is distinct from the command/control ClusterClient channel — gRPC on this service is no longer real-time-stream-only: + - `SubscribeInstance(InstanceStreamRequest) returns (stream SiteStreamEvent)` — the real-time debug stream (§6); the only server-streaming RPC. + - `IngestAuditEvents(AuditEventBatch) returns (IngestAck)` — central-side **ingest** receiving surface for Audit Log (#23) telemetry; routes the batch to the central `AuditLogIngestActor` proxy and returns the accepted `EventId`s. (The production *push* path is still ClusterClient via `ClusterClientSiteAuditClient`; this RPC is the gRPC-receiving counterpart.) + - `IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck)` — ingest receiving surface for the combined cached-call telemetry packet (audit row + `SiteCalls` operational upsert written in one transaction). + - `PullAuditEvents(PullAuditEventsRequest) returns (PullAuditEventsResponse)` — central→site **reconciliation pull** for the Audit Log self-heal feed; the site serves `Pending`/`Forwarded` rows from its `ISiteAuditQueue`. + - `PullSiteCalls(PullSiteCallsRequest) returns (PullSiteCallsResponse)` — central→site reconciliation pull for the Site Call Audit (#22) self-heal feed; the site serves operation-tracking rows changed since a cursor from its `IOperationTrackingStore`. A separate RPC from `PullAuditEvents` because the tracking store is the operational source of truth, distinct from the site audit queue. +- **Messages**: `InstanceStreamRequest` (correlation_id, instance_unique_name), `SiteStreamEvent` (correlation_id, oneof event: `AttributeValueUpdate`, `AlarmStateUpdate`); `AuditEventDto`/`AuditEventBatch`/`IngestAck` for ingest; `CachedTelemetryPacket`/`CachedTelemetryBatch` (each packet pairing an `AuditEventDto` with a `SiteCallOperationalDto`); `PullAuditEventsRequest`/`PullAuditEventsResponse` and `PullSiteCallsRequest`/`PullSiteCallsResponse` (each request carries `since_utc` + `batch_size`; each response carries `more_available` to signal a saturated batch). - The `oneof event` pattern is extensible — future event types (health metrics, connection state changes) are added as new fields without breaking existing consumers. -- Proto field numbers are never reused. Old clients ignore unknown `oneof` variants. +- Proto field numbers are never reused; new RPCs and message fields are appended additively. Old clients ignore unknown `oneof` variants. #### Enriched AlarmStateUpdate (Native Alarm Mirror) -`AlarmStateUpdate` carries the read-only native alarm mirror (Computed, native OPC UA, and native MxAccess Gateway alarms) to central over the **existing gRPC real-time stream** — no new transport, no command/control round-trip. The message was extended **additively**: existing fields 1–7 are unchanged, and fields 8–21 carry the enriched native-alarm state. Old clients that only read fields 1–7 continue to work; new fields are populated only where the source provides them. +`AlarmStateUpdate` carries the read-only native alarm mirror (Computed, native OPC UA, and native MxAccess Gateway alarms) to central over the **existing gRPC real-time stream** — no new transport, no command/control round-trip. The message was extended **additively**: existing fields 1–7 are unchanged, and fields 8–23 carry the enriched native-alarm state. Old clients that only read fields 1–7 continue to work; new fields are populated only where the source provides them. | Field | # | Type | Meaning | |-------|---|------|---------| @@ -107,11 +112,14 @@ The streaming protocol is defined in `sitestream.proto` (`src/ZB.MOM.WW.ScadaBri | `original_raise_time` | 19 | Timestamp | First-raise time of the underlying condition (nullable on the wire). | | `current_value` | 20 | string | Current process value associated with the alarm. | | `limit_value` | 21 | string | Limit / setpoint value that the alarm evaluates against. | +| `native_source_canonical_name` | 22 | string | Native binding canonical name; empty for computed alarms. | +| `is_configured_placeholder` | 23 | bool | Marks a quiet-binding placeholder row. **Snapshot-only** — see the relay note below; on the live gRPC stream this is always `false`. | - **Server-side mapping (`StreamRelayActor.HandleAlarmStateChanged`)**: maps the enriched domain `AlarmStateChanged` event — `Kind` + `AlarmConditionState` + native metadata — out to the proto `AlarmStateUpdate`. The nullable `original_raise_time` is emitted only when present, and `shelve_state` is mapped from the domain shelve enum to its wire string via a new **`AlarmShelveStateCodec`** (string↔enum, defaulting to `Unshelved`). The domain `Confirmed` (`bool?`) is collapsed to a definite bool for field 11. -- **Client-side mapping (`SiteStreamGrpcClient.ConvertToDomainEvent`)**: reconstructs the domain `AlarmStateChanged` from the proto — `Kind` is parsed via `ParseAlarmKind`, the `Condition` is rebuilt with `severity` taken from the existing wire `priority`, and native metadata is repopulated from fields 8–21 — so central-side consumers receive the same domain event the site emitted. +- **Placeholder rows are dropped at the relay**: `is_configured_placeholder` (field 23) is a **Debug View snapshot-only** concept emitted by `InstanceActor.BuildAlarmStatesSnapshot` for quiet bindings — it is never a real alarm transition (its timestamp may be `DateTimeOffset.MinValue`, the Protobuf `Timestamp` lower boundary). `StreamRelayActor.HandleAlarmStateChanged` therefore returns early — **never relaying a placeholder row to the live gRPC stream** — so field 23 is always `false` on the live stream and only ever carries `true` in the snapshot path. +- **Client-side mapping (`SiteStreamGrpcClient.ConvertToDomainEvent`)**: reconstructs the domain `AlarmStateChanged` from the proto — `Kind` is parsed via `ParseAlarmKind`, the `Condition` is rebuilt with `severity` taken from the existing wire `priority`, and native metadata is repopulated from fields 8–23 (`native_source_canonical_name` → `NativeSourceCanonicalName`, `is_configured_placeholder` → `IsConfiguredPlaceholder`) — so central-side consumers receive the same domain event the site emitted. -> **Regeneration is manual (macOS-only).** `sitestream.proto` is **not** auto-compiled: the `` include is commented out in the `.csproj`, and the generated C# is **vendored** under `SiteStreamGrpc/`. To regenerate after editing the proto: toggle the `` include on, build so `Grpc.Tools` regenerates the C#, copy the generated files into `SiteStreamGrpc/`, then re-comment the include. Adding fields 8–21 followed this process. +> **Regeneration is manual (macOS-only).** `sitestream.proto` is **not** auto-compiled: the `` include is commented out in the `.csproj`, and the generated C# is **vendored** under `SiteStreamGrpc/`. To regenerate after editing the proto: toggle the `` include on, build so `Grpc.Tools` regenerates the C#, copy the generated files into `SiteStreamGrpc/`, then re-comment the include. Adding `AlarmStateUpdate` fields 8–23 and the four unary RPCs (`IngestAuditEvents`, `IngestCachedTelemetry`, `PullAuditEvents`, `PullSiteCalls`) plus their message types followed this process. #### gRPC Connection Keepalive @@ -161,9 +169,9 @@ Keepalive settings are configurable via `CommunicationOptions`: - **Pattern**: Fire-and-forget telemetry with a periodic reconciliation pull. - The site **Store-and-Forward Engine** emits a `CachedCallTelemetry` message to central on **every** cached-call lifecycle transition (`Pending → Retrying → Delivered / Parked / Failed / Discarded`). The first telemetry event for an operation carries its initial status — `Pending` when a transient failure has buffered the call, or directly `Delivered`/`Failed` for a cached call that never buffers. The message carries the `TrackedOperationId`, source site, `Kind` (the `TrackedOperationKind` enum), target summary, status, retry count, last error, key timestamps, and source provenance. - Emission is **best-effort and at-least-once**, **idempotent on `TrackedOperationId`** — central's Site Call Audit component ingests with insert-if-not-exists then upsert-on-newer-status, so a re-sent or out-of-order event is harmless. -- **Reconciliation pull**: because telemetry is best-effort, the central **Site Call Audit** component periodically — and on site reconnect — issues a `CachedCallReconcileRequest` to each site; the site replies with a `CachedCallReconcileResponse` carrying all tracking rows changed since a cursor. Any telemetry missed during a disconnect self-heals through this pull. +- **Reconciliation pull**: because telemetry is best-effort, the central **Site Call Audit** component periodically — and on site reconnect — pulls the changed rows back from each site over the **`PullSiteCalls` unary gRPC RPC** on `SiteStreamService` (not a ClusterClient round-trip). Central sends a `PullSiteCallsRequest` (`since_utc` cursor + `batch_size`); the site reads its `IOperationTrackingStore` and replies with a `PullSiteCallsResponse` carrying the matching operation-tracking rows (as `SiteCallOperationalDto`s) plus a `more_available` flag that signals a saturated batch so central advances the cursor and pulls again. Any telemetry missed during a disconnect self-heals through this pull. The Audit Log (#23) reconciliation feed uses the sibling `PullAuditEvents` RPC the same way. - Central audit is an **eventually-consistent mirror** — the site's operation tracking table remains the source of truth for cached-call status (`Tracking.Status(id)` is always answered site-locally). -- **Transport**: ClusterClient (site→central command/control), consistent with how other site→central messages are sent. +- **Transport**: the *push* telemetry emission rides **ClusterClient** (site→central command/control), consistent with how other site→central messages are sent; the *reconciliation pull* rides the **gRPC** unary `PullSiteCalls` RPC (central→site request/response). The two paths are complementary — push is the fast, best-effort feed; pull is the slower self-heal backfill. ## Topology @@ -251,7 +259,7 @@ Each request/response pattern has a default timeout that can be overridden in co | 5. Recipe/Command Delivery | 30 seconds | Fire-and-forget with ack | | 8. Remote Queries | 30 seconds | Querying parked messages or event logs | | 9. Notification Submission | 30 seconds | Fire-and-forget with ack; central acks after persisting the row | -| 10. Cached Call Telemetry | 30 seconds | Reconciliation pull is request/response; telemetry emission itself is fire-and-forget | +| 10. Cached Call Telemetry | 30 seconds | Telemetry emission (ClusterClient) is fire-and-forget; the reconciliation pull is the unary gRPC `PullSiteCalls` request/response (its deadline is the gRPC call timeout, not the Akka ask) | Timeouts use the Akka.NET **ask pattern**. If no response is received within the timeout, the caller receives a timeout failure. @@ -302,6 +310,9 @@ Disconnect is detected at the **transport layer**, never via an application-leve - **Cluster Infrastructure**: Manages node roles and failover detection. - **Configuration Database**: Provides site node addresses (NodeAAddress, NodeBAddress for Akka remoting; GrpcNodeAAddress, GrpcNodeBAddress for gRPC streaming) for address resolution. - **Site Runtime (SiteStreamManager)**: The SiteStreamGrpcServer subscribes to SiteStreamManager to receive real-time events for gRPC delivery. +- **`ISiteAuditQueue` (site-local)**: Handed to `SiteStreamGrpcServer` (post-construction, on site roles) so the `PullAuditEvents` RPC can read the site's `Pending`/`Forwarded` audit rows to serve the Audit Log (#23) reconciliation pull. Null when not wired (central-only host) — the handler then returns an empty response. +- **`IOperationTrackingStore` (site-local)**: Handed to `SiteStreamGrpcServer` (post-construction, on site roles) so the `PullSiteCalls` RPC can read operation-tracking rows changed since a cursor to serve the Site Call Audit (#22) reconciliation pull. Null when not wired — the handler returns an empty response. +- **`AuditLogIngestActor` proxy (central)**: Handed to `SiteStreamGrpcServer` after the central cluster singleton starts; the `IngestAuditEvents` / `IngestCachedTelemetry` RPCs route ingested batches to it. Null when not yet wired — the handler returns an empty `IngestAck` so the caller treats it as transient and retries. ## Interactions @@ -309,7 +320,8 @@ Disconnect is detected at the **transport layer**, never via an application-leve - **Site Runtime**: Receives deployments, lifecycle commands, and artifact updates. Provides debug view data. - **Central UI**: Debug view requests and remote queries flow through communication. - **Health Monitoring**: Receives periodic health reports from sites. -- **Store-and-Forward Engine (site)**: Parked message queries/commands are routed through communication. Also emits `CachedCallTelemetry` and answers `CachedCallReconcileRequest` pulls, and receives relayed `RetryParkedOperation` / `DiscardParkedOperation` commands. -- **Site Call Audit (central)**: Receives cached-call telemetry and reconciliation responses; issues reconciliation pulls and relays parked-operation Retry/Discard commands to sites through communication. +- **Store-and-Forward Engine (site)**: Parked message queries/commands are routed through communication. Also emits `CachedCallTelemetry` (push, ClusterClient) and serves the `PullSiteCalls` gRPC reconciliation pull from its `IOperationTrackingStore`, and receives relayed `RetryParkedOperation` / `DiscardParkedOperation` commands. +- **Site Call Audit (central)**: Receives cached-call telemetry and issues the `PullSiteCalls` gRPC reconciliation pulls to sites; relays parked-operation Retry/Discard commands to sites through communication. +- **Audit Log (#23)**: Sites forward audit-event telemetry (push) and serve the `PullAuditEvents` gRPC reconciliation pull from their `ISiteAuditQueue`; the central `AuditLogIngestActor` is the ingest target for both the push path and the combined cached-call telemetry packet. - **Site Event Logging**: Event log queries are routed through communication. - **Management Service**: The ManagementActor is registered with ClusterClientReceptionist on central nodes. The CLI communicates with the ManagementActor via ClusterClient, which is a separate channel from inter-cluster remoting. diff --git a/docs/requirements/Component-HealthMonitoring.md b/docs/requirements/Component-HealthMonitoring.md index e4277e0f..26628bc5 100644 --- a/docs/requirements/Component-HealthMonitoring.md +++ b/docs/requirements/Component-HealthMonitoring.md @@ -116,4 +116,4 @@ These tiles are **point-in-time** like the Notification Outbox and Site Call Aud - **Central UI**: Health Monitoring Dashboard displays aggregated metrics. - **Communication Layer**: Health reports flow as periodic messages. -- **KPI History (#26)**: emits `IKpiSampleSource` (`SiteHealthKpiSampleSource`, per-Site) consumed by the KpiHistory recorder (#26). It reads the in-memory `ICentralHealthAggregator.GetAllSiteStates()` (no DB), turning the per-site snapshot — previously sequence-numbered every 30s but discarded — into trends (`connectionsUp`/`connectionsDown`, `scriptErrors`, `alarmEvalErrors`, `sfBufferDepth`, `deadLetters`, `parkedMessages`, `deployedInstances`/`enabledInstances`/`disabledInstances`, `auditBacklogPending`, `eventLogWriteFailures`) rendered in the dashboard's per-site `KpiTrendChart` panel. See [Component-KpiHistory.md](Component-KpiHistory.md). +- **KPI History (#26)**: emits `IKpiSampleSource` (`SiteHealthKpiSampleSource`, per-Site) consumed by the KpiHistory recorder (#26). It reads the in-memory `ICentralHealthAggregator.GetAllSiteStates()` (no DB), turning the per-site snapshot — previously sequence-numbered every 30s but discarded — into 12 sampled `KpiSample` metrics per site, persisted every minute: `connectionsUp`/`connectionsDown`, `scriptErrors`, `alarmEvalErrors`, `sfBufferDepth`, `deadLetters`, `parkedMessages`, `deployedInstances`/`enabledInstances`/`disabledInstances`, `auditBacklogPending`, `eventLogWriteFailures`. Of these, **only four are charted** in the dashboard's per-site `KpiTrendChart` panel — `connectionsDown`, `scriptErrors`, `sfBufferDepth`, and `deadLetters` (the metrics carried in the shared `KpiMetrics.SiteHealth` Commons catalog and fetched by `Health.razor`'s trend selector). The remaining eight are **sampled and persisted but not (yet) charted** — they are retained in the `KpiSample` store and available for future trend surfaces or ad-hoc query. The synthetic central self-report (`CentralHealthReportLoop.CentralSiteId = "$central"`, fed by the leader-only report loop via `CollectReport($central)`) appears in `GetAllSiteStates()` and is **intentionally sampled into the per-Site KPI store as a real `KpiScopes.Site` series** keyed `$central`; the Central UI deliberately **pins `$central` first** in the per-site trend selector (rendered as "Central Cluster"), so the central cluster's own trends sit alongside the real sites'. (`$central`'s zero-valued connection/instance/S&F metrics reflect the central self-report carrying no site-runtime data, not a defect.) See [Component-KpiHistory.md](Component-KpiHistory.md). diff --git a/docs/requirements/Component-Host.md b/docs/requirements/Component-Host.md index 17e56706..2f924366 100644 --- a/docs/requirements/Component-Host.md +++ b/docs/requirements/Component-Host.md @@ -184,6 +184,8 @@ The Host's `Program.cs` calls these extension methods; the component libraries o | NotificationService | Yes | No | Yes | Yes | No | | NotificationOutbox | Yes | No | Yes | Yes | No | | SiteCallAudit | Yes | No | Yes | Yes | No | +| KpiHistory | Yes | No | Yes | Yes | No | +| Transport | Yes | No | Yes | No | No | | TemplateEngine | Yes | No | Yes | Yes | No | | DeploymentManager | Yes | No | Yes | Yes | No | | Security | Yes | No | Yes | Yes | No | diff --git a/docs/requirements/Component-ScriptAnalysis.md b/docs/requirements/Component-ScriptAnalysis.md index 34177810..d31dd888 100644 --- a/docs/requirements/Component-ScriptAnalysis.md +++ b/docs/requirements/Component-ScriptAnalysis.md @@ -73,12 +73,20 @@ The identifiers `dynamic` and `Activator` are forbidden at any scope, as they pr **Pass 1 — semantic symbol resolution (adapted from Site Runtime)** -- Builds a Roslyn compilation using the full trusted-platform reference set from `ScriptTrustPolicy.DefaultReferences` (plus any `extraReferences`). +- Builds a Roslyn compilation using `ScriptTrustPolicy.AnalysisReferences` (plus any `extraReferences`). - For each identifier in the syntax tree, resolves the underlying symbol to its fully qualified containing namespace and type name. - Flags any symbol whose containing namespace or type matches a forbidden scope in `ScriptTrustPolicy.ForbiddenScopes`, taking `AllowedExceptions` into account. - Correctly handles aliases (`using X = System.IO.File`), `using static`, and `global::` prefixes — the resolved symbol is checked, not the spelling. - Because the full reference set is loaded, this pass also catches a forbidden type accessed inside an otherwise-allowed namespace (e.g., bare `Process` after `using System.Diagnostics;`). +##### `AnalysisReferences` vs `DefaultReferences` + +The two reference sets are deliberately distinct and must not be conflated: + +- **`DefaultReferences`** — the **minimal, runtime-fidelity** set (built from `DefaultAssemblies`: CoreLib, LINQ, Math, the C# runtime binder, and the Commons API-surface assembly). It is consumed by `RoslynScriptCompiler` (the compile gate) and must mirror exactly what the site runtime compiles/executes against. It deliberately does **not** reference the forbidden-API anchor assemblies (`System.Diagnostics.Process.dll`, `System.Net.Sockets.dll`, …) so that a forbidden type remains an *undefined symbol* at compile time — the compile gate then independently rejects it, providing a second layer of defence. This set must stay minimal. +- **`AnalysisReferences`** — the **full-framework** set used *only* by `ScriptTrustValidator`'s Pass 1. It is built from `AppContext.GetData("TRUSTED_PLATFORM_ASSEMBLIES")` (the TPA list of the host) so that *every* type a script names resolves to its true namespace and is judged authoritatively. Enriching the analysis set can only *improve* detection (the verdict is by namespace/type, never a false allow), which is why the Central UI run gate may safely forward its full compilation reference surface as `extraReferences`. +- **TPA-fallback behaviour** — on a host that does not publish the TPA list (single-file, AOT, or trimmed deployment), `AnalysisReferences` falls back to `DefaultReferences` **enriched with `ForbiddenAnchorAssemblies`** (the assemblies that host the forbidden-API types). This keeps the documented forbidden anchors — notably bare `Process` inside the allowed `System.Diagnostics` namespace — resolvable, so the semantic pass stays authoritative even in the degraded mode. The fallback is **not silent**: `ScriptTrustPolicy.AnalysisReferencesDegraded` is set to `true` and a warning is emitted via `System.Diagnostics.Trace` so operators and tests can detect the weakened mode. + **Pass 2 — syntactic reflection-gateway and identifier hardening (adapted from Inbound API)** - Walks the syntax tree for member-access expressions and simple name references. @@ -102,12 +110,12 @@ Violations from both passes are merged and deduplicated before being returned. #### `Compile(string code, Type? globalsType = null, IEnumerable? extraReferences = null, IEnumerable? extraImports = null)` - Creates a `CSharpScript` with the given code, `globalsType`, references (defaults from `ScriptTrustPolicy.DefaultReferences` plus `extraReferences`), and imports (defaults from `ScriptTrustPolicy.DefaultImports` plus `extraImports`). -- Calls `.Compile()` and returns the resulting `Diagnostic[]` filtered to errors and warnings. +- Calls `.Compile()` and returns the resulting `Diagnostic[]` filtered to **error-severity diagnostics only**. This is a compile *gate*: a warning must not block a deploy, so only errors (undefined symbols, type mismatches) are surfaced to callers as gate failures. - Each caller passes its own `globalsType` — `ScriptCompileSurface` for the design-time deploy gate, the real `ScriptGlobals` for Site Runtime execution, `null` for pure syntax checks. #### `ParseDiagnostics(string code)` -- Parses the script text using Roslyn's `CSharpSyntaxTree.ParseText` and returns syntax-level diagnostics (errors and warnings). +- Parses the script text using Roslyn's `CSharpSyntaxTree.ParseText` and returns **error-severity** syntax-level diagnostics only (consistent with `Compile` — warnings do not fail the gate). - No compilation is performed — useful for fast syntax checks where no globals type is available. --- diff --git a/docs/requirements/Component-SiteCallAudit.md b/docs/requirements/Component-SiteCallAudit.md index 5c8d96e1..5ed06a60 100644 --- a/docs/requirements/Component-SiteCallAudit.md +++ b/docs/requirements/Component-SiteCallAudit.md @@ -148,7 +148,11 @@ configurable window (default 365 days), matching the `Notifications` purge. active/standby failover. - **KPI History (#26)**: emits `IKpiSampleSource` (`SiteCallAuditKpiSampleSource`, Global + per-Site + per-Node) consumed by the - KpiHistory recorder (#26), reusing the existing KPI reads; the resulting + KpiHistory recorder (#26), reusing the existing KPI reads. All six metrics — `buffered` / `parked` / `failedLastInterval` / `deliveredLastInterval` / - `stuck` / `oldestPendingAgeSeconds` series render as trends on the Site Calls - page via `KpiTrendChart`. See [Component-KpiHistory.md](Component-KpiHistory.md). + `stuck` / `oldestPendingAgeSeconds` — are sampled into the `KpiSample` history + store, but only the three charted via the public `KpiMetrics.SiteCallAudit` + catalog (`buffered` / `parked` / `failedLastInterval`) render as trends on the + Site Calls page via `KpiTrendChart`; `deliveredLastInterval` / `stuck` / + `oldestPendingAgeSeconds` are sampled-but-not-yet-charted (available for future + trend panels / ad-hoc query). See [Component-KpiHistory.md](Component-KpiHistory.md). diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogIngestActor.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogIngestActor.cs index 7acb3394..8b800e61 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogIngestActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogIngestActor.cs @@ -11,12 +11,14 @@ using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase; namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// -/// Central-side singleton (per Bundle E wiring) that ingests batches of +/// Central-side cluster singleton that ingests batches of /// rows pushed from sites via the /// IngestAuditEvents gRPC RPC. Each row is stamped with the central-side -/// the central-side IngestedAtUtc (in DetailsJson) and inserted idempotently via -/// — duplicates are -/// silently swallowed (first-write-wins per Bundle A's hardening). +/// ingest timestamp into DetailsJson (there is no promoted IngestedAtUtc +/// column — the value is a DetailsJson field set via +/// ) and inserted idempotently +/// via — duplicates are +/// silently swallowed (first-write-wins). /// /// /// @@ -25,23 +27,24 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// consistent and the site is free to flip its local row to Forwarded. /// /// -/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing -/// action. The actor wraps each repository call in its own try/catch so a -/// single bad row cannot cause the rest of the batch to be lost — that -/// per-row catch is what keeps this actor alive across handler throws, not -/// the supervisor strategy. The override -/// returns the Akka default decider (Restart for most exceptions) and -/// governs children only; this actor has no children today, so the override -/// is a forward-compat placeholder. +/// Audit-write failures must NEVER abort the user-facing action. The actor +/// wraps each repository call in its own try/catch so a single bad row cannot +/// cause the rest of the batch to be lost, and it guards scope/repository +/// resolution so a transient DI fault cannot restart the singleton — those +/// catches are what keep this actor alive across handler throws, not the +/// supervisor strategy. The override returns +/// the Akka default decider (Restart for most exceptions) and governs children +/// only; this actor has no children today, so the override is a forward-compat +/// placeholder. /// /// -/// Two constructors exist for a deliberate reason: Bundle D's tests inject a +/// Two constructors exist for a deliberate reason: the test ctor injects a /// concrete against a per-test MSSQL fixture -/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency -/// end to end), while Bundle E's host wiring registers the actor as a cluster -/// singleton and must therefore resolve the repository — which is a scoped EF -/// Core service — from a fresh DI scope per message. Mirroring the Notification -/// Outbox actor's pattern. +/// (the only way to verify the ingest-timestamp stamp + duplicate-key +/// idempotency end to end), while the production host wiring registers the +/// actor as a cluster singleton and must therefore resolve the repository — +/// which is a scoped EF Core service — from a fresh DI scope per message. +/// Mirroring the Notification Outbox actor's pattern. /// /// public class AuditLogIngestActor : ReceiveActor @@ -53,7 +56,7 @@ public class AuditLogIngestActor : ReceiveActor /// /// Test-mode constructor — injects a concrete repository instance whose /// lifetime exceeds the test, so the actor reuses the same instance across - /// every message. Used by Bundle D's MSSQL-backed TestKit fixture. + /// every message. Used by the MSSQL-backed TestKit fixture. /// /// Audit log repository instance shared across all messages. /// Logger for ingest diagnostics. @@ -116,13 +119,12 @@ public class AuditLogIngestActor : ReceiveActor // Resolve the repository for the whole batch — one DbContext per // message, mirroring NotificationOutboxActor. The injected-repository - // mode (Bundle D tests) skips the scope entirely. - // Bundle C (M5-T6): the IAuditRedactor is also resolved from the - // per-message scope when one is available so the row is truncated + - // redacted before InsertIfNotExistsAsync. The single-repository test - // ctor has no service provider — it falls through with no redactor, - // which preserves the small-payload assumptions baked into the - // existing D2 fixtures. + // mode (test ctor) skips the scope entirely. + // The IAuditRedactor is also resolved from the per-message scope when + // one is available so the row is truncated + redacted before + // InsertIfNotExistsAsync. The single-repository test ctor has no + // service provider — it falls through with no redactor, which preserves + // the small-payload assumptions baked into the existing fixtures. // AuditLog-003: use CreateAsyncScope + await using so scoped EF Core // services (IAsyncDisposable DbContexts) dispose asynchronously // without blocking on sync Dispose() of pending connection cleanup. @@ -133,15 +135,42 @@ public class AuditLogIngestActor : ReceiveActor } else { - await using var scope = _serviceProvider!.CreateAsyncScope(); - var repository = scope.ServiceProvider.GetRequiredService(); - var redactor = scope.ServiceProvider.GetService(); - // M6 Bundle E (T8): central health counter is best-effort — - // unregistered (test composition roots) means the per-row catch - // simply logs without surfacing on the health dashboard. - var failureCounter = scope.ServiceProvider.GetService(); - await IngestWithRepositoryAsync(repository, redactor, failureCounter, cmd, nowUtc, accepted) - .ConfigureAwait(false); + // AuditLog-014: guard scope-creation + repository resolution in a + // try/catch, mirroring OnCachedTelemetryAsync. A transient DI / + // DbContext-factory fault (pooled-context init, SQL-connection + // exhaustion, a resolution race during host churn) would otherwise + // propagate out of the ReceiveAsync handler, trip the parent's + // supervision, and RESTART this central singleton over a transient + // fault — dropping the captured reply so the site's Ask times out. + // Best-effort audit must never wedge the singleton: log, optionally + // bump the failure counter, and still reply with whatever was + // accepted (empty on an up-front scope-resolution throw) so the + // site keeps its rows Pending and retries on the next drain. + try + { + await using var scope = _serviceProvider!.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + var redactor = scope.ServiceProvider.GetService(); + // M6 Bundle E (T8): central health counter is best-effort — + // unregistered (test composition roots) means the per-row catch + // simply logs without surfacing on the health dashboard. + var failureCounter = scope.ServiceProvider.GetService(); + await IngestWithRepositoryAsync(repository, redactor, failureCounter, cmd, nowUtc, accepted) + .ConfigureAwait(false); + } + catch (Exception ex) + { + // Scope creation or a required-service resolution threw before + // (or while) processing the batch. Surface a sustained fault on + // the dashboard if the counter is registered, but never let the + // throw escape the handler and restart the singleton. + try { _serviceProvider!.GetService()?.Increment(); } + catch { /* counter must never throw — defence in depth */ } + _logger.LogError( + ex, + "Audit event batch ingest failed before/while resolving the repository scope; replying with {Accepted} accepted row(s). The site keeps unaccepted rows Pending and retries on the next drain.", + accepted.Count); + } } replyTo.Tell(new IngestAuditEventsReply(accepted)); @@ -159,10 +188,10 @@ public class AuditLogIngestActor : ReceiveActor { try { - // Stamp IngestedAtUtc here, not at the site. Bundle A's - // repository hardening already swallows duplicate-key races, - // so the same id arriving twice (site retry, reconciliation) - // is a silent no-op. + // Stamp the ingest timestamp here, not at the site. The + // repository's duplicate-key hardening already swallows + // duplicate-key races, so the same id arriving twice (site + // retry, reconciliation) is a silent no-op. // Redact BEFORE the IngestedAtUtc stamp so the redacted // copy carries the central-side ingest timestamp. The redactor // is contract-bound to never throw. AuditLog-008: a null diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs index 5a0906cf..4c931cd4 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs @@ -1,3 +1,5 @@ +using Microsoft.Extensions.Configuration; + namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// @@ -37,6 +39,14 @@ public sealed class AuditLogPurgeOptions /// a large backlog within a tick. Clamped to a sane minimum in /// . /// + /// + /// AuditLog-013: the operator-facing config key is ChannelPurgeBatchSize + /// (per Component-AuditLog.md), so the binder maps that documented key onto this + /// backing property via . The unattributed + /// property name (ChannelPurgeBatchSizeConfigured) would otherwise have been + /// the bind key, silently ignoring the documented section. + /// + [ConfigurationKeyName("ChannelPurgeBatchSize")] public int ChannelPurgeBatchSizeConfigured { get; set; } = 5000; /// diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Observability/ScadaBridgeTelemetry.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Observability/ScadaBridgeTelemetry.cs index 1b382bab..a1aafc55 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Commons/Observability/ScadaBridgeTelemetry.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Observability/ScadaBridgeTelemetry.cs @@ -91,4 +91,29 @@ public static class ScadaBridgeTelemetry Volatile.Write(ref _queueDepthProvider, provider); } + + /// + /// Clears the StoreAndForward queue-depth provider, but only if the currently + /// registered provider is the exact delegate passed in + /// (reference-equal compare-and-clear). This lets a StoreAndForward service deregister + /// its own provider on graceful stop without stomping a newer instance that already + /// re-registered into the process-global slot: if a late stop of the old instance + /// passes its (now-superseded) delegate, the identity check fails and the newer + /// provider is preserved. After a successful clear the gauge falls back to reporting 0. + /// Mirrors 's signature and + /// access pattern. + /// + /// The provider delegate to remove; ignored unless it is the + /// one currently registered. + public static void ClearQueueDepthProvider(Func provider) + { + if (provider is null) + { + return; + } + + // Compare-and-clear: only null the slot when it still holds the caller's + // delegate, so a stale stop cannot clobber a successor's provider. + Interlocked.CompareExchange(ref _queueDepthProvider, null, provider); + } } diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Types/Kpi/KpiSeriesBucketer.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Types/Kpi/KpiSeriesBucketer.cs index 9eb512bd..b5fafbf1 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Commons/Types/Kpi/KpiSeriesBucketer.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Types/Kpi/KpiSeriesBucketer.cs @@ -16,8 +16,12 @@ public static class KpiSeriesBucketer /// Empty buckets are omitted — no gap-filling. /// /// - /// Input series, assumed to be sorted ascending by . - /// If not sorted, the point with the largest timestamp within each bucket is selected. + /// Input series, which must be sorted ascending by . + /// For sorted input the last point in iteration order within a bucket is the one with the + /// largest timestamp (the intended last-value-per-bucket result). For unsorted input the + /// method still selects the last point in iteration order within each bucket — it does + /// not pick the largest-timestamp point — so the result is well-defined but not + /// the last-value semantics callers expect; pre-sort the series first. /// If null or empty, an empty list is returned. /// /// UTC start of the query window (inclusive). @@ -85,8 +89,12 @@ public static class KpiSeriesBucketer if (bucketIndex >= maxPoints) bucketIndex = maxPoints - 1; - // Keep the point with the highest timestamp in this bucket - // (last-value semantics; if ties, keep first encountered — stable). + // Keep the last point in iteration order within this bucket. Because the stored + // candidate's BucketStartUtc is the bucket-START timestamp (not the raw point's + // capture time), the comparison below is true for essentially any in-bucket point, + // so each later-in-iteration point overwrites the previous one. For the ascending- + // sorted input this method requires, last-in-iteration IS the largest-timestamp + // point — i.e. last-value-per-bucket semantics. if (!occupied[bucketIndex] || point.BucketStartUtc > best[bucketIndex].BucketStartUtc) { diff --git a/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs b/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs index add1faad..1a3b0519 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs @@ -54,9 +54,17 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository /// public async Task GetCurrentDeploymentStatusAsync(int instanceId, CancellationToken cancellationToken = default) { + // DeploymentManager-026: deployments are insert-only (one row per deploy + // attempt), so two records for the same instance can tie on DeployedAt when + // they are created within the same clock tick (a rapid redeploy, or a + // redeploy immediately after a timed-out attempt). SQL Server's choice + // between equal sort keys is undefined, so reconciliation could read the + // wrong "current" record. ThenByDescending(d => d.Id) makes the read + // deterministic — the highest Id (the most recently inserted row) wins. return await _dbContext.DeploymentRecords .Where(d => d.InstanceId == instanceId) .OrderByDescending(d => d.DeployedAt) + .ThenByDescending(d => d.Id) .FirstOrDefaultAsync(cancellationToken); } diff --git a/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs b/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs index 0cdd89ef..4d501750 100644 --- a/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs @@ -99,7 +99,12 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers // routed to subscribers (NativeAlarmActors) by source-object reference. /// sourceReference → set of subscriber actor refs (NativeAlarmActors), for routing + ref-count. private readonly Dictionary> _alarmSourceSubscribers = new(); - /// sourceReference → raw condition filter string passed to the adapter (first subscriber wins). + /// + /// sourceReference → raw condition filter string passed to the adapter (last subscriber wins). + /// The shared feed carries a single filter: overwrites it + /// unconditionally on every subscribe, so co-subscribers to one source reference must agree on + /// the condition filter (a second subscriber's filter re-gates the first subscriber's transitions). + /// private readonly Dictionary _alarmSourceFilter = new(); /// /// sourceReference → parsed condition-type predicate (M2.4 / #8). The authoritative @@ -1791,6 +1796,30 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers { _alarmSubscribesInFlight.Remove(msg.SourceReference); + // DataConnectionLayer-023: the last (or only) subscriber may have been + // unsubscribed while this alarm subscribe was in flight. HandleUnsubscribeAlarms + // emptied/removed _alarmSourceSubscribers for the source but could not tear down + // the adapter feed because the subscription id was not stored yet. Mirror the + // DCL-021 tag-path guard: if no subscriber remains, release the just-created + // adapter feed instead of storing an orphaned subscription id that would stream + // transitions to nobody for the lifetime of the adapter. + if (!_alarmSourceSubscribers.ContainsKey(msg.SourceReference)) + { + if (msg.Success && msg.SubscriptionId != null && + _adapter is IAlarmSubscribableConnection alarmable) + { + _log.Warning( + "[{0}] AlarmSubscribeCompleted arrived for source {1} but the last " + + "subscriber unsubscribed while the subscribe was in flight; releasing " + + "the orphaned adapter alarm feed.", + _connectionName, msg.SourceReference); + _ = alarmable.UnsubscribeAlarmsAsync(msg.SubscriptionId); + } + + // No live requester remains to receive a response. + return; + } + if (msg.Success && msg.SubscriptionId != null) { _alarmSubscriptionIds[msg.SourceReference] = msg.SubscriptionId; @@ -1874,6 +1903,11 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers _alarmSourceSubscribers.Remove(request.SourceReference); _alarmSourceFilter.Remove(request.SourceReference); _alarmSourceFilterPredicate.Remove(request.SourceReference); + // DataConnectionLayer-023: clear the in-flight marker so that if an adapter + // subscribe is still in flight for this source, the late AlarmSubscribeCompleted + // is recognized as orphaned (its guard checks _alarmSourceSubscribers, now empty) + // and the just-created feed is released rather than stored and leaked. + _alarmSubscribesInFlight.Remove(request.SourceReference); if (_alarmSubscriptionIds.Remove(request.SourceReference, out var subId) && _adapter is IAlarmSubscribableConnection alarmable) { diff --git a/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Adapters/MxGatewayDataConnection.cs b/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Adapters/MxGatewayDataConnection.cs index 571c74f4..aaac3ea2 100644 --- a/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Adapters/MxGatewayDataConnection.cs +++ b/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Adapters/MxGatewayDataConnection.cs @@ -277,6 +277,19 @@ public class MxGatewayDataConnection : IDataConnection, IBrowsableDataConnection public async ValueTask DisposeAsync() { _eventLoopCts?.Cancel(); + // DataConnectionLayer-025: the DataConnectionActor disposes adapters + // fire-and-forget on failover/stop without necessarily calling + // DisconnectAsync first, so tear down the alarm stream here too — otherwise + // the long-running RunAlarmStreamAsync task and its CTS leak on every + // MxGateway failover/teardown that goes through DisposeAsync. Mirror the + // lock-guarded block already in DisconnectAsync. + lock (_alarmLock) + { + _alarmCts?.Cancel(); + _alarmCts?.Dispose(); + _alarmCts = null; + _alarmSubCount = 0; + } if (_client is not null) await _client.DisposeAsync(); GC.SuppressFinalize(this); diff --git a/src/ZB.MOM.WW.ScadaBridge.DeploymentManager/ArtifactDeploymentService.cs b/src/ZB.MOM.WW.ScadaBridge.DeploymentManager/ArtifactDeploymentService.cs index 19d6828c..ae218453 100644 --- a/src/ZB.MOM.WW.ScadaBridge.DeploymentManager/ArtifactDeploymentService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.DeploymentManager/ArtifactDeploymentService.cs @@ -12,8 +12,16 @@ namespace ZB.MOM.WW.ScadaBridge.DeploymentManager; /// /// WP-7: System-wide artifact deployment. -/// Broadcasts artifacts (shared scripts, external systems, notification lists, DB connections, -/// data connections, and SMTP configurations) to all sites with per-site tracking. +/// Broadcasts artifacts (shared scripts, external systems, DB connections, and +/// data connections) to all sites with per-site tracking. +/// +/// Notification lists and SMTP configuration are deliberately NOT shipped to +/// sites: notification delivery is central-only (sites store-and-forward to +/// central and never talk to SMTP), so no notification artifact or SMTP +/// credential is ever distributed to a site. The +/// still carries the +/// NotificationLists/SmtpConfigurations fields for additive +/// message-contract compatibility, but central never populates them. /// /// - Successful sites are NOT rolled back on other failures. /// - Failed sites are retryable individually. @@ -26,7 +34,6 @@ public class ArtifactDeploymentService private readonly IDeploymentManagerRepository _deploymentRepo; private readonly ITemplateEngineRepository _templateRepo; private readonly IExternalSystemRepository _externalSystemRepo; - private readonly INotificationRepository _notificationRepo; private readonly CommunicationService _communicationService; private readonly IAuditService _auditService; private readonly DeploymentManagerOptions _options; @@ -39,7 +46,12 @@ public class ArtifactDeploymentService /// Repository for deployment records. /// Repository for templates. /// Repository for external systems. - /// Repository for notifications. + /// + /// DeploymentManager-025: retained on the signature for DI/source compatibility but + /// intentionally NOT consumed. Notification lists and SMTP configuration are + /// central-only and are never shipped to sites, so the artifact path must not read + /// the notification repository at all. + /// /// Service for communicating with sites. /// Service for audit logging. /// Deployment manager options. @@ -59,7 +71,9 @@ public class ArtifactDeploymentService _deploymentRepo = deploymentRepo; _templateRepo = templateRepo; _externalSystemRepo = externalSystemRepo; - _notificationRepo = notificationRepo; + // DeploymentManager-025: notificationRepo is deliberately not stored — notification + // lists and SMTP configs are central-only and are never fetched for shipping to sites. + _ = notificationRepo; _communicationService = communicationService; _auditService = auditService; _options = options.Value; @@ -98,15 +112,19 @@ public class ArtifactDeploymentService /// /// Builds a per-site using a previously-fetched /// snapshot of the global artifact sets (shared scripts, external systems + methods, - /// DB connections, notification lists, SMTP configurations). Only the per-site - /// data-connection query runs here. + /// DB connections). Only the per-site data-connection query runs here. /// /// /// DeploymentManager-023: separating the global fetch from the per-site build lets /// issue the global queries exactly once across /// the whole multi-site sweep, eliminating the N+1 re-query of shared scripts, - /// external systems, methods, DB connections, notification lists, and SMTP - /// configurations. + /// external systems, methods, and DB connections. + /// + /// DeploymentManager-025: the command's NotificationLists and + /// SmtpConfigurations fields are always sent null — notification + /// delivery is central-only and no notification artifact or SMTP credential is + /// ever distributed to a site. The fields remain on the contract only for + /// additive compatibility. /// private async Task BuildDeployArtifactsCommandAsync( int siteId, @@ -125,30 +143,34 @@ public class ArtifactDeploymentService globals.SharedScripts, globals.ExternalSystems, globals.DatabaseConnections, - globals.NotificationLists, + // DeploymentManager-025: notification lists are central-only — never shipped to sites. + NotificationLists: null, dataConnectionArtifacts, - globals.SmtpConfigurations, + // DeploymentManager-025: SMTP config (incl. credentials) is central-only — never shipped to sites. + SmtpConfigurations: null, DateTimeOffset.UtcNow); } /// /// Fetches the system-wide artifact sets that are identical across every site — - /// shared scripts, external systems (with their methods serialized in), database - /// connections, notification lists, and SMTP configurations. Used by - /// to pre-load once before the per-site loop. + /// shared scripts, external systems (with their methods serialized in), and + /// database connections. Used by to pre-load + /// once before the per-site loop. /// /// /// DeploymentManager-023: the per-site artifact build path previously re-issued - /// every one of these queries per site (≈ 5·N + M·N round trips for N sites + /// every one of these queries per site (≈ N + M·N round trips for N sites /// and M external systems). Hoisting them here drops that to a single fetch. + /// + /// DeploymentManager-025: notification lists and SMTP configurations are NOT + /// fetched here. Notification delivery is central-only, so they are never + /// shipped to sites — the artifact path must not even read them. /// private async Task FetchGlobalArtifactsAsync(CancellationToken cancellationToken) { var sharedScripts = await _templateRepo.GetAllSharedScriptsAsync(cancellationToken); var externalSystems = await _externalSystemRepo.GetAllExternalSystemsAsync(cancellationToken); var dbConnections = await _externalSystemRepo.GetAllDatabaseConnectionsAsync(cancellationToken); - var notificationLists = await _notificationRepo.GetAllNotificationListsAsync(cancellationToken); - var smtpConfigurations = await _notificationRepo.GetAllSmtpConfigurationsAsync(cancellationToken); // Map shared scripts var scriptArtifacts = sharedScripts.Select(s => @@ -177,35 +199,23 @@ public class ArtifactDeploymentService var dbConnectionArtifacts = dbConnections.Select(d => new DatabaseConnectionArtifact(d.Name, d.ConnectionString, d.MaxRetries, d.RetryDelay)).ToList(); - // Map notification lists - var notificationListArtifacts = notificationLists.Select(nl => - new NotificationListArtifact(nl.Name, nl.Recipients.Where(r => r.EmailAddress is not null).Select(r => r.EmailAddress!).ToList())).ToList(); - - // Map SMTP configurations — use Host as the artifact name (matches SQLite PK on site) - var smtpArtifacts = smtpConfigurations.Select(smtp => - new SmtpConfigurationArtifact( - $"{smtp.Host}:{smtp.Port}", smtp.Host, smtp.Port, smtp.AuthType, smtp.FromAddress, - smtp.Credentials, null, smtp.TlsMode)).ToList(); - return new GlobalArtifactSnapshot( scriptArtifacts, externalSystemArtifacts, - dbConnectionArtifacts, - notificationListArtifacts, - smtpArtifacts); + dbConnectionArtifacts); } /// /// Bag of the global artifact sets that do not vary per site, captured once at /// the start of and reused for every per-site - /// command build (DeploymentManager-023). + /// command build (DeploymentManager-023). Notification lists and SMTP + /// configurations are deliberately absent — they are central-only and never + /// shipped to sites (DeploymentManager-025). /// private sealed record GlobalArtifactSnapshot( IReadOnlyList SharedScripts, IReadOnlyList ExternalSystems, - IReadOnlyList DatabaseConnections, - IReadOnlyList NotificationLists, - IReadOnlyList SmtpConfigurations); + IReadOnlyList DatabaseConnections); /// /// Deploys artifacts to all sites. Builds a per-site command with that site's data connections. @@ -226,9 +236,10 @@ public class ArtifactDeploymentService var perSiteResults = new Dictionary(); // DeploymentManager-023: hoist the system-wide artifact queries (shared scripts, - // external systems + methods, DB connections, notification lists, SMTP configs) - // OUT of the per-site loop so they run ONCE instead of once per site. Only - // data connections legitimately vary per site, so they stay inside the loop. + // external systems + methods, DB connections) OUT of the per-site loop so they + // run ONCE instead of once per site. Only data connections legitimately vary + // per site, so they stay inside the loop. (Notification lists and SMTP config + // are central-only and not fetched at all — DeploymentManager-025.) var globals = await FetchGlobalArtifactsAsync(cancellationToken); // Build per-site commands sequentially (DbContext is not thread-safe). diff --git a/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs b/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs index 058cd67c..06216e03 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs @@ -332,20 +332,30 @@ public class DatabaseGateway : IDatabaseGateway } catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { - // [2] The caller asked to abandon the work — propagate the cancellation + // [1] The caller asked to abandon the work — propagate the cancellation // unchanged; it must never be reclassified as a transient DB error. throw; } catch (SqlException ex) { - // Classify by SqlException.Number and rethrow as the strongly-typed - // transient / permanent failure the callers branch on. The context - // is the connection NAME, never the connection string. + // [2] ExternalSystemGateway-025: a caller-token cancellation can surface + // from the SQL driver as a SqlException (a mid-flight cancel), not an + // OperationCanceledException, so the [1] filter above never sees it. + // Re-check the caller's token at the TOP of this block so such a cancel + // propagates as OperationCanceledException regardless of the driver's + // exception shape — never reclassified as a permanent DB error (the + // "-008 cancel-not-reclassified" contract). Version-independent: no need + // to match a specific SqlException number. + cancellationToken.ThrowIfCancellationRequested(); + + // Otherwise classify by SqlException.Number and rethrow as the + // strongly-typed transient / permanent failure the callers branch on. + // The context is the connection NAME, never the connection string. throw SqlErrorClassifier.Throw(connectionName, ex); } catch (Exception ex) when (SqlErrorClassifier.IsTransient(ex)) { - // [1] A live outage that did not surface as a SqlException — treat as + // [3] A live outage that did not surface as a SqlException — treat as // transient so the caller buffers + retries. The message uses the // connection NAME, never the connection string (credential safety). throw new TransientDatabaseException( diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/ZB.MOM.WW.ScadaBridge.Host.csproj b/src/ZB.MOM.WW.ScadaBridge.Host/ZB.MOM.WW.ScadaBridge.Host.csproj index 36fd9aa4..324751a1 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/ZB.MOM.WW.ScadaBridge.Host.csproj +++ b/src/ZB.MOM.WW.ScadaBridge.Host/ZB.MOM.WW.ScadaBridge.Host.csproj @@ -53,6 +53,7 @@ + diff --git a/src/ZB.MOM.WW.ScadaBridge.KpiHistory/KpiHistoryRecorderActor.cs b/src/ZB.MOM.WW.ScadaBridge.KpiHistory/KpiHistoryRecorderActor.cs index d85dda0f..65af844f 100644 --- a/src/ZB.MOM.WW.ScadaBridge.KpiHistory/KpiHistoryRecorderActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.KpiHistory/KpiHistoryRecorderActor.cs @@ -68,6 +68,19 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers /// private CancellationTokenSource? _shutdownCts; + /// + /// In-flight guard for the sample loop. Set true at the start of a sample pass and cleared + /// when the pass's arrives. While true, further + /// s are skipped so passes never overlap — Akka periodic timers + /// enqueue (not coalesce) missed ticks, so without this guard a pass running longer than + /// (slow/recovering DB) would let each + /// subsequent tick spawn another concurrent pass, amplifying load on the struggling store + /// and double-writing samples for overlapping windows. Mirrors the + /// dispatch + /// in-flight guard the timer pattern is modelled on. + /// + private bool _sampleInFlight; + /// Akka timer scheduler, assigned by the actor system via . public ITimerScheduler Timers { get; set; } = null!; @@ -87,7 +100,7 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers _logger = logger ?? throw new ArgumentNullException(nameof(logger)); Receive(_ => HandleSampleTick()); - Receive(_ => { }); // best-effort: no actor state to reset on completion + Receive(_ => _sampleInFlight = false); // lower the in-flight guard (success or fault) Receive(_ => HandlePurgeTick()); Receive(_ => { }); // best-effort: no actor state to reset on completion } @@ -135,19 +148,31 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers } /// - /// Handles a sample tick: captures the shared capturedAtUtc instant on the actor - /// thread, then launches the asynchronous sampling pass off-thread and pipes a - /// completion back to so the mailbox is never blocked while sources - /// are collected and the batch is written. + /// Handles a sample tick. If a sample pass is already in flight the tick is skipped + /// (logged at debug) so passes never overlap; otherwise the in-flight guard is raised, + /// the shared capturedAtUtc instant is captured on the actor thread, and the + /// asynchronous sampling pass is launched off-thread with a + /// piped back to to lower the guard on the actor thread — so the + /// mailbox is never blocked while sources are collected and the batch is written. /// private void HandleSampleTick() { + if (_sampleInFlight) + { + // A prior pass is still awaiting its DB round-trip; coalesce this tick rather than + // piling a second concurrent pass onto a slow/recovering store. + _logger.LogDebug("KPI sample tick skipped — a sample pass is already in flight."); + return; + } + + _sampleInFlight = true; var capturedAt = DateTime.UtcNow; var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None; // RunSamplePass self-isolates its faults (it never throws), but the failure // projection is kept as a belt-and-braces guard so even a faulted task still - // produces a SampleComplete. + // produces a SampleComplete that lowers the in-flight guard — otherwise the loop + // would wedge permanently. RunSamplePass(capturedAt, cancellationToken).PipeTo( Self, success: () => SampleComplete.Instance, @@ -282,7 +307,10 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers private SampleTick() { } } - /// Piped-back completion of a sampling pass; lets the pass run off the actor thread. + /// + /// Piped-back completion of a sampling pass; lets the pass run off the actor thread and + /// lowers the _sampleInFlight guard on the actor thread (fires on success and fault). + /// internal sealed class SampleComplete { public static readonly SampleComplete Instance = new(); diff --git a/src/ZB.MOM.WW.ScadaBridge.ManagementService/ManagementActor.cs b/src/ZB.MOM.WW.ScadaBridge.ManagementService/ManagementActor.cs index 36b91662..25a88365 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ManagementService/ManagementActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ManagementService/ManagementActor.cs @@ -1887,8 +1887,26 @@ public class ManagementActor : ReceiveActor return await repo.GetAllMappingsAsync(); } + // Security-023 (membership half): an LDAP-group mapping's Role is a free string on the + // wire (CLI/API), so reject anything outside the canonical Roles.All set at the single + // server-side write path. A non-canonical role never functioned (no policy or authz + // check matches it), so rejecting it removes a silent-misconfiguration footgun rather + // than changing behaviour. Membership is checked case-insensitively to match the rest + // of the actor's role comparisons; the existing case-sensitivity asymmetry between the + // UI RequireClaim policies and the ManagementActor check is a separately-deferred change + // and is deliberately NOT altered here — the stored value's verbatim casing is preserved. + private static void ValidateMappingRole(string role) + { + if (!Roles.All.Contains(role, StringComparer.OrdinalIgnoreCase)) + { + throw new ManagementCommandException( + $"Role '{role}' is not a recognized role. Valid roles are: {string.Join(", ", Roles.All)}."); + } + } + private static async Task HandleCreateRoleMapping(IServiceProvider sp, CreateRoleMappingCommand cmd, string user) { + ValidateMappingRole(cmd.Role); var repo = sp.GetRequiredService(); var mapping = new LdapGroupMapping(cmd.LdapGroupName, cmd.Role); await repo.AddMappingAsync(mapping); @@ -1899,6 +1917,7 @@ public class ManagementActor : ReceiveActor private static async Task HandleUpdateRoleMapping(IServiceProvider sp, UpdateRoleMappingCommand cmd, string user) { + ValidateMappingRole(cmd.Role); var repo = sp.GetRequiredService(); var mapping = await repo.GetMappingByIdAsync(cmd.MappingId) ?? throw new ManagementCommandException($"RoleMapping with ID {cmd.MappingId} not found."); diff --git a/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustPolicy.cs b/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustPolicy.cs index 0d8a4901..492b40c8 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustPolicy.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustPolicy.cs @@ -103,6 +103,19 @@ public static class ScriptTrustPolicy /// scripting options and the semantic-analysis compilation built for trust /// validation, so the validator resolves symbols against exactly the same /// metadata the script is compiled against. + /// + /// + /// This is the minimal, runtime-fidelity set. It deliberately does + /// NOT reference the assemblies that host the forbidden APIs (e.g. + /// System.Diagnostics.Process.dll, System.Net.Sockets.dll) — a + /// forbidden type must remain an undefined symbol at compile time so + /// the gate independently rejects it. Do + /// not widen this set with forbidden-API anchor assemblies; that second layer + /// of defence (a forbidden type fails to bind at execution-time compilation) + /// depends on it staying minimal. The trust validator's symbol-resolution + /// needs are met separately by (which adds + /// on the minimal-fallback path). + /// /// public static readonly IReadOnlyList DefaultAssemblies = [ @@ -113,6 +126,42 @@ public static class ScriptTrustPolicy typeof(DynamicJsonElement).Assembly, ]; + /// + /// Anchor assemblies that host the forbidden-API types named in + /// . Used ONLY to enrich the trust + /// validator's semantic reference set () + /// when the TRUSTED_PLATFORM_ASSEMBLIES list is unavailable — a + /// single-file, AOT, or trimmed host. Without these, the minimal fallback set + /// cannot resolve a bare forbidden type that lives inside an + /// allowed namespace (the documented case being Process via + /// using System.Diagnostics;): the symbol resolves to nothing, Pass 1's + /// syntactic fallback ignores dotless identifiers, and Pass 2 never flags a + /// bare identifier — so the forbidden reference slips the validator entirely + /// (ScriptAnalysis-001). Anchoring these assemblies in the fallback keeps the + /// semantic pass authoritative even in the degraded mode. + /// + /// + /// These are never added to — see the + /// remark on . Most forbidden anchor types + /// (System.IO.File, System.Threading.Thread, + /// System.Reflection.Assembly, + /// System.Runtime.InteropServices.Marshal) already live in the same + /// assembly as typeof(object) (System.Private.CoreLib), so the + /// minimal set already resolves them; the ones that ship in their own + /// assemblies — System.Diagnostics.Process and + /// System.Net.Sockets.Socket — are listed here explicitly. + /// + /// + public static readonly IReadOnlyList ForbiddenAnchorAssemblies = + [ + typeof(System.Diagnostics.Process).Assembly, + typeof(System.IO.File).Assembly, + typeof(System.Threading.Thread).Assembly, + typeof(System.Reflection.Assembly).Assembly, + typeof(System.Net.Sockets.Socket).Assembly, + typeof(System.Runtime.InteropServices.Marshal).Assembly, + ]; + /// /// Metadata references for the trust-validation semantic compilation and /// the design-time script compilation. @@ -143,6 +192,20 @@ public static class ScriptTrustPolicy /// public static readonly IReadOnlyList AnalysisReferences = BuildAnalysisReferences(); + /// + /// True when was built WITHOUT the + /// trusted-platform-assemblies (TPA) list — i.e. the semantic pass is running + /// against the minimal fallback set (a single-file/AOT/trimmed host) rather + /// than the full framework. In this degraded mode the validator still + /// resolves the documented forbidden anchors (because + /// is folded into the fallback), but + /// types outside that anchor set may resolve as unknown and rely on the + /// syntactic pass / downstream compile gate. A warning is also emitted via + /// at type initialisation. Consumers + /// (and tests) can read this flag to detect / surface the weakened mode. + /// + public static bool AnalysisReferencesDegraded { get; private set; } + private static IReadOnlyList BuildAnalysisReferences() { var byPath = new Dictionary(StringComparer.OrdinalIgnoreCase); @@ -166,23 +229,77 @@ public static class ScriptTrustPolicy } } + // The TPA list was unavailable (single-file / AOT / trimmed host) — we + // are about to fall back to the minimal set, which weakens the semantic + // pass. Make the degradation LOUD (not silent): record the flag and emit + // a warning so operators/tests can detect the mode (ScriptAnalysis-001). + var tpaAvailable = byPath.Count > 0; + if (!tpaAvailable) + { + AnalysisReferencesDegraded = true; + System.Diagnostics.Trace.TraceWarning( + "ScriptTrustPolicy: TRUSTED_PLATFORM_ASSEMBLIES unavailable; the script-trust " + + "semantic pass is running against the MINIMAL fallback reference set (plus the " + + "forbidden-API anchor assemblies). Symbol resolution is reduced — forbidden anchors " + + "(Process, Socket, File, Thread, Assembly, Marshal) remain caught, but other types " + + "may resolve as unknown. This typically indicates a single-file/AOT/trimmed host."); + } + // Ensure app assemblies the script API surface needs are present even if // not in the TPA list (e.g. Commons / DynamicJsonElement). foreach (var asm in DefaultAssemblies) - { - var loc = asm.Location; - if (loc.Length == 0 || byPath.ContainsKey(loc) || !File.Exists(loc)) - continue; + TryAddAssembly(byPath, asm); - try { byPath[loc] = MetadataReference.CreateFromFile(loc); } - catch { /* ignore */ } + // On the minimal-fallback path, also fold in the assemblies that HOST the + // forbidden-API types so a bare forbidden type inside an allowed namespace + // (the documented `Process` via `using System.Diagnostics;` case) still + // resolves and is flagged authoritatively by the semantic pass. When the + // TPA list is present these are already covered, so this only matters in + // the degraded mode (ScriptAnalysis-001). NOTE: these anchors are added + // ONLY here, never to DefaultReferences — the compile gate must keep + // rejecting forbidden types as undefined symbols. + if (!tpaAvailable) + { + foreach (var asm in ForbiddenAnchorAssemblies) + TryAddAssembly(byPath, asm); } - // Fallback to the minimal set if the TPA list was unavailable (e.g. a - // single-file/AOT host) so validation still functions. + // Should never be empty (DefaultAssemblies always resolve), but guard + // against a pathological host and fall back to the minimal references. return byPath.Count > 0 ? byPath.Values.ToList() : DefaultReferences; } + private static void TryAddAssembly( + Dictionary byPath, Assembly asm) + { + var loc = asm.Location; + if (loc.Length == 0 || byPath.ContainsKey(loc) || !File.Exists(loc)) + return; + + try { byPath[loc] = MetadataReference.CreateFromFile(loc); } + catch { /* skip an unreadable assembly rather than fail validation */ } + } + + /// + /// Builds exactly the reference set the trust validator would use on the + /// minimal TPA-fallback path: plus the + /// that host the forbidden-API types. + /// This is what produces when + /// TRUSTED_PLATFORM_ASSEMBLIES is unavailable. Exposed so the degraded + /// mode can be exercised directly (e.g. by the adversarial tests proving the + /// SA-001 fallback hole is closed) without depending on the host actually + /// lacking a TPA list. + /// + public static IReadOnlyList BuildMinimalFallbackReferences() + { + var byPath = new Dictionary(StringComparer.OrdinalIgnoreCase); + foreach (var asm in DefaultAssemblies) + TryAddAssembly(byPath, asm); + foreach (var asm in ForbiddenAnchorAssemblies) + TryAddAssembly(byPath, asm); + return byPath.Values.ToList(); + } + /// /// Default namespace imports made available to compiled scripts. /// diff --git a/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs b/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs index bbe2c7c1..3ceb3d8b 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs @@ -55,17 +55,42 @@ public static class ScriptTrustValidator /// /// The C# script source to analyse. /// - /// Optional additional metadata references to add to - /// for semantic - /// resolution — e.g. the compile-surface globals assembly so a script - /// referencing the API surface resolves cleanly. Forbidden references are - /// NOT added here (a script can't reach a forbidden API just because the - /// assembly is referenced; the deny-list still applies). + /// Optional additional metadata references to widen symbol resolution + /// for the semantic pass — e.g. the compile-surface globals assembly so a + /// script referencing the API surface resolves cleanly. Extra references can + /// ONLY improve resolution; they can NEVER whitelist a forbidden API. The + /// verdict is by resolved namespace/type against + /// , so passing more + /// references — even the forbidden-API assemblies themselves — can only make + /// the verdict more accurate (more types resolve to their true namespace and + /// are judged), never produce a false allow. Callers therefore need not (and + /// cannot, for safety purposes) curate this set; the Central UI run gate + /// forwards its full compilation reference surface here precisely because + /// doing so is safe. /// /// A list of trust-model violation messages; empty if the script is clean. public static IReadOnlyList FindViolations( string code, IEnumerable? extraReferences = null) + => FindViolations(code, ScriptTrustPolicy.AnalysisReferences, extraReferences); + + /// + /// Overload that runs the trust analysis against an explicit base reference + /// set instead of . The + /// public entry point above always uses the full analysis set; this overload + /// exists so tests can pin behaviour against the minimal TPA-fallback set + /// () and prove + /// the degraded mode still catches the documented forbidden anchors + /// (ScriptAnalysis-001). The two passes are otherwise identical. + /// + /// The C# script source to analyse. + /// The base reference set Pass 1 resolves against. + /// Optional additional references that only widen resolution. + /// A list of trust-model violation messages; empty if the script is clean. + public static IReadOnlyList FindViolations( + string code, + IReadOnlyList baseReferences, + IEnumerable? extraReferences = null) { if (string.IsNullOrWhiteSpace(code)) return Array.Empty(); @@ -84,7 +109,7 @@ public static class ScriptTrustValidator // resolves and is judged by its true namespace — closing the // forbidden-type-in-allowed-namespace blind spot (e.g. a bare // System.Diagnostics.Process via `using System.Diagnostics;`). - var references = ScriptTrustPolicy.AnalysisReferences.ToList(); + var references = baseReferences.ToList(); if (extraReferences != null) references.AddRange(extraReferences); diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs index a8735131..24fab10e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs @@ -32,10 +32,17 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// the daily terminal-row purge scheduler (Piece B — /// , which invokes /// on a timer). Both -/// background timers are started in and gate on the -/// reconciliation collaborators ( + -/// ) being available — the repo-only test ctor -/// injects neither, so neither timer runs there. +/// background timers are started in , but on independent +/// preconditions (SiteCallAudit-007). The purge timer is armed whenever +/// background timers are enabled (it needs only the repository, which every +/// production / reconciliation ctor always has) — it is NOT gated on the +/// reconciliation collaborators, so a host that registers Site Call Audit +/// without the reconciliation client still purges and the central +/// SiteCalls table cannot grow unbounded. The reconciliation timer +/// additionally requires its collaborators ( + +/// ) and logs a Warning when it cannot arm. The +/// repo-only MSSQL test ctor disables both timers (background timers off) so the +/// read/upsert tests see no scheduled side effects. /// /// /// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" — @@ -68,6 +75,16 @@ public class SiteCallAuditActor : ReceiveActor /// Maximum page size honoured by a . private const int MaxPageSize = 200; + /// + /// SiteCallAudit-009: hard ceiling on the number of PullSiteCalls RPCs + /// issued for a single site within ONE reconciliation tick when the site keeps + /// reporting . Bounds the + /// within-tick continuation drain so a misbehaving site (or a pathological + /// single-timestamp saturation that pins the cursor) can never spin the + /// dispatcher unbounded; the remaining backlog drains on the next tick. + /// + private const int MaxReconciliationPagesPerTick = 50; + private readonly IServiceProvider? _serviceProvider; private readonly ISiteCallAuditRepository? _injectedRepository; private readonly SiteCallAuditOptions _options; @@ -81,12 +98,25 @@ public class SiteCallAuditActor : ReceiveActor /// singletons registered by AddAuditLogCentralReconciliationClient); /// in the test path they are injected directly. They are null when /// the actor was built via the repo-only test ctor — in that case the - /// reconciliation tick is NOT started (see ); - /// the purge tick gates on the same collaborators (see ). + /// reconciliation tick is NOT started (see ). + /// The purge tick, by contrast, does NOT depend on these collaborators + /// (SiteCallAudit-007): it needs only the repository, so it is armed by + /// alone (see ). /// private readonly IPullSiteCallsClient? _pullClient; private readonly ISiteEnumerator? _siteEnumerator; + /// + /// Master switch for the two background schedulers (reconciliation + purge), + /// set true by the production and reconciliation ctors and false + /// by the repo-only MSSQL test ctor. SiteCallAudit-007: the purge timer is + /// gated on THIS flag rather than on the reconciliation collaborators, so a + /// host that omits the reconciliation client still purges (no unbounded + /// central SiteCalls growth) while the MSSQL read/upsert tests stay + /// free of any scheduled side effects. + /// + private readonly bool _backgroundTimersEnabled; + /// /// Per-site reconciliation watermark — the highest /// seen for that site on a previous @@ -123,9 +153,11 @@ public class SiteCallAuditActor : ReceiveActor /// An optional lets a test pin the stuck/KPI /// windows; when omitted the production defaults apply. /// - /// This ctor injects NO reconciliation client/enumerator, so the - /// reconciliation tick is gated off (see ) - /// — the MSSQL-backed read/upsert tests must not fire phantom pulls. + /// This ctor disables BOTH background timers (sets + /// to false) and injects no + /// reconciliation client/enumerator, so neither the reconciliation tick nor + /// the purge tick fires — the MSSQL-backed read/upsert tests must see no + /// scheduled side effects (no phantom pulls, no background purge). /// /// /// Concrete repository instance to use for all messages. @@ -143,6 +175,10 @@ public class SiteCallAuditActor : ReceiveActor _logger = logger; _options = options ?? new SiteCallAuditOptions(); + // Repo-only MSSQL test ctor: keep BOTH background timers off so the + // read/upsert tests see no scheduled side effects (SiteCallAudit-007). + _backgroundTimersEnabled = false; + RegisterHandlers(); } @@ -150,10 +186,10 @@ public class SiteCallAuditActor : ReceiveActor /// Test-mode constructor for the reconciliation tick (Piece A) — injects a /// concrete repository PLUS the two reconciliation collaborators directly, /// so the per-site self-heal pull is unit-testable in-memory without a DI - /// container or a live gRPC channel. Because the client + enumerator are - /// present, the reconciliation tick IS started; the purge tick is also - /// started (both gate on the collaborators being available — see - /// / ). + /// container or a live gRPC channel. Background timers are enabled, so the + /// purge tick starts (it needs only the repository) and the reconciliation + /// tick starts too because the client + enumerator are present — see + /// / . /// /// Concrete repository instance used for upserts and purges. /// Enumerates the sites to reconcile each tick. @@ -186,6 +222,11 @@ public class SiteCallAuditActor : ReceiveActor _logger = logger; _options = options ?? new SiteCallAuditOptions(); + // Reconciliation test ctor: collaborators present, so both timers arm + // (the reconciliation tick uses the collaborators; the purge tick needs + // only the repository). + _backgroundTimersEnabled = true; + RegisterHandlers(); } @@ -223,6 +264,12 @@ public class SiteCallAuditActor : ReceiveActor _pullClient = serviceProvider.GetService(); _siteEnumerator = serviceProvider.GetService(); + // Production path: background timers run. The purge tick is armed + // unconditionally here (it needs only the repository); the reconciliation + // tick additionally requires its collaborators and logs a Warning if + // they were not registered (SiteCallAudit-007). + _backgroundTimersEnabled = true; + RegisterHandlers(); } @@ -275,16 +322,30 @@ public class SiteCallAuditActor : ReceiveActor } /// - /// Starts the periodic reconciliation tick — but ONLY when both the pull - /// client and the site enumerator are available. The repo-only test ctor - /// injects neither, so the tick is gated off there (the MSSQL read/upsert - /// tests must not fire phantom pulls); the reconciliation test ctor and the - /// production ctor (which resolves both from the SP) start it. + /// Starts the periodic reconciliation tick — but ONLY when background timers + /// are enabled AND both the pull client and the site enumerator are + /// available. The repo-only test ctor disables background timers, so the tick + /// is gated off there (the MSSQL read/upsert tests must not fire phantom + /// pulls); the reconciliation test ctor and the production ctor (which + /// resolves both from the SP) start it. SiteCallAudit-007: when background + /// timers are enabled but the collaborators were not registered, log a + /// Warning so a misconfigured host surfaces the missing self-heal rather than + /// silently skipping it. /// private void StartReconciliationTimer() { + if (!_backgroundTimersEnabled) + { + return; + } + if (_pullClient is null || _siteEnumerator is null) { + _logger.LogWarning( + "SiteCallAudit reconciliation timer not started — the reconciliation " + + "collaborators (IPullSiteCallsClient / ISiteEnumerator) were not registered; " + + "lost cached-call telemetry will not self-heal until they are wired up. " + + "The daily terminal-row purge still runs."); return; } @@ -298,15 +359,20 @@ public class SiteCallAuditActor : ReceiveActor } /// - /// Starts the daily purge tick — gated on the same collaborator presence as - /// the reconciliation tick. The purge itself only needs the repository, but - /// gating both schedulers together keeps the repo-only test ctor (no - /// client/enumerator) free of BOTH background timers, so the MSSQL read/ - /// upsert tests see no scheduled side effects. + /// Starts the daily purge tick. SiteCallAudit-007: the purge needs ONLY the + /// repository — never the reconciliation collaborators — so it is gated on + /// alone, NOT on + /// / . This decouples + /// the daily terminal-row purge from the reconciliation client: a host that + /// registers Site Call Audit without the reconciliation client still purges, + /// so the central SiteCalls table can never grow unbounded just + /// because the self-heal puller is absent. Only the repo-only MSSQL test ctor + /// (background timers off) skips it, keeping the read/upsert tests free of + /// scheduled side effects. /// private void StartPurgeTimer() { - if (_pullClient is null || _siteEnumerator is null) + if (!_backgroundTimersEnabled) { return; } @@ -501,37 +567,103 @@ public class SiteCallAuditActor : ReceiveActor /// deduplicated by the idempotent monotonic upsert — the same inclusive-boundary /// contract as SiteAuditReconciliationActor's cursor. /// + /// + /// SiteCallAudit-009: consumes + /// to guarantee forward progress. Whereas the prior implementation ignored + /// the flag entirely and relied solely on the tick cadence, this method now + /// continues pulling within the same tick while the site reports + /// MoreAvailable=true, bounded by + /// . This closes the + /// single-timestamp-saturation edge: if a backlog larger than + /// all shares one + /// exact , the inclusive max-timestamp cursor + /// cannot advance, so the previous code re-pulled the identical window forever + /// across ticks and never drained the tail. Here, a saturated batch whose + /// observed max timestamp did NOT advance past since is detected as a + /// no-progress pin: the loop stops and logs a Warning (the same observability + /// intent as the sibling's stalled signal, without its EventStream state + /// machine), so the pathological site surfaces rather than spinning silently. + /// This diverges from SiteAuditReconciliationActor, which reads + /// MoreAvailable to drive a SiteAuditTelemetryStalledChanged + /// stalled-detection state machine instead of a within-tick continuation drain. + /// /// private async Task ReconcileSiteAsync( SiteEntry site, IPullSiteCallsClient client, ISiteCallAuditRepository repository) { - var since = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; - var response = await client - .PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None) - .ConfigureAwait(false); + var cursor = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; - var maxUpdated = since; - var nowUtc = DateTime.UtcNow; - foreach (var row in response.SiteCalls) + // SiteCallAudit-009: drain within the tick while the site keeps reporting + // MoreAvailable, bounded by MaxReconciliationPagesPerTick so a misbehaving + // site can never spin the dispatcher. Each page advances the in-flight + // cursor; a saturated page that fails to advance the cursor is the + // single-timestamp no-progress pin — break and surface it. + for (var page = 0; page < MaxReconciliationPagesPerTick; page++) { - // IngestedAtUtc is the "central ingested (or last refreshed) this - // row" stamp — owned by the central actor, exactly as OnUpsertAsync - // does for the telemetry path. Monotonic UpsertAsync makes a row - // already present (from a prior push) a silent no-op. - var siteCall = row with { IngestedAtUtc = nowUtc }; - await repository.UpsertAsync(siteCall).ConfigureAwait(false); + var since = cursor; + var response = await client + .PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None) + .ConfigureAwait(false); - if (row.UpdatedAtUtc > maxUpdated) + var maxUpdated = since; + var nowUtc = DateTime.UtcNow; + foreach (var row in response.SiteCalls) { - maxUpdated = row.UpdatedAtUtc; + // IngestedAtUtc is the "central ingested (or last refreshed) this + // row" stamp — owned by the central actor, exactly as OnUpsertAsync + // does for the telemetry path. Monotonic UpsertAsync makes a row + // already present (from a prior push) a silent no-op. + var siteCall = row with { IngestedAtUtc = nowUtc }; + await repository.UpsertAsync(siteCall).ConfigureAwait(false); + + if (row.UpdatedAtUtc > maxUpdated) + { + maxUpdated = row.UpdatedAtUtc; + } + } + + // Persist the advanced cursor after every page so a fault on a later + // page (caught per-site upstream) still keeps the rows already drained. + cursor = maxUpdated; + _reconciliationCursors[site.SiteId] = cursor; + + if (!response.MoreAvailable) + { + // Backlog fully drained for this site this tick. + return; + } + + if (maxUpdated <= since) + { + // No-progress pin: the site saturated the batch yet the max + // observed UpdatedAtUtc did not advance past the inclusive cursor + // (a burst of > batch-size rows sharing one exact timestamp). + // Continuing would re-pull the identical window forever, so stop + // and surface it — the inclusive max-timestamp cursor cannot make + // progress on this input without a composite (timestamp,id) + // keyset, which the pull contract does not yet support. + _logger.LogWarning( + "SiteCallAudit reconciliation for site {SiteId} cannot make progress: a saturated " + + "batch of more than {BatchSize} rows shares a single UpdatedAtUtc ({CursorUtc:o}), " + + "so the inclusive cursor is pinned. The backlog tail beyond the batch ceiling will " + + "not reconcile until those rows' timestamps differ.", + site.SiteId, + _options.ReconciliationBatchSize, + since); + return; } } - // Advance the cursor to the newest row seen. A MoreAvailable response - // means the site saturated the batch; the next tick continues draining - // from the advanced cursor (no immediate re-pull loop — the natural - // tick cadence drains the backlog, matching SiteAuditReconciliationActor). - _reconciliationCursors[site.SiteId] = maxUpdated; + // Hit the within-tick page ceiling while MoreAvailable was still true: + // the cursor advanced each page (so the backlog IS draining), there is + // simply more than MaxReconciliationPagesPerTick × batch-size of it. The + // next tick resumes from the advanced cursor. + _logger.LogInformation( + "SiteCallAudit reconciliation for site {SiteId} hit the per-tick page ceiling " + + "({MaxPages} pages); the cursor advanced each page and the remaining backlog " + + "drains on the next tick.", + site.SiteId, + MaxReconciliationPagesPerTick); } // ── Piece B: daily terminal-row purge scheduler ── diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/EventLogQueryService.cs b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/EventLogQueryService.cs index 22c6c01c..2a80f243 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/EventLogQueryService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/EventLogQueryService.cs @@ -73,14 +73,23 @@ public class EventLogQueryService : IEventLogQueryService if (request.From.HasValue) { + // SiteEventLogging-024 (re-opens -016): timestamps are stored as ISO + // 8601 "o" UTC strings (always +00:00). The store compares them + // lexicographically (BINARY collation), so the bound MUST be + // normalised to UTC before ToString("o") — otherwise a non-UTC + // DateTimeOffset from a central client (e.g. +05:00) produces a + // string that sorts wrongly against the +00:00 stored values and the + // range filter silently includes/excludes the wrong rows. whereClauses.Add("timestamp >= $from"); - parameters.Add(new SqliteParameter("$from", request.From.Value.ToString("o"))); + parameters.Add(new SqliteParameter("$from", request.From.Value.ToUniversalTime().ToString("o"))); } if (request.To.HasValue) { + // SiteEventLogging-024: normalise the upper bound to UTC for the same + // reason as $from above. whereClauses.Add("timestamp <= $to"); - parameters.Add(new SqliteParameter("$to", request.To.Value.ToString("o"))); + parameters.Add(new SqliteParameter("$to", request.To.Value.ToUniversalTime().ToString("o"))); } if (!string.IsNullOrWhiteSpace(request.EventType)) diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs index 1c5c448d..f44f1f33 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -528,7 +528,28 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers { var instanceName = command.InstanceUniqueName; - if (_instanceActors.TryGetValue(instanceName, out var actor)) + // SiteRuntime-029: a disable arriving mid-redeploy must cancel the buffered + // redeploy. Otherwise HandleTerminated re-creates the Instance Actor and + // re-stores its config with isEnabled: true when the predecessor terminates, + // silently reverting the operator's disable back to enabled. Mirror the + // last-write-wins handling in HandleDeploy/HandleDelete: drop the pending + // command (so HandleTerminated returns early), clear the shadow, and tell the + // displaced deployer it was superseded. The disable itself still persists + // is_enabled = false below, which becomes the durable state. + if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef)) + { + if (_pendingRedeploys.Remove(terminatingRef, out var pending)) + { + pending.OriginalSender.Tell(new DeploymentStatusResponse( + pending.Command.DeploymentId, + instanceName, + DeploymentStatus.Failed, + $"superseded by disable of {instanceName} before redeploy finished terminating", + DateTimeOffset.UtcNow)); + } + _terminatingActorsByName.Remove(instanceName); + } + else if (_instanceActors.TryGetValue(instanceName, out var actor)) { Context.Stop(actor); _instanceActors.Remove(instanceName); @@ -628,12 +649,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers { var instanceName = command.InstanceUniqueName; - if (_instanceActors.TryGetValue(instanceName, out var actor)) + // SiteRuntime-029: a delete arriving while a redeploy is still terminating must + // be authoritative over the mid-redeploy bookkeeping. HandleDeploy already + // removed the instance from _instanceActors and buffered a PendingRedeploy + // keyed by the terminating ref. If we fall straight through to the + // _instanceActors miss + unconditional decrement, the buffered redeploy is + // left intact — so when Terminated fires, HandleTerminated calls + // ApplyDeployment(isRedeploy: true) and RESURRECTS the just-deleted instance, + // with the counter now inconsistent. Cancel the pending redeploy first. + var wasPresent = false; + if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef)) + { + // Drop the buffered command so HandleTerminated's _pendingRedeploys.Remove + // misses and it returns early (no resurrection). Clear the shadow too. + if (_pendingRedeploys.Remove(terminatingRef, out var pending)) + { + pending.OriginalSender.Tell(new DeploymentStatusResponse( + pending.Command.DeploymentId, + instanceName, + DeploymentStatus.Failed, + $"superseded by delete of {instanceName} before redeploy finished terminating", + DateTimeOffset.UtcNow)); + } + _terminatingActorsByName.Remove(instanceName); + // The terminating predecessor is already being stopped by HandleDeploy; + // no Context.Stop needed here. + wasPresent = true; + } + else if (_instanceActors.TryGetValue(instanceName, out var actor)) { Context.Stop(actor); _instanceActors.Remove(instanceName); + wasPresent = true; } - _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1); + + // SiteRuntime-029: only decrement when the instance was actually present + // (live in _instanceActors OR mid-redeploy in _terminatingActorsByName). + // A delete for a wholly-unknown instance must not drive the count negative. + if (wasPresent) + _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1); UpdateInstanceCounts(); var sender = Sender; @@ -1379,14 +1433,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } } - // WP-33: Store notification lists - if (command.NotificationLists != null) - { - foreach (var nl in command.NotificationLists) - { - await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails); - } - } + // DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP + // configuration are central-only — sites store-and-forward notifications + // to central and never deliver over SMTP. Central no longer ships these + // (the DeployArtifactsCommand fields stay for additive compatibility but + // are always null), so the site neither persists them nor reads them. + // Purge any rows a prior (pre-fix) build may have written — including the + // plaintext SMTP password — so existing exposure is cleared, not just + // future writes. Purge is idempotent and runs on every artifact apply. + await _storage.PurgeCentralOnlyNotificationConfigAsync(); // Store data connection definitions (OPC UA endpoints, etc.) if (command.DataConnections != null) @@ -1413,16 +1468,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections)); } - // Store SMTP configurations - if (command.SmtpConfigurations != null) - { - foreach (var smtp in command.SmtpConfigurations) - { - await _storage.StoreSmtpConfigurationAsync( - smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode, - smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig); - } - } + // DeploymentManager-025 / SiteRuntime-031: SMTP configuration is + // central-only and is never stored on a site (see the purge above). // Replicate artifacts to standby node _replicationActor?.Tell(new ReplicateArtifacts(command)); diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs index db05db94..06434741 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs @@ -12,6 +12,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; using ZB.MOM.WW.ScadaBridge.SiteEventLogging; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming; @@ -209,6 +210,12 @@ public class InstanceActor : ReceiveActor // WP-16: Handle alarm state changes from Alarm Actors (Tell pattern) Receive(HandleAlarmStateChanged); + // SiteRuntime-027: a NativeAlarmActor tells us when one of its native + // conditions has left the mirror for good (snapshot-swap removal, retention + // drop, or cap eviction) so we can evict the stale _latestAlarmEvents key + // and not leak per-instance memory / bloat DebugView snapshots. + Receive(HandleNativeAlarmDropped); + // WP-25: Debug view subscribe/unsubscribe (Ask pattern for snapshot) Receive(HandleSubscribeDebugView); Receive(HandleUnsubscribeDebugView); @@ -1016,6 +1023,25 @@ public class InstanceActor : ReceiveActor _streamManager?.PublishAlarmStateChanged(changed); } + /// + /// SiteRuntime-027: evicts a native condition's key from the alarm-state maps once + /// the owning has dropped it from its mirror (after + /// emitting the condition's final return-to-normal). Without this the + /// _latestAlarmEvents map grows without bound on a source that mints a fresh + /// SourceReference per occurrence (one permanently-retained Normal entry per + /// distinct condition the instance has ever seen), leaking per-instance memory and + /// bloating every DebugView snapshot. + /// + /// Native-only by construction: the key is a native condition's SourceReference. + /// Computed-alarm keys (configuration-bounded) are never sent here and never removed. + /// + private void HandleNativeAlarmDropped(NativeAlarmDropped dropped) + { + _latestAlarmEvents.Remove(dropped.SourceReference); + _alarmStates.Remove(dropped.SourceReference); + _alarmTimestamps.Remove(dropped.SourceReference); + } + /// /// WP-25: Debug view subscribe — returns snapshot and begins streaming. /// diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs index 0a90be04..a9f37e5e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs @@ -8,6 +8,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteEventLogging; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; @@ -204,6 +205,10 @@ public class NativeAlarmActor : ReceiveActor { Emit(prior, prior.Condition with { Active = false }); PersistDelete(sourceRef); + // SiteRuntime-027: this condition is gone for good — tell the parent + // to evict its _latestAlarmEvents key so it does not retain a stale + // (Normal) entry forever. + NotifyParentDropped(sourceRef); } } @@ -244,6 +249,9 @@ public class NativeAlarmActor : ReceiveActor { _alarms.Remove(t.SourceReference); PersistDelete(t.SourceReference); + // SiteRuntime-027: evict the parent's _latestAlarmEvents key for the + // now-resolved condition so it does not leak. + NotifyParentDropped(t.SourceReference); } EnforceCap(); @@ -289,19 +297,48 @@ public class NativeAlarmActor : ReceiveActor var overflow = _alarms.Values .OrderBy(a => a.TransitionTime) .Take(_alarms.Count - cap) - .Select(a => a.SourceReference) .ToList(); - foreach (var sourceRef in overflow) + foreach (var evicted in overflow) { + var sourceRef = evicted.SourceReference; + + // SiteRuntime-028: the sibling drop paths (ApplySnapshotSwap, the + // ApplyLiveTransition retention drop) always emit a return-to-normal + // before the condition leaves the mirror. EnforceCap previously dropped + // a condition whose last-emitted state could still be Active, with no + // compensating emit — so the Instance Actor (and central's stream / the + // operator Alarm Summary) kept showing it Active forever, a phantom + // stuck alarm the mirror could never clear. Emit the return-to-normal + // for any still-active evicted condition (mirroring ApplySnapshotSwap) + // before removing it. + if (evicted.Condition.Active) + { + Emit(evicted, evicted.Condition with { Active = false }); + } + _alarms.Remove(sourceRef); PersistDelete(sourceRef); + // SiteRuntime-027: this condition is gone for good — evict the parent's + // _latestAlarmEvents key so it does not retain a stale entry. + NotifyParentDropped(sourceRef); _logger.LogWarning( "Native alarm cap {Cap} exceeded for {Source} on {Instance}; dropped oldest mirrored alarm {Ref}", cap, _source.CanonicalName, _instanceName, sourceRef); } } + /// + /// SiteRuntime-027: signals the parent Instance Actor that a native condition has + /// left the mirror for good so it can evict the matching _latestAlarmEvents + /// key. Always sent AFTER the condition's final return-to-normal + /// emit, so the stream still sees the clear. + /// + private void NotifyParentDropped(string sourceReference) + { + _instanceActor.Tell(new NativeAlarmDropped(sourceReference)); + } + /// /// Builds and tells the parent an enriched for a condition. /// diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/SiteReplicationActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/SiteReplicationActor.cs index 84423503..2b14c97f 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/SiteReplicationActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/SiteReplicationActor.cs @@ -189,18 +189,16 @@ public class SiteReplicationActor : ReceiveActor foreach (var db in command.DatabaseConnections) await _storage.StoreDatabaseConnectionAsync(db.Name, db.ConnectionString, db.MaxRetries, db.RetryDelay); - if (command.NotificationLists != null) - foreach (var nl in command.NotificationLists) - await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails); + // DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP + // configuration are central-only and are never persisted on a site. + // Mirror the primary apply path: purge any pre-fix rows (including the + // plaintext SMTP password) instead of writing the command's + // (now-always-null) NotificationLists/SmtpConfigurations. + await _storage.PurgeCentralOnlyNotificationConfigAsync(); if (command.DataConnections != null) foreach (var dc in command.DataConnections) await _storage.StoreDataConnectionDefinitionAsync(dc.Name, dc.Protocol, dc.PrimaryConfigurationJson, dc.BackupConfigurationJson, dc.FailoverRetryCount); - - if (command.SmtpConfigurations != null) - foreach (var smtp in command.SmtpConfigurations) - await _storage.StoreSmtpConfigurationAsync(smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode, - smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig); } catch (Exception ex) { diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Messages/NativeAlarmDropped.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Messages/NativeAlarmDropped.cs new file mode 100644 index 00000000..cdd0de58 --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Messages/NativeAlarmDropped.cs @@ -0,0 +1,23 @@ +namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages; + +/// +/// SiteRuntime-027: terminal-drop signal sent from a NativeAlarmActor to its +/// parent InstanceActor when a native condition leaves the mirror for good — +/// the snapshot-swap removal, the live-transition retention drop +/// (inactive && acknowledged), and the cap eviction. The parent removes the +/// condition's key () from its _latestAlarmEvents +/// map so the per-instance map and every DebugView snapshot do not accumulate one +/// permanently-retained (Normal) entry per distinct native condition the instance has +/// ever seen. +/// +/// The actor still emits the condition's return-to-normal AlarmStateChanged +/// (so central/UI see it clear) immediately BEFORE this drop signal; only the +/// stale-key retention in _latestAlarmEvents is what this evicts. Computed-alarm +/// keys are configuration-bounded and are never dropped this way. +/// +/// +/// The native condition's source reference — the same value used as the +/// AlarmStateChanged.AlarmName key for native alarms, so the parent can remove +/// the matching _latestAlarmEvents entry. +/// +public sealed record NativeAlarmDropped(string SourceReference); diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Persistence/SiteStorageService.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Persistence/SiteStorageService.cs index 710c9ecb..fd25814a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Persistence/SiteStorageService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Persistence/SiteStorageService.cs @@ -623,6 +623,30 @@ public class SiteStorageService // ── WP-33: Notification List CRUD ── + /// + /// DeploymentManager-025 / SiteRuntime-031: notification delivery is central-only. + /// Sites store-and-forward notifications to the central cluster and never deliver + /// over SMTP, so notification lists and SMTP configuration must never live on a + /// site. This purges every row from the site-local notification_lists and + /// smtp_configurations tables, clearing any rows a prior (now-corrected) + /// build may have shipped — most importantly the plaintext SMTP password. It is + /// idempotent and is invoked on every artifact apply / deploy so existing exposure + /// is cleared, not just future writes. The tables themselves are retained (the + /// schema is harmless once empty); only their contents are removed. + /// + /// A task that completes when both tables have been emptied. + public async Task PurgeCentralOnlyNotificationConfigAsync() + { + await using var connection = new SqliteConnection(_connectionString); + await connection.OpenAsync(); + + await using var command = connection.CreateCommand(); + command.CommandText = @" + DELETE FROM notification_lists; + DELETE FROM smtp_configurations;"; + await command.ExecuteNonQueryAsync(); + } + /// /// Stores or updates a notification list. /// diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs index b1c3b9e2..6908cb1d 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs @@ -163,6 +163,16 @@ public class StoreAndForwardService /// private int _queueDepthProviderRegistered; + /// + /// StoreAndForward-025: the exact provider delegate this instance registered with + /// the process-global gauge slot, retained so + /// can deregister it by identity (compare-and-clear). Holding + /// the same reference both registers and clears the slot; the identity check ensures a + /// late stop of this instance cannot clear a newer instance's provider. Null until + /// registers. + /// + private Func? _queueDepthProvider; + /// /// WP-10: Delivery handler delegate. The return value / exception is interpreted /// the same way on both the immediate-delivery path () @@ -347,7 +357,12 @@ public class StoreAndForwardService if (Interlocked.CompareExchange(ref _queueDepthProviderRegistered, 1, 0) == 0) { Interlocked.Add(ref _bufferedCount, pending); - ScadaBridgeTelemetry.SetQueueDepthProvider(() => Interlocked.Read(ref _bufferedCount)); + // StoreAndForward-025: retain the exact delegate so StopAsync can + // deregister it by identity (compare-and-clear) without stomping a + // newer instance that may have re-registered into the global slot. + var provider = (Func)(() => Interlocked.Read(ref _bufferedCount)); + _queueDepthProvider = provider; + ScadaBridgeTelemetry.SetQueueDepthProvider(provider); } _retryTimer = new Timer( @@ -380,6 +395,15 @@ public class StoreAndForwardService /// DI container ran its own shutdown. We now await the captured sweep task /// (with a bounded so a hung /// dependency cannot block host shutdown indefinitely) before returning. + /// + /// StoreAndForward-025: after the timer is disposed and the in-flight sweep has + /// drained, the queue-depth provider this instance registered with the process-global + /// gauge is deregistered by identity (compare-and- + /// clear) — otherwise a stopped service would report a frozen depth forever and the + /// provider closure would pin this dead instance for the process lifetime. The clear is + /// identity-checked so a newer instance that already re-registered the global slot is + /// not stomped, and is reset so a later + /// on this instance re-registers cleanly. /// /// A task representing the asynchronous stop operation. public async Task StopAsync() @@ -393,35 +417,46 @@ public class StoreAndForwardService } var inflight = Volatile.Read(ref _sweepTask); - if (inflight is null || inflight.IsCompleted) + if (inflight is not null && !inflight.IsCompleted) { - return; + try + { + // WaitAsync with a finite timeout: a hung delivery handler / + // storage call cannot block host shutdown indefinitely. On timeout + // the sweep keeps running but the host is free to proceed with + // disposal — preferred to never returning. + await inflight.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false); + } + catch (TimeoutException) + { + _logger.LogWarning( + "Store-and-forward retry sweep did not finish within {Timeout}; " + + "shutdown is proceeding while the sweep is still in-flight", + SweepShutdownWaitTimeout); + } + catch (Exception ex) + { + // The sweep itself already logs at Error on failure (see + // RetryPendingMessagesAsync's catch); we only log here so a + // surprise fault during shutdown is still visible. Swallow so the + // host's shutdown sequence can continue regardless. + _logger.LogWarning(ex, + "Store-and-forward retry sweep faulted during shutdown wait"); + } } - try + // StoreAndForward-025: release the process-global queue-depth gauge provider so a + // stopped service stops reporting a frozen depth and the closure no longer pins + // this dead instance. Identity-checked (compare-and-clear) so a successor + // instance's provider is left intact; reset the one-time guard so a later + // StartAsync re-registers. + var provider = _queueDepthProvider; + if (provider is not null) { - // WaitAsync with a finite timeout: a hung delivery handler / - // storage call cannot block host shutdown indefinitely. On timeout - // the sweep keeps running but the host is free to proceed with - // disposal — preferred to never returning. - await inflight.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false); - } - catch (TimeoutException) - { - _logger.LogWarning( - "Store-and-forward retry sweep did not finish within {Timeout}; " + - "shutdown is proceeding while the sweep is still in-flight", - SweepShutdownWaitTimeout); - } - catch (Exception ex) - { - // The sweep itself already logs at Error on failure (see - // RetryPendingMessagesAsync's catch); we only log here so a - // surprise fault during shutdown is still visible. Swallow so the - // host's shutdown sequence can continue regardless. - _logger.LogWarning(ex, - "Store-and-forward retry sweep faulted during shutdown wait"); + ScadaBridgeTelemetry.ClearQueueDepthProvider(provider); + _queueDepthProvider = null; } + Interlocked.Exchange(ref _queueDepthProviderRegistered, 0); } /// diff --git a/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Services/TemplateFolderService.cs b/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Services/TemplateFolderService.cs index e989e178..783f565d 100644 --- a/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Services/TemplateFolderService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Services/TemplateFolderService.cs @@ -56,6 +56,10 @@ public class TemplateFolderService await _repository.AddFolderAsync(folder, cancellationToken); await _repository.SaveChangesAsync(cancellationToken); await _auditService.LogAsync(user, "Create", "TemplateFolder", folder.Id.ToString(), name, folder, cancellationToken); + // The audit entry is staged on the change tracker by LogAsync and needs its + // own SaveChangesAsync to persist (mirrors TemplateService) — otherwise the + // row is discarded when the ManagementActor's DI scope is disposed. + await _repository.SaveChangesAsync(cancellationToken); return Result.Success(folder); } @@ -89,6 +93,8 @@ public class TemplateFolderService await _repository.UpdateFolderAsync(folder, cancellationToken); await _repository.SaveChangesAsync(cancellationToken); await _auditService.LogAsync(user, "Update", "TemplateFolder", folder.Id.ToString(), newName, folder, cancellationToken); + // Persist the staged audit entry (see CreateFolderAsync). + await _repository.SaveChangesAsync(cancellationToken); return Result.Success(folder); } @@ -152,6 +158,8 @@ public class TemplateFolderService await _repository.UpdateFolderAsync(folder, cancellationToken); await _repository.SaveChangesAsync(cancellationToken); await _auditService.LogAsync(user, "Move", "TemplateFolder", folder.Id.ToString(), folder.Name, folder, cancellationToken); + // Persist the staged audit entry (see CreateFolderAsync). + await _repository.SaveChangesAsync(cancellationToken); return Result.Success(folder); } @@ -202,6 +210,8 @@ public class TemplateFolderService await _repository.UpdateFolderAsync(adjacent, cancellationToken); await _repository.SaveChangesAsync(cancellationToken); await _auditService.LogAsync(user, "Reorder", "TemplateFolder", folder.Id.ToString(), folder.Name, folder, cancellationToken); + // Persist the staged audit entry (see CreateFolderAsync). + await _repository.SaveChangesAsync(cancellationToken); return Result.Success(folder); } @@ -242,6 +252,8 @@ public class TemplateFolderService await _repository.DeleteFolderAsync(folderId, cancellationToken); await _repository.SaveChangesAsync(cancellationToken); await _auditService.LogAsync(user, "Delete", "TemplateFolder", folderId.ToString(), folder.Name, null, cancellationToken); + // Persist the staged audit entry (see CreateFolderAsync). + await _repository.SaveChangesAsync(cancellationToken); return Result.Success(true); } diff --git a/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Validation/SemanticValidator.cs b/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Validation/SemanticValidator.cs index 35338588..bf542620 100644 --- a/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Validation/SemanticValidator.cs +++ b/src/ZB.MOM.WW.ScadaBridge.TemplateEngine/Validation/SemanticValidator.cs @@ -565,43 +565,6 @@ public class SemanticValidator return result; } - /// - /// Parses a parameter definitions JSON string (JSON Schema or legacy flat array) and returns the declared parameter names. - /// - /// JSON Schema or legacy flat-array string; null/empty returns an empty list. - /// The list of parameter names declared in the definition. - internal static List ParseParameterDefinitions(string? parameterDefinitionsJson) - { - if (string.IsNullOrWhiteSpace(parameterDefinitionsJson)) - return []; - - try - { - using var doc = JsonDocument.Parse(parameterDefinitionsJson); - // JSON Schema: { type:"object", properties:{ name:{...}, ... }, required:[...] } - if (doc.RootElement.ValueKind == JsonValueKind.Object) - { - if (doc.RootElement.TryGetProperty("properties", out var props) - && props.ValueKind == JsonValueKind.Object) - { - return props.EnumerateObject().Select(p => p.Name).ToList(); - } - } - // Legacy flat form: [{ name, type, required? }] - else if (doc.RootElement.ValueKind == JsonValueKind.Array) - { - return doc.RootElement.EnumerateArray() - .Select(e => e.TryGetProperty("type", out var t) ? t.GetString() ?? "unknown" : "unknown") - .ToList(); - } - } - catch (JsonException) - { - } - - return []; - } - /// /// Extracts call targets from script code by simple pattern matching. /// Looks for CallScript("name", ...) and CallShared("name", ...) patterns. diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Configuration/AuditLogOptionsBindingTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Configuration/AuditLogOptionsBindingTests.cs index d42f1f7d..f1393d16 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Configuration/AuditLogOptionsBindingTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Configuration/AuditLogOptionsBindingTests.cs @@ -3,6 +3,8 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; +using ZB.MOM.WW.ScadaBridge.AuditLog; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration; using ZB.MOM.WW.ScadaBridge.AuditLog.Redaction; using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; @@ -88,6 +90,55 @@ public class AuditLogOptionsBindingTests Assert.Equal("@token|@secret", ov.RedactSqlParamsMatching); } + [Fact] + public void PurgeOptions_Bind_FromDocumentedSectionAndKeys() + { + // AuditLog-013: the design doc (Component-AuditLog.md §Configuration) + // documents the purge tuning as the nested `AuditLog:Purge` section with + // keys `IntervalHours` + `ChannelPurgeBatchSize`. This test pins that the + // code binds from EXACTLY that shape — the section path the production + // code uses (ServiceCollectionExtensions.PurgeSectionName) AND the + // documented `ChannelPurgeBatchSize` key (mapped onto the + // ChannelPurgeBatchSizeConfigured backing property via + // [ConfigurationKeyName]). It would fail against the pre-fix code, where + // the binder looked for `ChannelPurgeBatchSizeConfigured` and silently + // ignored the documented key. + const string json = """ + { + "AuditLog": { + "Purge": { + "IntervalHours": 6, + "ChannelPurgeBatchSize": 1000 + } + } + } + """; + + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(json)); + var configuration = new ConfigurationBuilder() + .AddJsonStream(stream) + .Build(); + + // Section path matches production (PurgeSectionName == "AuditLog:Purge"). + Assert.Equal("AuditLog:Purge", ServiceCollectionExtensions.PurgeSectionName); + + var services = new ServiceCollection(); + services.AddOptions() + .Bind(configuration.GetSection(ServiceCollectionExtensions.PurgeSectionName)); + using var provider = services.BuildServiceProvider(); + + var opts = provider.GetRequiredService>().Value; + + // IntervalHours bound from the nested section (not the 24 h default). + Assert.Equal(6, opts.IntervalHours); + Assert.Equal(TimeSpan.FromHours(6), opts.Interval); + + // ChannelPurgeBatchSize bound via the documented key onto the backing + // property (not the 5000 default). + Assert.Equal(1000, opts.ChannelPurgeBatchSizeConfigured); + Assert.Equal(1000, opts.ChannelPurgeBatchSize); + } + [Fact] public void Filter_Behavior_Updates_OnConfigReload() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.Commons.Tests/Kpi/KpiSeriesBucketerTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Commons.Tests/Kpi/KpiSeriesBucketerTests.cs index 331cec89..9e40da55 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Commons.Tests/Kpi/KpiSeriesBucketerTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Commons.Tests/Kpi/KpiSeriesBucketerTests.cs @@ -183,6 +183,62 @@ public class KpiSeriesBucketerTests Assert.Equal(T(40), result[2].BucketStartUtc); } + // ----------------------------------------------------------------------- + // Unsorted input: last-in-iteration wins within a bucket (NOT largest timestamp) + // ----------------------------------------------------------------------- + + [Fact] + public void Bucket_UnsortedInput_SelectsLastInIterationNotLargestTimestamp() + { + // KpiHistory-006 regression: pins the documented contract that for unsorted input the + // bucketer selects the LAST point in iteration order within a bucket — it does NOT pick + // the largest-timestamp point. Both points below fall in bucket 0 ([0, 30)); the later + // one in iteration order, T(5)=value 1.0, arrives second, so it overwrites T(20)=value + // 99.0 even though T(20) has the larger timestamp. (For ascending-sorted input these + // coincide — last-in-iteration IS largest timestamp — which is why production is safe.) + var raw = new[] + { + new KpiSeriesPoint(T(20), 99.0), // larger timestamp, but encountered FIRST + new KpiSeriesPoint(T(5), 1.0), // smaller timestamp, but encountered LAST → wins + new KpiSeriesPoint(T(45), 7.0), // bucket 1 ([30, 60]) + }; + + // 60-minute window / 2 buckets → 30 min each. raw.Count (3) > maxPoints (2) → downsample. + var result = KpiSeriesBucketer.Bucket(raw, T(0), T(60), maxPoints: 2); + + Assert.Equal(2, result.Count); + // Bucket 0: last-in-iteration (value 1.0) wins, NOT the largest-timestamp point (99.0). + Assert.Equal(1.0, result[0].Value); + Assert.Equal(7.0, result[1].Value); + } + + // ----------------------------------------------------------------------- + // Short series (raw.Count <= maxPoints): returned unchanged → raw capture timestamps + // ----------------------------------------------------------------------- + + [Fact] + public void Bucket_ShortSeries_ReturnsRawCaptureTimestampsNotBucketBoundaries() + { + // KpiHistory-005/003 regression: the short-series early-return path returns the input + // unchanged, so each output point's BucketStartUtc is the RAW capture timestamp — NOT a + // bucket-boundary timestamp (which is what the downsample path emits, asserted by + // Bucket_BucketStartUtc_IsSetToBucketStartNotRawPointTimestamp). This pins the + // intentional difference between the two return paths. + var raw = new[] + { + new KpiSeriesPoint(T(7), 1.0), + new KpiSeriesPoint(T(23), 2.0), + }; + + // raw.Count (2) <= maxPoints (5) → early return, same reference. + var result = KpiSeriesBucketer.Bucket(raw, T(0), T(60), maxPoints: 5); + + Assert.Same(raw, result); + // Timestamps are the raw capture instants, not bucket starts (which would be T(0), T(12), …). + Assert.Equal(T(7), result[0].BucketStartUtc); + Assert.Equal(T(23), result[1].BucketStartUtc); + } + // ----------------------------------------------------------------------- // Right-edge: point exactly at toUtc lands in the last bucket // ----------------------------------------------------------------------- diff --git a/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/RepositoryCoverageTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/RepositoryCoverageTests.cs index bef4dd48..1fdbfdbc 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/RepositoryCoverageTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/RepositoryCoverageTests.cs @@ -940,6 +940,31 @@ public class DeploymentManagerRepositoryTests : IDisposable Assert.Equal("d-002", current!.DeploymentId); } + [Fact] + public async Task GetCurrentDeploymentStatus_SameTickRecords_DeterministicTiebreakerPicksHighestId() + { + // DeploymentManager-026: deployments are insert-only, so two records for the + // same instance can tie on DeployedAt when created within the same clock tick + // (rapid redeploy, or redeploy right after a timed-out attempt). Without a + // secondary sort key the "current" read is non-deterministic. The repository + // now orders by DeployedAt DESC, THEN Id DESC, so the most recently inserted + // (highest Id) row wins the tie deterministically. + var instance = await SeedInstanceAsync(); + var sameInstant = DateTimeOffset.UtcNow; + + await _repository.AddDeploymentRecordAsync( + new DeploymentRecord("d-old", "admin") { InstanceId = instance.Id, DeployedAt = sameInstant }); + await _repository.SaveChangesAsync(); + await _repository.AddDeploymentRecordAsync( + new DeploymentRecord("d-new", "admin") { InstanceId = instance.Id, DeployedAt = sameInstant }); + await _repository.SaveChangesAsync(); + + var current = await _repository.GetCurrentDeploymentStatusAsync(instance.Id); + Assert.NotNull(current); + // d-new was inserted second → it has the higher Id and must win the tie. + Assert.Equal("d-new", current!.DeploymentId); + } + [Fact] public async Task DeleteDeploymentRecord_ViaStubAttachPath_RemovesEntity() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs b/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs index b727106e..bce8f57e 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs @@ -231,4 +231,79 @@ public class DataConnectionActorAlarmTests : TestKit "", "", "", "", "", null, DateTimeOffset.UtcNow, "", "")); ExpectMsg(u => u.Transition.Kind == AlarmTransitionKind.SnapshotComplete); } + + // ── DataConnectionLayer-023: mid-flight alarm unsubscribe must release the adapter feed ── + + [Fact] + public async Task DCL023_UnsubscribeDuringInFlightAlarmSubscribe_ReleasesAdapterFeed_AndKeepsStateClean() + { + // Regression test for DataConnectionLayer-023. Previously the native-alarm + // subscribe path never inherited the DCL-021 obsolete-completion guard: if the + // last subscriber unsubscribed while the adapter alarm subscribe was in flight, + // HandleUnsubscribeAlarms could not tear down the feed (the subscription id was + // not stored yet), and the late AlarmSubscribeCompleted unconditionally stored + // _alarmSubscriptionIds[source] = id — an orphaned device-side alarm feed that + // streamed transitions to nobody for the lifetime of the adapter. After the fix, + // HandleUnsubscribeAlarms clears the in-flight marker and the late completion is + // recognized as orphaned (no subscriber remains) and released via + // UnsubscribeAlarmsAsync, with no leaked subscription id retained. + var subscribeStarted = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var releaseSubscribe = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + + var adapter = Substitute.For(); + adapter.ConnectAsync(Arg.Any>(), Arg.Any()) + .Returns(Task.CompletedTask); + var alarmable = (IAlarmSubscribableConnection)adapter; + // Park the adapter subscribe so UnsubscribeAlarmsRequest is processed first. + alarmable.SubscribeAlarmsAsync(Arg.Any(), Arg.Any(), + Arg.Any(), Arg.Any()) + .Returns(_ => + { + subscribeStarted.TrySetResult(); + return releaseSubscribe.Task; + }); + alarmable.UnsubscribeAlarmsAsync(Arg.Any(), Arg.Any()) + .Returns(Task.CompletedTask); + + var actor = Sys.ActorOf(Props.Create(() => new DataConnectionActor( + "conn", adapter, _options, _health, _factory, "OpcUa"))); + + // Subscribe a source — block on the parked adapter subscribe. + actor.Tell(new SubscribeAlarmsRequest("c1", "instA", "conn", "Tank01", null, DateTimeOffset.UtcNow)); + // The immediate SubscribeAlarmsResponse only arrives after HandleAlarmSubscribeCompleted; + // since the subscribe is parked, none is expected yet. + await subscribeStarted.Task.WaitAsync(TimeSpan.FromSeconds(5)); + + // The last subscriber unsubscribes while the alarm subscribe is still in flight. + actor.Tell(new UnsubscribeAlarmsRequest("unsub-c1", "instA", "conn", "Tank01", DateTimeOffset.UtcNow)); + await Task.Delay(150); + + // Release the parked subscribe — AlarmSubscribeCompleted is now orphaned. + releaseSubscribe.SetResult("alarm-sub-orphan"); + await Task.Delay(300); + + // The orphaned, just-created adapter feed must be released exactly once. + await alarmable.Received(1).UnsubscribeAlarmsAsync( + Arg.Is(s => s == "alarm-sub-orphan"), Arg.Any()); + + // No leaked subscription id must remain: a fresh subscribe to the same source + // must open a NEW adapter feed (proving _alarmSubscriptionIds was not populated + // with the orphaned id, which would have short-circuited the adapter subscribe). + var resubStarted = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + alarmable.SubscribeAlarmsAsync(Arg.Any(), Arg.Any(), + Arg.Any(), Arg.Any()) + .Returns(_ => + { + resubStarted.TrySetResult(); + return Task.FromResult("alarm-sub-2"); + }); + + actor.Tell(new SubscribeAlarmsRequest("c2", "instB", "conn", "Tank01", null, DateTimeOffset.UtcNow)); + await resubStarted.Task.WaitAsync(TimeSpan.FromSeconds(5)); + ExpectMsg(m => m.Success, TimeSpan.FromSeconds(5)); + + // Two distinct adapter subscribes total (the orphaned one + the fresh one). + await alarmable.Received(2).SubscribeAlarmsAsync( + "Tank01", Arg.Any(), Arg.Any(), Arg.Any()); + } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs b/tests/ZB.MOM.WW.ScadaBridge.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs index 47487b91..fe41d646 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.DeploymentManager.Tests/ArtifactDeploymentServiceTests.cs @@ -32,6 +32,9 @@ public class ArtifactDeploymentServiceTests : TestKit _deploymentRepo = Substitute.For(); _templateRepo = Substitute.For(); _externalSystemRepo = Substitute.For(); + // DeploymentManager-025/-027: the notification repo is retained only so the + // tests can assert the artifact path NEVER touches it (it is no longer a + // constructor dependency of ArtifactDeploymentService). _notificationRepo = Substitute.For(); _audit = Substitute.For(); } @@ -149,9 +152,8 @@ public class ArtifactDeploymentServiceTests : TestKit { // DeploymentManager-023: previously each per-site iteration of the deploy-many // loop re-issued the global artifact queries (shared scripts, external systems, - // DB connections, notification lists, SMTP configs) — a textbook N+1 over the - // global sets. With three sites the queries must now be issued ONCE in total, - // regardless of site count. + // DB connections) — a textbook N+1 over the global sets. With three sites the + // queries must now be issued ONCE in total, regardless of site count. var sites = new List { new("Site One", "site-1") { Id = 1 }, @@ -170,8 +172,16 @@ public class ArtifactDeploymentServiceTests : TestKit await _templateRepo.Received(1).GetAllSharedScriptsAsync(Arg.Any()); await _externalSystemRepo.Received(1).GetAllExternalSystemsAsync(Arg.Any()); await _externalSystemRepo.Received(1).GetAllDatabaseConnectionsAsync(Arg.Any()); - await _notificationRepo.Received(1).GetAllNotificationListsAsync(Arg.Any()); - await _notificationRepo.Received(1).GetAllSmtpConfigurationsAsync(Arg.Any()); + // DeploymentManager-025/-027: notification lists and SMTP configuration are + // central-only — the artifact path must NEVER fetch them, and the per-site + // command must NEVER carry them. + await _notificationRepo.DidNotReceive().GetAllNotificationListsAsync(Arg.Any()); + await _notificationRepo.DidNotReceive().GetAllSmtpConfigurationsAsync(Arg.Any()); + Assert.All(recorder.Received, cmd => + { + Assert.Null(cmd.NotificationLists); + Assert.Null(cmd.SmtpConfigurations); + }); // The per-site query (data connections) DOES vary per site and must still run // once per site. await _siteRepo.Received(1).GetDataConnectionsBySiteIdAsync(1, Arg.Any()); @@ -197,8 +207,14 @@ public class ArtifactDeploymentServiceTests : TestKit await _templateRepo.Received(1).GetAllSharedScriptsAsync(Arg.Any()); await _externalSystemRepo.Received(1).GetAllExternalSystemsAsync(Arg.Any()); await _externalSystemRepo.Received(1).GetAllDatabaseConnectionsAsync(Arg.Any()); - await _notificationRepo.Received(1).GetAllNotificationListsAsync(Arg.Any()); - await _notificationRepo.Received(1).GetAllSmtpConfigurationsAsync(Arg.Any()); + // DeploymentManager-025/-027: central-only — never fetched, never shipped. + await _notificationRepo.DidNotReceive().GetAllNotificationListsAsync(Arg.Any()); + await _notificationRepo.DidNotReceive().GetAllSmtpConfigurationsAsync(Arg.Any()); + Assert.All(recorder.Received, cmd => + { + Assert.Null(cmd.NotificationLists); + Assert.Null(cmd.SmtpConfigurations); + }); } [Fact] diff --git a/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs index c77da622..7ecba161 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs @@ -370,7 +370,7 @@ public class DatabaseGatewayTests [MemberData(nameof(TransientNonSqlOutages))] public async Task CachedWrite_NonSqlOutage_ClassifiedTransient_BuffersNotCrash(Exception outage) { - // [1] A live outage that is NOT a SqlException must be classified TRANSIENT + // [3] A live outage that is NOT a SqlException must be classified TRANSIENT // (buffered for retry), NOT escape unclassified to crash the script actor, // and NOT be returned as a permanent Failed result. var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") @@ -401,7 +401,7 @@ public class DatabaseGatewayTests [Fact] public async Task CachedWrite_CancellationRequested_PropagatesOperationCanceled_NotReclassified() { - // [2] OperationCanceledException raised while the caller's token is + // [1] OperationCanceledException raised while the caller's token is // cancelled must propagate UNCHANGED — never reclassified as a transient // DB error and never buffered. Mirrors the HTTP path's first catch: // `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;` @@ -449,7 +449,7 @@ public class DatabaseGatewayTests [Fact] public async Task DeliverBuffered_NonSqlOutage_RethrowsAsTransient_SoEngineRetries() { - // [1] on the RETRY path: a non-SqlException outage during delivery must be + // [3] on the RETRY path: a non-SqlException outage during delivery must be // classified transient and propagate (as TransientDatabaseException) so // the S&F engine schedules another retry — it must NOT crash/park. var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 }; @@ -473,6 +473,71 @@ public class DatabaseGatewayTests () => gateway.DeliverBufferedAsync(message)); } + // ── ExternalSystemGateway-025: a caller-token cancel that surfaces from the SQL + // driver as a SqlException (mid-flight cancel) must propagate as + // OperationCanceledException — never reclassified as a permanent DB error. + // The fix re-checks the caller's token at the TOP of `catch (SqlException)` + // via cancellationToken.ThrowIfCancellationRequested(), so the cancel wins + // regardless of the driver's exception shape (version-independent). ── + + [Fact] + public async Task CachedWrite_CancellationSurfacingAsSqlException_PropagatesCanceled_NotReclassifiedPermanent() + { + var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 }; + StubConnection(conn); + + var (sf, connStr, keepAlive) = NewStoreAndForward(); + using var _ = keepAlive; + + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + // The SQL driver raises a SqlException on a mid-flight cancel (error + // number 0, not in the transient set — pre-fix it was reclassified as a + // PERMANENT DB error). The raw seam throws it through the production + // ExecuteWriteAsync classification so the new ThrowIfCancellationRequested + // guard at the top of `catch (SqlException)` runs end-to-end. + var sqlException = FabricateSqlException("Operation cancelled by user.", number: 0); + var gateway = new RawExecuteStubGateway(_repository, sf, onRunSql: () => throw sqlException); + + await Assert.ThrowsAsync( + () => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)", cancellationToken: cts.Token)); + + // The cancel won — it must NOT have been classified as transient (buffered) + // nor returned as a permanent Failed result. + Assert.Equal(0, ReadBufferDepth(connStr)); + } + + /// + /// Fabricates a with a given + /// message and error number via the driver's internal CreateException + /// factory (the type has no public constructor). Used only to drive the + /// catch (SqlException) branch of ExecuteWriteAsync in tests. + /// + private static Microsoft.Data.SqlClient.SqlException FabricateSqlException(string message, int number) + { + var errorCtor = typeof(Microsoft.Data.SqlClient.SqlError).GetConstructors( + System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic) + .First(c => c.GetParameters().Length == 8 + && c.GetParameters()[7].ParameterType == typeof(Exception)); + var sqlError = (Microsoft.Data.SqlClient.SqlError)errorCtor.Invoke( + new object?[] { number, (byte)0, (byte)0, "server", message, "procedure", 0, null }); + + var collectionCtor = typeof(Microsoft.Data.SqlClient.SqlErrorCollection).GetConstructors( + System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic).First(); + var collection = (Microsoft.Data.SqlClient.SqlErrorCollection)collectionCtor.Invoke(Array.Empty()); + typeof(Microsoft.Data.SqlClient.SqlErrorCollection) + .GetMethod("Add", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic)! + .Invoke(collection, new object?[] { sqlError }); + + var createException = typeof(Microsoft.Data.SqlClient.SqlException).GetMethod( + "CreateException", + System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic, + new[] { typeof(Microsoft.Data.SqlClient.SqlErrorCollection), typeof(string) })!; + return (Microsoft.Data.SqlClient.SqlException)createException.Invoke( + null, new object?[] { collection, "6.0.0" })!; + } + /// /// Reads the current buffered-message count off the S&F SQLite DB by /// counting sf_messages rows (the engine's persistence table). diff --git a/tests/ZB.MOM.WW.ScadaBridge.KpiHistory.Tests/KpiHistoryRecorderActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.KpiHistory.Tests/KpiHistoryRecorderActorTests.cs index 80f21421..27f4bf60 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.KpiHistory.Tests/KpiHistoryRecorderActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.KpiHistory.Tests/KpiHistoryRecorderActorTests.cs @@ -142,6 +142,40 @@ public class KpiHistoryRecorderActorTests : TestKit } } + /// + /// Repository fake whose blocks on a manual-reset gate + /// until the test releases it, holding a sample pass "in flight" so an overlapping-tick + /// scenario can be driven deterministically. Counts how many times the write was entered. + /// + private sealed class GatedRepository : IKpiHistoryRepository + { + private readonly ManualResetEventSlim _release = new(initialState: false); + private int _writeCount; + + /// Number of times has been entered. + public int WriteCount => Volatile.Read(ref _writeCount); + + /// Releases all blocked writes so the gated passes can complete. + public void Release() => _release.Set(); + + public Task RecordSamplesAsync( + IReadOnlyCollection samples, CancellationToken cancellationToken = default) + { + Interlocked.Increment(ref _writeCount); + // Block on a threadpool thread (PipeTo runs the pass off the actor thread), holding + // the pass in flight until the test opens the gate. + return Task.Run(() => _release.Wait(cancellationToken), cancellationToken); + } + + public Task> GetRawSeriesAsync( + string source, string metric, string scope, string? scopeKey, + DateTime fromUtc, DateTime toUtc, CancellationToken cancellationToken = default) => + Task.FromResult>(Array.Empty()); + + public Task PurgeOlderThanAsync(DateTime before, CancellationToken cancellationToken = default) => + Task.FromResult(0); + } + private IServiceProvider BuildServiceProvider( IKpiHistoryRepository repository, params IKpiSampleSource[] sources) { @@ -244,11 +278,62 @@ public class KpiHistoryRecorderActorTests : TestKit duration: TimeSpan.FromSeconds(2), interval: TimeSpan.FromMilliseconds(50)); - // Second tick to the SAME actor: source now returns a healthy sample. - // AwaitAssert confirms the actor processed the message and recorded it. + // Second tick to the SAME actor: source now returns a healthy sample. Re-send the tick + // on each poll so we don't race the (asynchronous) SampleComplete that lowers the + // in-flight guard after the faulted first pass — a tick that lands before the guard + // clears is harmlessly skipped, and the next poll's tick runs. The recovered sample + // being recorded proves the actor's message loop is still alive after a faulted pass. + // (>= 1 rather than exactly-1: a later re-sent tick could run an extra recovering pass + // once the guard clears, which is not what this test pins — KH-001's one-pass-per-tick + // guard is pinned by OverlappingTick_WhileFirstPassInFlight_DoesNotStartSecondPass.) + AwaitAssert( + () => + { + actor.Tell(KpiHistoryRecorderActor.SampleTick.Instance); + Assert.NotEmpty(repository.Recorded); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + [Fact] + public void OverlappingTick_WhileFirstPassInFlight_DoesNotStartSecondPass() + { + // KpiHistory-001 regression: the in-flight guard must coalesce a tick that arrives + // while a prior sample pass is still awaiting its DB write. With a gated repository + // holding the first write open, a second SampleTick must NOT spawn a second pass — + // so RecordSamplesAsync is entered exactly once until the gate is released. + var repository = new GatedRepository(); + var sp = BuildServiceProvider(repository, new HealthySource()); + var actor = CreateActor(sp); + + // First tick: raises the guard, enters the (gated, blocking) write — held in flight. actor.Tell(KpiHistoryRecorderActor.SampleTick.Instance); AwaitAssert( - () => Assert.Single(repository.Recorded), + () => Assert.Equal(1, repository.WriteCount), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // Second tick while the first pass is still in flight: must be skipped by the guard. + actor.Tell(KpiHistoryRecorderActor.SampleTick.Instance); + + // Give the second tick ample time to (wrongly) start a pass; the write count must + // stay at 1, proving no second concurrent pass was launched. + Assert.Equal(1, repository.WriteCount); + Thread.Sleep(300); + Assert.Equal(1, repository.WriteCount); + + // Release the gate so the first pass completes and the guard is lowered; a fresh + // tick must now run a new pass (guard correctly reset, not stuck). Re-send the tick on + // each poll so we don't race the (asynchronous) SampleComplete that lowers the guard — + // a tick that lands before the guard clears is harmlessly skipped, the next one runs. + repository.Release(); + AwaitAssert( + () => + { + actor.Tell(KpiHistoryRecorderActor.SampleTick.Instance); + Assert.Equal(2, repository.WriteCount); + }, duration: TimeSpan.FromSeconds(3), interval: TimeSpan.FromMilliseconds(50)); } diff --git a/tests/ZB.MOM.WW.ScadaBridge.ManagementService.Tests/RoleMappingValidationTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ManagementService.Tests/RoleMappingValidationTests.cs new file mode 100644 index 00000000..50a8c7e2 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.ManagementService.Tests/RoleMappingValidationTests.cs @@ -0,0 +1,117 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using NSubstitute; +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Security; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; +using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management; +using ZB.MOM.WW.ScadaBridge.Security; + +namespace ZB.MOM.WW.ScadaBridge.ManagementService.Tests; + +/// +/// Security-023 (membership-validation half): the LDAP-group-mapping Role arrives +/// as a free string over the CLI/Management API. The single server-side write path +/// (ManagementActor.HandleCreateRoleMapping / HandleUpdateRoleMapping) now +/// rejects any role that is not in the canonical set, returning a +/// COMMAND_FAILED error before any DB write. A non-canonical role never functioned +/// (no authorization policy or ManagementActor check matched it), so rejecting it removes +/// a silent-misconfiguration footgun rather than changing behaviour. +/// +/// +/// Per Security-023's scope: only the membership check is added. The pre-existing +/// case-sensitivity asymmetry (case-sensitive UI RequireClaim vs case-insensitive +/// ManagementActor authz) is a separately-deferred change and is NOT touched — the +/// membership check accepts a correctly-cased canonical role and stores it verbatim. +/// +public class RoleMappingValidationTests : TestKit, IDisposable +{ + private readonly ISecurityRepository _securityRepo; + private readonly IAuditService _auditService; + private readonly ServiceCollection _services; + + public RoleMappingValidationTests() + { + _securityRepo = Substitute.For(); + _auditService = Substitute.For(); + + _services = new ServiceCollection(); + _services.AddScoped(_ => _securityRepo); + _services.AddScoped(_ => _auditService); + } + + private IActorRef CreateActor() + { + var sp = _services.BuildServiceProvider(); + return Sys.ActorOf(Props.Create(() => new ManagementActor( + sp, NullLogger.Instance))); + } + + // CreateRoleMappingCommand / UpdateRoleMappingCommand both require Administrator + // (see ManagementActor.GetRequiredRole), so the test principal carries that role. + private static ManagementEnvelope Envelope(object command) => + new(new AuthenticatedUser("admin", "admin", new[] { Roles.Administrator }, Array.Empty()), + command, Guid.NewGuid().ToString("N")); + + void IDisposable.Dispose() => Shutdown(); + + [Fact] + public void CreateRoleMapping_UnknownRole_ReturnsError_NoRowInserted() + { + var actor = CreateActor(); + actor.Tell(Envelope(new CreateRoleMappingCommand("SCADA-Admins", "Wizard"))); + + var response = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal("COMMAND_FAILED", response.ErrorCode); + Assert.Contains("Wizard", response.Error); + _securityRepo.DidNotReceiveWithAnyArgs().AddMappingAsync(default!, default); + } + + [Fact] + public void CreateRoleMapping_MisspelledCanonicalRole_ReturnsError_NoRowInserted() + { + // A pure typo of a real role ("Deploer" for "Deployer") is exactly the silent + // footgun Security-023 describes: it would stamp a role claim no policy matches. + var actor = CreateActor(); + actor.Tell(Envelope(new CreateRoleMappingCommand("SCADA-Deploy", "Deploer"))); + + var response = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal("COMMAND_FAILED", response.ErrorCode); + _securityRepo.DidNotReceiveWithAnyArgs().AddMappingAsync(default!, default); + } + + [Fact] + public void UpdateRoleMapping_UnknownRole_ReturnsError_NoRowUpdated() + { + var actor = CreateActor(); + actor.Tell(Envelope(new UpdateRoleMappingCommand(7, "SCADA-Admins", "Wizard"))); + + var response = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal("COMMAND_FAILED", response.ErrorCode); + Assert.Contains("Wizard", response.Error); + // Rejected before the mapping is even looked up. + _securityRepo.DidNotReceiveWithAnyArgs().GetMappingByIdAsync(default, default); + _securityRepo.DidNotReceiveWithAnyArgs().UpdateMappingAsync(default!, default); + } + + [Fact] + public void CreateRoleMapping_CanonicalRole_Succeeds_RowInserted() + { + LdapGroupMapping? inserted = null; + _securityRepo + .When(r => r.AddMappingAsync(Arg.Any(), Arg.Any())) + .Do(ci => inserted = ci.Arg()); + + var actor = CreateActor(); + actor.Tell(Envelope(new CreateRoleMappingCommand("SCADA-Deploy", Roles.Deployer))); + + ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.NotNull(inserted); + // Membership check does not alter the stored value's casing (case-sensitivity + // asymmetry is a separately-deferred change). + Assert.Equal(Roles.Deployer, inserted!.Role); + Assert.Equal("SCADA-Deploy", inserted.LdapGroupName); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/Kpi/NotificationOutboxKpiSampleSourceTests.cs b/tests/ZB.MOM.WW.ScadaBridge.NotificationOutbox.Tests/Kpi/NotificationOutboxKpiSampleSourceTests.cs similarity index 98% rename from tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/Kpi/NotificationOutboxKpiSampleSourceTests.cs rename to tests/ZB.MOM.WW.ScadaBridge.NotificationOutbox.Tests/Kpi/NotificationOutboxKpiSampleSourceTests.cs index c7f24d50..87747bc1 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/Kpi/NotificationOutboxKpiSampleSourceTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.NotificationOutbox.Tests/Kpi/NotificationOutboxKpiSampleSourceTests.cs @@ -1,13 +1,11 @@ -using Microsoft.Extensions.Options; using NSubstitute; using ZB.MOM.WW.ScadaBridge.Commons.Entities.Kpi; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; using ZB.MOM.WW.ScadaBridge.Commons.Types.Kpi; using ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications; -using ZB.MOM.WW.ScadaBridge.NotificationOutbox; using ZB.MOM.WW.ScadaBridge.NotificationOutbox.Kpi; -namespace ZB.MOM.WW.ScadaBridge.NotificationService.Tests.Kpi; +namespace ZB.MOM.WW.ScadaBridge.NotificationOutbox.Tests.Kpi; /// /// Tests for — the M6 KPI sample source that diff --git a/tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/SmtpErrorClassifierTests.cs b/tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/SmtpErrorClassifierTests.cs index 8a4b53e0..ab02af3d 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/SmtpErrorClassifierTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.NotificationService.Tests/SmtpErrorClassifierTests.cs @@ -8,8 +8,8 @@ namespace ZB.MOM.WW.ScadaBridge.NotificationService.Tests; /// NS-002/NS-003: Tests for the shared SMTP error classification policy. This /// policy is correctness-relevant — it decides whether a delivery failure is /// retried (transient) or returned to the caller (permanent) — and is shared -/// between and the central outbox's -/// EmailNotificationDeliveryAdapter, so it deserves direct coverage. +/// by the central outbox's EmailNotificationDeliveryAdapter, so it +/// deserves direct coverage. /// public class SmtpErrorClassifierTests { diff --git a/tests/ZB.MOM.WW.ScadaBridge.ScriptAnalysis.Tests/ScriptTrustValidatorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ScriptAnalysis.Tests/ScriptTrustValidatorTests.cs index be77f8c2..34ee3a0d 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.ScriptAnalysis.Tests/ScriptTrustValidatorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.ScriptAnalysis.Tests/ScriptTrustValidatorTests.cs @@ -193,4 +193,132 @@ public class ScriptTrustValidatorTests var code = "var n = System.Linq.Enumerable.Range(0,3).Sum(); var m = System.Math.Max(1,2);"; Assert.Empty(ScriptTrustValidator.FindViolations(code)); } + + // ---- ScriptAnalysis-003: adversarial bypass-vector coverage -------------- + + // (a) TPA-FALLBACK DEGRADATION (the SA-001 hole). Forces Pass 1 onto the + // minimal fallback reference set (DefaultAssemblies + ForbiddenAnchorAssemblies) + // — the set used on a single-file/AOT/trimmed host with no TPA list — and + // proves a BARE forbidden type inside an ALLOWED namespace is STILL caught. + // Before the fix, `Process` resolved to nothing against the minimal set, the + // syntactic fallback ignored the dotless identifier, and Pass 2 never flags a + // bare identifier — so `Process.Start` slipped the validator entirely. The + // anchor assemblies folded into the fallback close that hole. + + [Fact] + public void TpaFallback_StillRejects_BareProcess_ViaUsing() + { + // The documented forbidden-type-in-allowed-namespace case: System.Diagnostics + // is allowed (Stopwatch/Debug), the `using` is not flagged, and `Process` + // is a BARE identifier. Against the minimal fallback set this must still + // be rejected — otherwise the SA-001 fallback hole is open. + var minimal = ScriptTrustPolicy.BuildMinimalFallbackReferences(); + var code = "using System.Diagnostics; var p = Process.Start(\"x\");"; + Assert.NotEmpty(ScriptTrustValidator.FindViolations(code, minimal)); + } + + [Fact] + public void TpaFallback_StillRejects_BareSocket_ViaUsing() + { + // System.Net.Sockets.Socket lives in its own assembly (not CoreLib); the + // anchor set must include it so the minimal fallback still resolves a bare + // `Socket` reference. + var minimal = ScriptTrustPolicy.BuildMinimalFallbackReferences(); + var code = "using System.Net.Sockets; Socket s = null;"; + Assert.NotEmpty(ScriptTrustValidator.FindViolations(code, minimal)); + } + + [Fact] + public void TpaFallback_StillAllows_DiagnosticsStopwatch() + { + // Control: the fallback must not over-block — Stopwatch (System.Diagnostics, + // allowed) stays clean even against the minimal anchor-enriched set. + var minimal = ScriptTrustPolicy.BuildMinimalFallbackReferences(); + var code = "var sw = System.Diagnostics.Stopwatch.StartNew();"; + Assert.Empty(ScriptTrustValidator.FindViolations(code, minimal)); + } + + [Fact] + public void MinimalFallbackReferences_Resolve_Process_AsForbidden() + { + // Pins the resolution mechanism directly: against the minimal fallback set, + // bare `Process` resolves to its true namespace and is reported by the + // semantic pass (the message names the forbidden scope), not merely caught + // by some incidental syntactic rule. + var minimal = ScriptTrustPolicy.BuildMinimalFallbackReferences(); + var violations = ScriptTrustValidator.FindViolations( + "using System.Diagnostics; var p = Process.Start(\"x\");", minimal); + Assert.Contains(violations, v => v.Contains("System.Diagnostics.Process", StringComparison.Ordinal)); + } + + // (b) EXTENSION-METHOD invocation of a forbidden API. `asm.GetCustomAttribute()` + // resolves to the extension method on System.Reflection.CustomAttributeExtensions + // (a forbidden namespace) even though it is invoked in receiver position — the + // semantic pass resolves the reduced extension method's containing type, so the + // forbidden namespace is caught through the invocation itself. + [Fact] + public void Rejects_ExtensionMethod_InForbiddenNamespace() + { + var code = + "using System.Reflection; " + + "Assembly asm = typeof(string).Assembly; " + + "var a = asm.GetCustomAttribute();"; + Assert.NotEmpty(ScriptTrustValidator.FindViolations(code)); + } + + // (c) VERBATIM (@) and UNICODE-ESCAPE spellings of a forbidden identifier. + // VisitIdentifierName compares Identifier.ValueText, which decodes both the + // verbatim '@' prefix and \uXXXX escapes — so neither spelling evades the + // ForbiddenIdentifiers deny-list. + [Fact] + public void Rejects_VerbatimIdentifier_Activator() + { + var code = "var o = @Activator.CreateInstance(typeof(string));"; + Assert.NotEmpty(ScriptTrustValidator.FindViolations(code)); + } + + [Fact] + public void Rejects_UnicodeEscapedIdentifier_Activator() + { + // 'A' is 'A' — the token spells "Activator" but ValueText is "Activator". + var code = "var o = \\u0041ctivator.CreateInstance(typeof(string));"; + Assert.NotEmpty(ScriptTrustValidator.FindViolations(code)); + } + + // (d) UNSAFE block. The validator is a forbidden-API deny-list, not a + // language-feature gate: a benign `unsafe` block reaches no forbidden API, so + // it must NOT be a false positive — while a forbidden API used INSIDE an unsafe + // block is still caught (the walker descends into the block). + [Fact] + public void Allows_BenignUnsafeBlock_NoForbiddenApi() + { + var code = "unsafe { int x = 1; int* p = &x; var y = *p; }"; + Assert.Empty(ScriptTrustValidator.FindViolations(code)); + } + + [Fact] + public void Rejects_ForbiddenApi_InsideUnsafeBlock() + { + var code = "unsafe { var t = typeof(string).Assembly; }"; + Assert.NotEmpty(ScriptTrustValidator.FindViolations(code)); + } + + // (e) COMMENT / STRING-LITERAL must NOT cause a false positive. A forbidden + // namespace mentioned only in trivia or a string literal reaches no API and + // must stay clean (the walker inspects name/member nodes, never trivia or + // literal text). Reconstructing a forbidden API from runtime strings is outside + // the static validator's remit (documented sandbox caveat). + [Fact] + public void Allows_ForbiddenNamespace_InCommentOnly() + { + var code = "// using System.IO; File.ReadAllText(\"x\")\nvar y = 1;"; + Assert.Empty(ScriptTrustValidator.FindViolations(code)); + } + + [Fact] + public void Allows_ForbiddenNamespace_InStringLiteralOnly() + { + var code = "var s = \"System.IO.File.ReadAllText\";"; + Assert.Empty(ScriptTrustValidator.FindViolations(code)); + } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.Security.Tests/SecurityTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Security.Tests/SecurityTests.cs index 72037388..79c3367b 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Security.Tests/SecurityTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Security.Tests/SecurityTests.cs @@ -71,12 +71,16 @@ public class LdapAuthServiceTests [Fact] public async Task AuthenticateAsync_ConnectionFailure_FailsClosed_NeverThrows() { - // Point at a non-existent server: the library fails closed (never throws) and - // maps the unreachable directory to the system-side ServiceAccountBindFailed - // bucket — preserving the donor's "directory unavailable ⇒ login fails" rule. + // Security-025: point at a guaranteed-unroutable loopback address (127.0.0.1 on a + // closed high port) rather than DNS-resolving "nonexistent.invalid". The connect is + // refused deterministically and immediately, with no external DNS dependency and no + // multi-second timeout dead-time, so this stays a fast, network-sandbox-safe unit + // test. The library still fails closed (never throws) and maps the unreachable + // directory to the system-side ServiceAccountBindFailed bucket — preserving the + // donor's "directory unavailable ⇒ login fails" rule, which is what this asserts. var options = CreateOptions(LdapTransport.None, allowInsecure: true) with { - Server = "nonexistent.invalid", + Server = "127.0.0.1", Port = 9999, ConnectionTimeoutMs = 2_000, }; @@ -1159,6 +1163,85 @@ public class AuthorizationPolicyTests Assert.True(await EvaluatePolicy(AuthorizationPolicies.OperationalAudit, principal)); } + // ───────────────────────────────────────────────────────────────────── + // Security-024 — M7 two-person Secured Write separation-of-duties (SoD) + // policies. The whole safety argument of Secured Writes is that the role + // that INITIATES (Operator → RequireOperator) is distinct from the role + // that APPROVES (Verifier → RequireVerifier). These functional + // AuthorizeAsync tests prove the grant/deny behaviour and the mutual + // distinctness — a regression that mapped, say, RequireOperator to + // Roles.Verifier would now fail here instead of silently collapsing the + // SoD (the prior coverage only asserted the constant string values). + // ───────────────────────────────────────────────────────────────────── + + [Fact] + public async Task OperatorPolicy_OperatorRole_Succeeds() + { + var principal = CreatePrincipal(new[] { Roles.Operator }); + Assert.True(await EvaluatePolicy(AuthorizationPolicies.RequireOperator, principal)); + } + + [Theory] + [InlineData("Verifier")] + [InlineData("Administrator")] + [InlineData("Designer")] + [InlineData("Deployer")] + [InlineData("Viewer")] + public async Task OperatorPolicy_NonOperatorRoles_Fail(string role) + { + var principal = CreatePrincipal(new[] { role }); + Assert.False(await EvaluatePolicy(AuthorizationPolicies.RequireOperator, principal)); + } + + [Fact] + public async Task OperatorPolicy_NoRoles_Fails() + { + var principal = CreatePrincipal(Array.Empty()); + Assert.False(await EvaluatePolicy(AuthorizationPolicies.RequireOperator, principal)); + } + + [Fact] + public async Task VerifierPolicy_VerifierRole_Succeeds() + { + var principal = CreatePrincipal(new[] { Roles.Verifier }); + Assert.True(await EvaluatePolicy(AuthorizationPolicies.RequireVerifier, principal)); + } + + [Theory] + [InlineData("Operator")] + [InlineData("Administrator")] + [InlineData("Designer")] + [InlineData("Deployer")] + [InlineData("Viewer")] + public async Task VerifierPolicy_NonVerifierRoles_Fail(string role) + { + var principal = CreatePrincipal(new[] { role }); + Assert.False(await EvaluatePolicy(AuthorizationPolicies.RequireVerifier, principal)); + } + + [Fact] + public async Task VerifierPolicy_NoRoles_Fails() + { + var principal = CreatePrincipal(Array.Empty()); + Assert.False(await EvaluatePolicy(AuthorizationPolicies.RequireVerifier, principal)); + } + + [Fact] + public async Task SecuredWriteSoD_OperatorPrincipalCannotSatisfyVerifier_AndViceVersa() + { + // The SoD invariant at the policy layer: an Operator-only principal can + // initiate (RequireOperator) but cannot approve (RequireVerifier), and a + // Verifier-only principal is the mirror. The two policies are mutually + // distinct, so a single role can never both initiate and approve. + var operatorOnly = CreatePrincipal(new[] { Roles.Operator }); + Assert.True(await EvaluatePolicy(AuthorizationPolicies.RequireOperator, operatorOnly)); + Assert.False(await EvaluatePolicy(AuthorizationPolicies.RequireVerifier, operatorOnly)); + + var verifierOnly = CreatePrincipal(new[] { Roles.Verifier }); + Assert.True(await EvaluatePolicy(AuthorizationPolicies.RequireVerifier, verifierOnly)); + Assert.False(await EvaluatePolicy(AuthorizationPolicies.RequireOperator, verifierOnly)); + } + private static ClaimsPrincipal CreatePrincipal(string[] roles, string[]? siteIds = null) { var claims = new List(); diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs index 78b63f1b..611da1a4 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs @@ -1,5 +1,6 @@ using Akka.Actor; using Akka.TestKit.Xunit2; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging.Abstractions; using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; @@ -177,4 +178,48 @@ public class SiteCallAuditPurgeTests : TestKit duration: TimeSpan.FromSeconds(3), interval: TimeSpan.FromMilliseconds(50)); } + + // --------------------------------------------------------------------- + // 4. SiteCallAudit-007: purge timer arms even when the reconciliation + // collaborators are ABSENT (production ctor, no IPullSiteCallsClient / + // ISiteEnumerator registered). Proves the decoupling — a host that omits + // the reconciliation client still purges, so the central SiteCalls table + // cannot grow unbounded. + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_ProductionCtor_NoReconciliationCollaborators_StillPurges() + { + var repo = new RecordingRepo { RowsDeletedPerCall = 3 }; + + // Build a DI container that registers the repository the production + // ctor resolves per-tick, but deliberately registers NEITHER + // IPullSiteCallsClient NOR ISiteEnumerator. GetService returns null for + // both, so the actor's reconciliation tick is gated off — but the purge + // tick must still arm (SiteCallAudit-007). + var provider = new ServiceCollection() + .AddScoped(_ => repo) + .BuildServiceProvider(); + + var options = FastPurgeOptions(retentionDays: 30); + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + provider, + options, + NullLogger.Instance))); + + // No reconciliation collaborators were registered, yet the purge tick + // must still fire on the production path. + AwaitAssert( + () => Assert.True(repo.PurgeThresholds.Count >= 1, + "purge timer must arm even when the reconciliation collaborators are absent " + + $"(SiteCallAudit-007), got {repo.PurgeThresholds.Count} purge calls"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + var threshold = repo.PurgeThresholds[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(30); + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"purge threshold {threshold:o} should be within 1 minute of {expected:o}"); + } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs index 22244e50..9912456e 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs @@ -107,6 +107,30 @@ public class SiteCallAuditReconciliationTests : TestKit } } + /// + /// Pull client that ALWAYS returns the same saturated response + /// (MoreAvailable=true) regardless of the since cursor — + /// simulates the SiteCallAudit-009 single-timestamp no-progress pin: a backlog + /// larger than the batch size all sharing one exact UpdatedAtUtc, so + /// the inclusive max-timestamp cursor never advances. Records every call so + /// the test can assert the within-tick drain is BOUNDED (the actor must not + /// spin the dispatcher forever on this pathological input). + /// + private sealed class SaturatedPinPullClient : IPullSiteCallsClient + { + private readonly IReadOnlyList _rows; + public int CallCount { get; private set; } + + public SaturatedPinPullClient(IReadOnlyList rows) => _rows = rows; + + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + CallCount++; + return Task.FromResult(new PullSiteCallsResponse(_rows, MoreAvailable: true)); + } + } + /// /// Recording repository that captures every call /// (keyed by id, last-write-wins on the captured row). The reconciliation @@ -301,4 +325,115 @@ public class SiteCallAuditReconciliationTests : TestKit // so it upserts nothing on its own. Assert.Equal(0, repo.UpsertCallCount); } + + // --------------------------------------------------------------------- + // 5. SiteCallAudit-009: MoreAvailable drives a within-tick continuation + // drain — a multi-page backlog whose timestamps advance is fully drained + // in ONE tick rather than one page per tick. + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_MoreAvailable_DrainsMultiplePagesWithinOneTick() + { + var siteId = "siteA"; + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + var p1a = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t1); + var p1b = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t2); + var p2 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t3); + + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://siteA:8083")); + // Page 1 saturates (MoreAvailable: true) → the actor continues pulling + // within the SAME tick; page 2 is the final page (MoreAvailable: false). + // The continuation pull's `since` must be t2 (page-1 max), proving the + // cursor advanced page-to-page inside one tick rather than across ticks. + var client = new ScriptedPullClient().Script(siteId, + new PullSiteCallsResponse(new[] { p1a, p1b }, MoreAvailable: true), + new PullSiteCallsResponse(new[] { p2 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + // Slow tick so the multi-page drain CANNOT be the natural tick cadence — + // it must be the within-tick continuation loop. Long enough that only the + // first tick fires in the assert window. + var options = new SiteCallAuditOptions + { + ReconciliationIntervalOverride = TimeSpan.FromSeconds(2), + ReconciliationBatchSize = 2, + }; + + CreateActor(sites, client, repo, options); + + AwaitAssert( + () => + { + // All three rows reconciled — including the page-2 row that only a + // within-tick continuation pull could have fetched. + Assert.True(repo.Upserted.ContainsKey(p1a.TrackedOperationId)); + Assert.True(repo.Upserted.ContainsKey(p1b.TrackedOperationId)); + Assert.True(repo.Upserted.ContainsKey(p2.TrackedOperationId), + "the page-2 row must be reconciled within the same tick via the MoreAvailable continuation drain"); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // Exactly two pulls happened (page 1 + the continuation page 2) and the + // second pull's `since` cursor advanced to the page-1 max (t2). + Assert.True(client.Calls.Count >= 2, $"expected >= 2 pulls within the tick, got {client.Calls.Count}"); + Assert.Equal(DateTime.MinValue, client.Calls[0].SinceUtc); + Assert.Equal(t2, client.Calls[1].SinceUtc); + } + + // --------------------------------------------------------------------- + // 6. SiteCallAudit-009: single-timestamp saturation pin does NOT spin — + // a saturated batch whose max UpdatedAtUtc never advances past `since` + // breaks the within-tick drain after one page (no unbounded re-pull), + // and still upserts the rows it saw. + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_SingleTimestampSaturation_DoesNotSpin_MakesNoProgressGracefully() + { + var siteId = "siteA"; + // A burst sharing ONE exact UpdatedAtUtc that saturates the batch — the + // inclusive max-timestamp cursor cannot advance, so an unbounded + // continuation loop would re-pull this identical window forever. + var ts = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var r1 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: ts); + var r2 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: ts); + + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://siteA:8083")); + var client = new SaturatedPinPullClient(new[] { r1, r2 }); + var repo = new RecordingRepo(); + + // Long interval so AT MOST one tick fires in the assert window — lets us + // bound the WITHIN-tick pull count. A no-progress pin must break after a + // single page, NOT loop up to MaxReconciliationPagesPerTick (50). + var options = new SiteCallAuditOptions + { + ReconciliationIntervalOverride = TimeSpan.FromSeconds(2), + ReconciliationBatchSize = 2, + }; + + CreateActor(sites, client, repo, options); + + AwaitAssert( + () => Assert.True(client.CallCount >= 1, "the first reconciliation tick should have pulled"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // The rows it saw were still upserted (idempotent mirror refresh). + Assert.True(repo.Upserted.ContainsKey(r1.TrackedOperationId)); + Assert.True(repo.Upserted.ContainsKey(r2.TrackedOperationId)); + + // Critical SiteCallAudit-009 invariant: the within-tick drain BROKE on the + // no-progress pin rather than looping to the 50-page ceiling. With a 2s + // tick interval, only the first tick has fired in the window, so the pull + // count reflects ONE tick's within-loop behaviour. A correct break yields + // 1 pull for that tick; we allow a small margin for a possible second tick + // edge, but it must be far below the 50-page within-tick ceiling. + Assert.True(client.CallCount < 10, + $"a single-timestamp saturation pin must break the within-tick drain, not spin to the " + + $"page ceiling; got {client.CallCount} pulls (an unbounded within-tick loop would be 50+)"); + } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteEventLogging.Tests/EventLogQueryServiceTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteEventLogging.Tests/EventLogQueryServiceTests.cs index b68e9d0a..b3622d99 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteEventLogging.Tests/EventLogQueryServiceTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteEventLogging.Tests/EventLogQueryServiceTests.cs @@ -213,6 +213,42 @@ public class EventLogQueryServiceTests : IDisposable Assert.Equal(2, response.Entries.Count); } + [Fact] + public async Task Query_FiltersByTimeRange_HandlesNonUtcOffset() + { + // SiteEventLogging-024 (re-opens -016): the store holds UTC ISO-8601 "o" + // strings (always +00:00) and compares them lexicographically. If the + // From/To bounds are stringified verbatim without UTC normalisation, a + // non-UTC DateTimeOffset from a central client sorts wrongly against the + // stored +00:00 values and the wrong rows are returned. This test seeds + // events at known UTC instants and queries with bounds expressed in a + // +05:00 offset that bracket the middle row; it FAILS against the unfixed + // code (verbatim ToString("o")) and PASSES once From/To are normalised with + // .ToUniversalTime(). + + // Three events at distinct, well-separated UTC instants. The recorder always + // stores UTC, so seed the rows as UTC to mirror real data. + var baseUtc = new DateTimeOffset(2026, 6, 1, 12, 0, 0, TimeSpan.Zero); + InsertEventAt(baseUtc.AddHours(-2), "script", "Info", null, "EARLY", "Early event"); // 10:00 UTC + InsertEventAt(baseUtc, "script", "Info", null, "MIDDLE", "Middle event"); // 12:00 UTC + InsertEventAt(baseUtc.AddHours(2), "script", "Info", null, "LATE", "Late event"); // 14:00 UTC + + // Express the SAME wall-clock window the operator intends — 11:00..13:00 UTC — + // but as a +05:00 DateTimeOffset (16:00..18:00 local). These bound only the + // MIDDLE row. With the bug, ToString("o") emits "...+05:00" which compares + // wrongly against the stored "...+00:00" rows. + var offset = TimeSpan.FromHours(5); + var fromNonUtc = new DateTimeOffset(2026, 6, 1, 16, 0, 0, offset); // == 11:00 UTC + var toNonUtc = new DateTimeOffset(2026, 6, 1, 18, 0, 0, offset); // == 13:00 UTC + + var response = _queryService.ExecuteQuery(MakeRequest(from: fromNonUtc, to: toNonUtc)); + + Assert.True(response.Success); + // Assert on row IDENTITIES, not just the count: only the MIDDLE row falls in + // the 11:00..13:00 UTC window. + Assert.Equal(new[] { "MIDDLE" }, response.Entries.Select(e => e.Source).ToArray()); + } + [Fact] public async Task Query_EmptyResult_WhenNoMatches() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs index 1e9caa5b..0f4ca1ac 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs @@ -184,6 +184,62 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable Assert.True(disable.Success); } + [Fact] + public async Task SR029_DeleteDuringPendingRedeploy_InstanceStaysDeleted_AndCounterIsCorrect() + { + // Regression test for SiteRuntime-029. A delete arriving WHILE a redeploy is + // still terminating used to: (1) over-decrement _totalDeployedCount, and + // (2) leave the buffered _pendingRedeploys entry intact — so when Terminated + // fired, HandleTerminated called ApplyDeployment(isRedeploy: true) and + // RESURRECTED the just-deleted instance (re-creating the actor and re-writing + // the deployed-config SQLite row). After the fix, HandleDelete is authoritative + // over the mid-redeploy bookkeeping: it cancels the pending redeploy (telling + // the displaced deployer it was superseded), clears the terminating shadow, and + // decrements the counter exactly once. + var health = new CountCapturingHealthCollector(); + var actor = CreateDeploymentManager(health); + await Task.Delay(500); + + // Establish the running instance. + actor.Tell(new DeployInstanceCommand( + "dep-1", "RaceTarget", "h1", MakeConfigJson("RaceTarget"), "admin", DateTimeOffset.UtcNow)); + var first = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal(DeploymentStatus.Success, first.Status); + await Task.Delay(300); + + // Fire a redeploy immediately followed by a delete. Both queue on the + // singleton mailbox: HandleDeploy runs first (removes from _instanceActors, + // watches + stops the predecessor, buffers the redeploy, sets the terminating + // shadow), then HandleDelete runs while the predecessor is still terminating + // (Terminated has not fired) — exactly the SiteRuntime-029 window. + var redeployProbe = CreateTestProbe(); + actor.Tell(new DeployInstanceCommand( + "dep-2", "RaceTarget", "h2", MakeConfigJson("RaceTarget"), "admin", DateTimeOffset.UtcNow), + redeployProbe.Ref); + actor.Tell(new DeleteInstanceCommand("del-1", "RaceTarget", DateTimeOffset.UtcNow)); + + // The delete succeeds... + var delete = ExpectMsg(TimeSpan.FromSeconds(10)); + Assert.True(delete.Success); + + // ...and the displaced redeploy is told it was superseded (not silently lost). + var superseded = redeployProbe.ExpectMsg(TimeSpan.FromSeconds(10)); + Assert.Equal("dep-2", superseded.DeploymentId); + Assert.Equal(DeploymentStatus.Failed, superseded.Status); + Assert.Contains("superseded", superseded.ErrorMessage!, StringComparison.OrdinalIgnoreCase); + + // Give the predecessor's Terminated signal time to fire — it must NOT + // resurrect the deleted instance. + await Task.Delay(1000); + + // The instance stays deleted: no deployed-config row remains. + var configs = await _storage.GetAllDeployedConfigsAsync(); + Assert.DoesNotContain(configs, c => c.InstanceUniqueName == "RaceTarget"); + + // The deployed count is back to 0 — neither over-decremented nor resurrected. + Assert.Equal(0, health.LastDeployedCount); + } + [Fact] public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs index 3c3a45c8..fb37f76f 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs @@ -9,6 +9,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; @@ -44,8 +45,12 @@ public class NativeAlarmActorTests : TestKit, IDisposable "Process", "hi", "hi", "", "", null, time ?? DateTimeOffset.UtcNow, "92", "90"); private IActorRef Spawn(IActorRef instanceActor, IActorRef dclManager, IServiceProvider? serviceProvider = null) => + Spawn(instanceActor, dclManager, _options, serviceProvider); + + private IActorRef Spawn(IActorRef instanceActor, IActorRef dclManager, SiteRuntimeOptions options, + IServiceProvider? serviceProvider = null) => ActorOf(Props.Create(() => new NativeAlarmActor( - Source(), "inst", instanceActor, dclManager, _storage, _options, + Source(), "inst", instanceActor, dclManager, _storage, options, NullLogger.Instance, AlarmKind.NativeOpcUa, serviceProvider))); [Fact] @@ -297,6 +302,74 @@ public class NativeAlarmActorTests : TestKit, IDisposable }, TimeSpan.FromSeconds(2)); } + // ── SiteRuntime-028: cap eviction emits a return-to-normal for an active drop ── + + [Fact] + public void EnforceCap_EvictsActiveOldestCondition_EmitsReturnToNormalAndDropSignal() + { + // SiteRuntime-028: a cap eviction that drops a still-Active condition without a + // return-to-normal leaves the Instance Actor (and central's stream / the + // operator Alarm Summary) showing a phantom Active alarm forever. With cap=1, + // raising a second condition evicts the oldest (still Active) one — which must + // produce a Normal AlarmStateChanged for the evicted SourceReference, plus the + // SiteRuntime-027 NativeAlarmDropped so the parent evicts its stale key. + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var options = new SiteRuntimeOptions { MirroredAlarmCapPerSource = 1 }; + var actor = Spawn(instance.Ref, dcl.Ref, options); + dcl.ExpectMsg(); + + var t0 = DateTimeOffset.UtcNow; + // Oldest active condition (will be evicted when the cap is exceeded). + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "OLD.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0))); + instance.ExpectMsg(m => m.SourceReference == "OLD.Hi" && m.State == AlarmState.Active); + + // Newer active condition pushes the set to 2 > cap(1) → OLD.Hi is evicted. + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "NEW.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0.AddSeconds(5)))); + + // The new condition is emitted active... + instance.ExpectMsg(m => m.SourceReference == "NEW.Hi" && m.State == AlarmState.Active); + // ...and the evicted oldest condition must be cleared (return-to-normal), not + // left phantom-active. + instance.ExpectMsg(m => m.SourceReference == "OLD.Hi" && m.State == AlarmState.Normal); + // ...and the parent is told to evict the stale _latestAlarmEvents key. + instance.ExpectMsg(m => m.SourceReference == "OLD.Hi"); + } + + // ── SiteRuntime-027: terminal retention drop signals the parent to evict its key ── + + [Fact] + public void RetentionDrop_ResolvedCondition_EmitsReturnToNormalThenDropSignal() + { + // SiteRuntime-027: when a native condition resolves (inactive AND acknowledged) + // it drops out of the mirror. The Instance Actor must be told (NativeAlarmDropped) + // so its _latestAlarmEvents map does not retain a stale (Normal) entry forever — + // otherwise a source that mints a fresh SourceReference per occurrence leaks one + // entry per condition the instance has ever seen. + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref); + dcl.ExpectMsg(); + + var t0 = DateTimeOffset.UtcNow; + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + // Resolved: inactive AND acknowledged → return-to-normal emitted, then dropped. + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Clear, + new AlarmConditionState(false, true, null, AlarmShelveState.Unshelved, false, 0), t0.AddSeconds(5)))); + + instance.ExpectMsg(m => m.SourceReference == "T01.Hi" && m.State == AlarmState.Normal); + instance.ExpectMsg(m => m.SourceReference == "T01.Hi"); + } + void IDisposable.Dispose() { Shutdown(); diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Persistence/ArtifactStorageTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Persistence/ArtifactStorageTests.cs index 4459804e..e753bc1c 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Persistence/ArtifactStorageTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Persistence/ArtifactStorageTests.cs @@ -1,5 +1,6 @@ using Microsoft.Extensions.Logging.Abstractions; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Repositories; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Persistence; @@ -140,6 +141,42 @@ public class ArtifactStorageTests : IAsyncLifetime, IDisposable // Upsert should not throw } + // ── DeploymentManager-025 / SiteRuntime-031: central-only notif/SMTP purge ── + + [Fact] + public async Task PurgeCentralOnlyNotificationConfig_RemovesPersistedNotificationListsAndSmtpRows() + { + // Simulate a pre-fix build that already shipped a notification list and an + // SMTP config (with a plaintext password) to the site. + await _storage.StoreNotificationListAsync("Ops Team", ["ops@example.com"]); + await _storage.StoreSmtpConfigurationAsync( + "smtp.example.com:587", "smtp.example.com", 587, "BasicAuth", + "noreply@example.com", "smtpuser", "PLAINTEXT-SECRET", null); + + var repo = new SiteNotificationRepository(_storage); + Assert.NotEmpty(await repo.GetAllNotificationListsAsync()); + Assert.NotEmpty(await repo.GetAllSmtpConfigurationsAsync()); + + // The fix: every artifact apply/deploy purges these central-only rows. + await _storage.PurgeCentralOnlyNotificationConfigAsync(); + + // Both tables are now empty — the plaintext SMTP credential is gone. + Assert.Empty(await repo.GetAllNotificationListsAsync()); + Assert.Empty(await repo.GetAllSmtpConfigurationsAsync()); + } + + [Fact] + public async Task PurgeCentralOnlyNotificationConfig_IsIdempotent_OnEmptyTables() + { + // No rows present — purge must not throw and must leave the tables empty. + await _storage.PurgeCentralOnlyNotificationConfigAsync(); + await _storage.PurgeCentralOnlyNotificationConfigAsync(); + + var repo = new SiteNotificationRepository(_storage); + Assert.Empty(await repo.GetAllNotificationListsAsync()); + Assert.Empty(await repo.GetAllSmtpConfigurationsAsync()); + } + // ── Schema includes all WP-33 tables ── [Fact] diff --git a/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs index a09a4b39..f89d5840 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs @@ -131,6 +131,84 @@ public class QueueDepthGaugeTests : IAsyncLifetime, IDisposable Assert.Equal(1, ReadQueueDepthGauge()); } + /// + /// StoreAndForward-025: after a graceful + /// the service must deregister its queue-depth provider from the process-global gauge + /// slot, so the gauge stops reporting the stopped instance's (now-frozen) depth and the + /// provider closure no longer pins the dead service. With the provider cleared the gauge + /// falls back to 0. + /// + [Fact] + public async Task StopAsync_ClearsQueueDepthProvider_GaugeFallsBackToZero() + { + var fresh = new StoreAndForwardService( + _storage, + new StoreAndForwardOptions { RetryTimerInterval = TimeSpan.FromMinutes(10) }, + NullLogger.Instance); + + // Register a Pending row this instance owns, then start so the instance registers + // its provider and seeds the cached count to 1 → gauge reports 1. + await _storage.EnqueueAsync(new StoreAndForwardMessage + { + Id = Guid.NewGuid().ToString("N"), + Category = StoreAndForwardCategory.ExternalSystem, + Target = "api", + PayloadJson = "{}", + Status = StoreAndForwardMessageStatus.Pending, + CreatedAt = DateTimeOffset.UtcNow, + MaxRetries = 3 + }); + await fresh.StartAsync(); + Assert.Equal(1, ReadQueueDepthGauge()); + + // Graceful stop must deregister the provider; the gauge falls back to 0 rather + // than reporting this dead instance's frozen depth of 1. + await fresh.StopAsync(); + Assert.Equal(0, ReadQueueDepthGauge()); + } + + /// + /// StoreAndForward-025 (compare-and-clear): when a newer instance has already + /// registered its provider into the process-global slot, a late + /// of an older instance must NOT clear + /// the slot — the identity-checked clear only removes the slot when it still holds the + /// stopping instance's own delegate. After the late stop the gauge must still report + /// the newer instance's depth, not 0. + /// + [Fact] + public async Task StopAsync_DoesNotClobberNewerInstanceProvider() + { + // Old instance: starts over an empty store, registers its provider (gauge → 0), + // then takes a single buffered message so it would report 1 if it stayed live. + var older = new StoreAndForwardService( + _storage, + new StoreAndForwardOptions { RetryTimerInterval = TimeSpan.FromMinutes(10) }, + NullLogger.Instance); + await older.StartAsync(); + older.TestOnly_IncrementBufferedCount(); // older's depth would be 1 + Assert.Equal(1, ReadQueueDepthGauge()); + + // New instance: starts and re-registers into the same global slot, winning it. + // It seeds from the (empty) store and stands in two buffered messages → depth 2. + var newer = new StoreAndForwardService( + _storage, + new StoreAndForwardOptions { RetryTimerInterval = TimeSpan.FromMinutes(10) }, + NullLogger.Instance); + await newer.StartAsync(); + newer.TestOnly_IncrementBufferedCount(); + newer.TestOnly_IncrementBufferedCount(); + Assert.Equal(2, ReadQueueDepthGauge()); + + // Late stop of the OLDER instance: compare-and-clear must fail the identity check + // (the slot now holds the newer instance's delegate), so the newer provider stays. + await older.StopAsync(); + Assert.Equal(2, ReadQueueDepthGauge()); + + // Cleanup: stop the newer instance, which legitimately clears its own provider. + await newer.StopAsync(); + Assert.Equal(0, ReadQueueDepthGauge()); + } + [Fact] public async Task Gauge_SeedsFromExistingPendingRows_OnStart() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.TemplateEngine.Tests/Services/TemplateFolderServiceTests.cs b/tests/ZB.MOM.WW.ScadaBridge.TemplateEngine.Tests/Services/TemplateFolderServiceTests.cs index 9c729b0e..6b146303 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.TemplateEngine.Tests/Services/TemplateFolderServiceTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.TemplateEngine.Tests/Services/TemplateFolderServiceTests.cs @@ -30,7 +30,8 @@ public class TemplateFolderServiceTests Assert.Equal("Dev", result.Value.Name); Assert.Null(result.Value.ParentFolderId); _repoMock.Verify(r => r.AddFolderAsync(It.IsAny(), It.IsAny()), Times.Once); - _repoMock.Verify(r => r.SaveChangesAsync(It.IsAny()), Times.Once); + // Two saves: the folder entity, then the staged audit row (TemplateEngine-024). + _repoMock.Verify(r => r.SaveChangesAsync(It.IsAny()), Times.Exactly(2)); } [Fact] @@ -485,9 +486,120 @@ public class TemplateFolderServiceTests // Both swapped siblings persisted. _repoMock.Verify(r => r.UpdateFolderAsync(a, It.IsAny()), Times.Once); _repoMock.Verify(r => r.UpdateFolderAsync(b, It.IsAny()), Times.Once); - _repoMock.Verify(r => r.SaveChangesAsync(It.IsAny()), Times.Once); + // Two saves: the swapped siblings, then the staged audit row (TemplateEngine-024). + _repoMock.Verify(r => r.SaveChangesAsync(It.IsAny()), Times.Exactly(2)); // Audit entry written (mirrors Move/Rename audit assertions). _auditMock.Verify(au => au.LogAsync("admin", "Reorder", "TemplateFolder", "2", "B", It.IsAny(), It.IsAny()), Times.Once); } + + // ======================================================================== + // TemplateEngine-024 — each folder mutator must SaveChanges *after* LogAsync + // so the staged audit row is persisted (and not discarded when the scope is + // disposed). Verified by tracking call order across the mutators. + // ======================================================================== + + [Fact] + public async Task CreateFolder_PersistsAuditRow_SaveFollowsLog() + { + AssertAuditRowPersisted(await BuildOrderTracker(async () => + await _sut.CreateFolderAsync("Dev", null, "admin"), + seed: () => _repoMock.Setup(r => r.GetAllFoldersAsync(It.IsAny())) + .ReturnsAsync(new List()))); + } + + [Fact] + public async Task RenameFolder_PersistsAuditRow_SaveFollowsLog() + { + var folder = new TemplateFolder("Old") { Id = 1, ParentFolderId = null }; + AssertAuditRowPersisted(await BuildOrderTracker(async () => + await _sut.RenameFolderAsync(1, "New", "admin"), + seed: () => + { + _repoMock.Setup(r => r.GetFolderByIdAsync(1, It.IsAny())).ReturnsAsync(folder); + _repoMock.Setup(r => r.GetAllFoldersAsync(It.IsAny())) + .ReturnsAsync(new List { folder }); + })); + } + + [Fact] + public async Task MoveFolder_PersistsAuditRow_SaveFollowsLog() + { + var f1 = new TemplateFolder("A") { Id = 1, ParentFolderId = null }; + var f2 = new TemplateFolder("B") { Id = 2, ParentFolderId = null }; + AssertAuditRowPersisted(await BuildOrderTracker(async () => + await _sut.MoveFolderAsync(1, 2, "admin"), + seed: () => + { + _repoMock.Setup(r => r.GetFolderByIdAsync(1, It.IsAny())).ReturnsAsync(f1); + _repoMock.Setup(r => r.GetFolderByIdAsync(2, It.IsAny())).ReturnsAsync(f2); + _repoMock.Setup(r => r.GetAllFoldersAsync(It.IsAny())) + .ReturnsAsync(new List { f1, f2 }); + })); + } + + [Fact] + public async Task ReorderFolder_PersistsAuditRow_SaveFollowsLog() + { + var a = new TemplateFolder("A") { Id = 1, ParentFolderId = null, SortOrder = 0 }; + var b = new TemplateFolder("B") { Id = 2, ParentFolderId = null, SortOrder = 1 }; + AssertAuditRowPersisted(await BuildOrderTracker(async () => + await _sut.ReorderFolderAsync(2, ReorderDirection.Up, "admin"), + seed: () => + { + _repoMock.Setup(r => r.GetFolderByIdAsync(2, It.IsAny())).ReturnsAsync(b); + _repoMock.Setup(r => r.GetAllFoldersAsync(It.IsAny())) + .ReturnsAsync(new List { a, b }); + })); + } + + [Fact] + public async Task DeleteFolder_PersistsAuditRow_SaveFollowsLog() + { + var f = new TemplateFolder("Empty") { Id = 1 }; + AssertAuditRowPersisted(await BuildOrderTracker(async () => + await _sut.DeleteFolderAsync(1, "admin"), + seed: () => + { + _repoMock.Setup(r => r.GetFolderByIdAsync(1, It.IsAny())).ReturnsAsync(f); + _repoMock.Setup(r => r.GetAllFoldersAsync(It.IsAny())) + .ReturnsAsync(new List { f }); + _repoMock.Setup(r => r.GetAllTemplatesAsync(It.IsAny())) + .ReturnsAsync(new List