Compare commits
54 Commits
phase-3-pr
...
phase-6-1-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9dd5e4e745 | ||
| 6b3a67fd9e | |||
|
|
1d9008e354 | ||
|
|
ef6b0bb8fc | ||
| a06fcb16a2 | |||
|
|
d2f3a243cd | ||
|
|
29bcaf277b | ||
|
|
b6d2803ff6 | ||
|
|
f3850f8914 | ||
|
|
90f7792c92 | ||
|
|
c04b13f436 | ||
| 6a30f3dde7 | |||
|
|
ba31f200f6 | ||
| 81a1f7f0f6 | |||
|
|
4695a5c88e | ||
| 0109fab4bf | |||
|
|
c9e856178a | ||
| 63eb569fd6 | |||
|
|
fad04bbdf7 | ||
| 17f901bb65 | |||
|
|
ba3a5598e1 | ||
| 8cd932e7c9 | |||
|
|
28328def5d | ||
| d3bf544abc | |||
|
|
24435712c4 | ||
| 3f7b4d05e6 | |||
|
|
a79c5f3008 | ||
| a5299a2fee | |||
|
|
a65215684c | ||
| 82f2dfcfa3 | |||
|
|
0433d3a35e | ||
| 141673fc80 | |||
|
|
db56a95819 | ||
| 89bd726fa8 | |||
|
|
238748bc98 | ||
| b21d550836 | |||
|
|
91eaf534c8 | ||
| d33e38e059 | |||
|
|
d8ef35d5bd | ||
| 5e318a1ab6 | |||
|
|
394d126b2e | ||
| 0eab1271be | |||
|
|
d5034c40f7 | ||
| 5e67c49f7c | |||
|
|
0575280a3b | ||
| 8150177296 | |||
|
|
56d8af8bdb | ||
| be8261a4ac | |||
| 65de2b4a09 | |||
| fccb566a30 | |||
| 9ccc7338b8 | |||
| e33783e042 | |||
|
|
a44fc7a610 | ||
|
|
d4c1873998 |
@@ -9,6 +9,8 @@
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Proxy.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/ZB.MOM.WW.OtOpcUa.Driver.Modbus.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.S7/ZB.MOM.WW.OtOpcUa.Driver.S7.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Client.Shared/ZB.MOM.WW.OtOpcUa.Client.Shared.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Client.CLI/ZB.MOM.WW.OtOpcUa.Client.CLI.csproj"/>
|
||||
<Project Path="src/ZB.MOM.WW.OtOpcUa.Client.UI/ZB.MOM.WW.OtOpcUa.Client.UI.csproj"/>
|
||||
@@ -26,6 +28,8 @@
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.E2E/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.E2E.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests/ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.S7.Tests/ZB.MOM.WW.OtOpcUa.Driver.S7.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Client.Shared.Tests/ZB.MOM.WW.OtOpcUa.Client.Shared.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Client.CLI.Tests/ZB.MOM.WW.OtOpcUa.Client.CLI.Tests.csproj"/>
|
||||
<Project Path="tests/ZB.MOM.WW.OtOpcUa.Client.UI.Tests/ZB.MOM.WW.OtOpcUa.Client.UI.Tests.csproj"/>
|
||||
|
||||
@@ -1,56 +1,47 @@
|
||||
# V1 Archive Status (Phase 2 Stream D, 2026-04-18)
|
||||
# V1 Archive Status — CLOSED (Phase 2 Streams D + E complete)
|
||||
|
||||
This document inventories every v1 surface that's been **functionally superseded** by v2 but
|
||||
**physically retained** in the build until the deletion PR (Phase 2 PR 3). Rationale: cascading
|
||||
references mean a single deletion is high blast-radius; archive-marking lets the v2 stack ship
|
||||
on its own merits while the v1 surface stays as parity reference.
|
||||
> **Status as of 2026-04-18: the v1 archive has been fully removed from the tree.**
|
||||
> This document is retained as historical record of the Phase 2 Stream D / E closure.
|
||||
|
||||
## Archived projects
|
||||
## Final state
|
||||
|
||||
| Path | Status | Replaced by | Build behavior |
|
||||
|---|---|---|---|
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Host/` | Archive (executable in build) | `OtOpcUa.Server` + `Driver.Galaxy.Host` + `Driver.Galaxy.Proxy` | Builds; not deployed by v2 install scripts |
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/` | Archive (plugin in build) | TODO: port into `Driver.Galaxy.Host/Backend/Historian/` (Task B.1.h follow-up) | Builds; loaded only by archived Host |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive/` | Archive | `Driver.Galaxy.E2E` + per-component test projects | `<IsTestProject>false</IsTestProject>` — `dotnet test slnx` skips |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.IntegrationTests/` | Archive | `Driver.Galaxy.E2E` | `<IsTestProject>false</IsTestProject>` — `dotnet test slnx` skips |
|
||||
All five v1 archive directories have been deleted:
|
||||
|
||||
## How to run the archived suites explicitly
|
||||
| Path | Deleted | Replaced by |
|
||||
|---|---|---|
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Host/` | ✅ | `OtOpcUa.Server` + `Driver.Galaxy.Host` + `Driver.Galaxy.Proxy` |
|
||||
| `src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/` | ✅ | `Driver.Galaxy.Host/Backend/Historian/` (ported in Phase 3 PRs 51-55) |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.Historian.Aveva.Tests/` | ✅ | `Driver.Galaxy.Host.Tests/Historian/` |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive/` | ✅ | Per-component `*.Tests` projects + `Driver.Galaxy.E2E` |
|
||||
| `tests/ZB.MOM.WW.OtOpcUa.IntegrationTests/` | ✅ | `Driver.Galaxy.E2E` + `Driver.Modbus.IntegrationTests` |
|
||||
|
||||
```powershell
|
||||
# v1 unit tests (494):
|
||||
dotnet test tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive
|
||||
## Closure timeline
|
||||
|
||||
# v1 integration tests (6):
|
||||
dotnet test tests/ZB.MOM.WW.OtOpcUa.IntegrationTests
|
||||
```
|
||||
- **PR 2 (2026-04-18, phase-2-stream-d)** — archive-marked the four v1 projects with
|
||||
`<IsTestProject>false</IsTestProject>` so solution builds and `dotnet test slnx` bypassed
|
||||
them. Capture: `docs/v2/implementation/exit-gate-phase-2-final.md`.
|
||||
- **Phase 3 PR 18 (2026-04-18)** — deleted the archived project source trees. Leftover
|
||||
`bin/` and `obj/` residue remained on disk from pre-deletion builds.
|
||||
- **Phase 2 PR 61 (2026-04-18, this closure PR)** — scrubbed the empty residue directories
|
||||
and confirmed `dotnet build ZB.MOM.WW.OtOpcUa.slnx` clean with 0 errors.
|
||||
|
||||
Both still pass on this dev box — they're the parity reference for Phase 2 PR 3's deletion
|
||||
decision.
|
||||
## Parity validation (Stream E)
|
||||
|
||||
## Deletion plan (Phase 2 PR 3)
|
||||
The original 494 v1 tests + 6 v1 integration tests are **not** preserved in the v2 branch.
|
||||
Their parity-bar role is now filled by:
|
||||
|
||||
Pre-conditions:
|
||||
- [ ] `Driver.Galaxy.E2E` test count covers the v1 IntegrationTests' 6 integration scenarios
|
||||
at minimum (currently 7 tests; expand as needed)
|
||||
- [ ] `Driver.Galaxy.Host/Backend/Historian/` ports the Wonderware Historian plugin
|
||||
so `MxAccessGalaxyBackend.HistoryReadAsync` returns real data (Task B.1.h)
|
||||
- [ ] Operator review on a separate PR — destructive change
|
||||
|
||||
Steps:
|
||||
1. `git rm -r src/ZB.MOM.WW.OtOpcUa.Host/`
|
||||
2. `git rm -r src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/`
|
||||
(or move it under Driver.Galaxy.Host first if the lift is part of the same PR)
|
||||
3. `git rm -r tests/ZB.MOM.WW.OtOpcUa.Tests.v1Archive/`
|
||||
4. `git rm -r tests/ZB.MOM.WW.OtOpcUa.IntegrationTests/`
|
||||
5. Edit `ZB.MOM.WW.OtOpcUa.slnx` — remove the four project lines
|
||||
6. `dotnet build ZB.MOM.WW.OtOpcUa.slnx` → confirm clean
|
||||
7. `dotnet test ZB.MOM.WW.OtOpcUa.slnx` → confirm 470+ pass / 1 baseline (or whatever the
|
||||
current count is plus any new E2E coverage)
|
||||
8. Commit: "Phase 2 Stream D — delete v1 archive (Host + Historian.Aveva + v1Tests + IntegrationTests)"
|
||||
9. PR 3 against `v2`, link this doc + exit-gate-phase-2-final.md
|
||||
10. One reviewer signoff
|
||||
- `Driver.Galaxy.E2E` — cross-FX subprocess parity (spawns the net48 x86 Galaxy.Host.exe
|
||||
+ connects via real named pipe, exercises every `IDriver` capability through the
|
||||
supervisor). Stability-findings regression tests (4 × 2026-04-13 findings) live here.
|
||||
- Per-component `*.Tests` projects — cover the code that moved out of the monolith into
|
||||
discrete v2 projects. Running `dotnet test ZB.MOM.WW.OtOpcUa.slnx` executes all of them
|
||||
as one solution-level gate.
|
||||
- `Driver.Modbus.IntegrationTests` — adds Modbus TCP driver coverage that didn't exist in
|
||||
v1 (DL205, S7-1500, Mitsubishi MELSEC via pymodbus sim profiles — PRs 30, 56-60).
|
||||
- Live-stack smoke tests (`Driver.Galaxy.E2E/LiveStack/`) — optional, gated on presence
|
||||
of the `OtOpcUaGalaxyHost` service + Galaxy repository on the dev box (PRs 33, 36, 37).
|
||||
|
||||
## Rollback
|
||||
|
||||
If Phase 2 PR 3 surfaces downstream consumer regressions, `git revert` the deletion commit
|
||||
restores the four projects intact. The v2 stack continues to ship from the v2 branch.
|
||||
`git revert` of the deletion commits restores the projects intact. The v2 stack continues
|
||||
to ship from the `v2` branch regardless.
|
||||
|
||||
149
docs/v2/implementation/phase-6-1-resilience-and-observability.md
Normal file
149
docs/v2/implementation/phase-6-1-resilience-and-observability.md
Normal file
@@ -0,0 +1,149 @@
|
||||
# Phase 6.1 — Resilience & Observability Runtime
|
||||
|
||||
> **Status**: DRAFT — implementation plan for a cross-cutting phase that was never formalised. The v2 `plan.md` specifies Polly, Tier A/B/C protections, structured logging, and local-cache fallback by decision; none are wired end-to-end.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-1-resilience-observability`
|
||||
> **Estimated duration**: 3 weeks
|
||||
> **Predecessor**: Phase 5 (drivers) — partial; S7 + OPC UA Client shipped, AB/TwinCAT/FOCAS paused
|
||||
> **Successor**: Phase 6.2 (Authorization runtime)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Land the cross-cutting runtime protections + operability features that `plan.md` + `driver-stability.md` specify by decision but that no driver-phase actually wires. End-state: every driver goes through the same Polly resilience layer, health endpoints render the live driver fleet, structured logs carry per-request correlation IDs, and the config substrate survives a central DB outage via a LiteDB local cache.
|
||||
|
||||
Closes these gaps flagged in the 2026-04-19 audit:
|
||||
|
||||
1. Polly v8 resilience pipelines wired to every `IDriver` capability (no-op per-driver today; Galaxy has a hand-rolled `CircuitBreaker` only).
|
||||
2. Tier A/B/C enforcement at runtime — `driver-stability.md` §2–4 and decisions #63–73 define memory watchdog, bounded queues, scheduled recycle, wedge detection; `MemoryWatchdog` exists only inside `Driver.Galaxy.Host`.
|
||||
3. Health endpoints (`/healthz`, `/readyz`) on `OtOpcUa.Server`.
|
||||
4. Structured Serilog with per-request correlation IDs (driver instance, OPC UA session, IPC call).
|
||||
5. LiteDB local cache + Polly retry + fallback on central-DB outage (decision #36).
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `Core` → new `Core.Resilience` sub-namespace | Shared Polly pipeline builder (`DriverResiliencePipelines`). **Pipeline key = `(DriverInstanceId, HostName)`** so one dead PLC behind a multi-device driver doesn't open the breaker for healthy siblings (decision #35 per-device isolation). **Per-capability policy** — Read / HistoryRead / Discover / Probe / Alarm get retries; **Write does NOT** unless `[WriteIdempotent]` on the tag definition (decisions #44-45). |
|
||||
| Every capability-interface consumer in the server | Wrap `IReadable.ReadAsync`, `IWritable.WriteAsync`, `ITagDiscovery.DiscoverAsync`, `ISubscribable.SubscribeAsync/UnsubscribeAsync`, `IHostConnectivityProbe` probe loop, `IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync`, `IHistoryProvider.ReadRawAsync/ReadProcessedAsync/ReadAtTimeAsync/ReadEventsAsync`. Composition: timeout → (retry when capability supports) → circuit breaker → bulkhead. |
|
||||
| `Core.Abstractions` → new `WriteIdempotentAttribute` | Marker on `ModbusTagDefinition` / `S7TagDefinition` / `OpcUaClientDriver` tag rows; opts that tag into auto-retry on Write. Absence = no retry, per spec. |
|
||||
| `Core` → new `Core.Stability` sub-namespace — **split** | Two separate subsystems: (a) **`MemoryTracking`** runs all tiers; captures baseline (median of first 5 min `GetMemoryFootprint` samples) + applies the hybrid rule `soft = max(multiplier × baseline, baseline + floor)`; soft breach logs + surfaces to Admin; never kills. (b) **`MemoryRecycle`** (Tier C only — requires out-of-process topology) handles hard-breach recycle via the Proxy-side supervisor. Tier A/B overrun escalates to Tier C promotion ticket, not auto-kill. |
|
||||
| `ScheduledRecycleScheduler` | Tier C only per decisions #73-74. Weekly/time-of-day recycle via Proxy supervisor. Tier A/B opt-in recycle lands in a future phase together with a Tier-C-escalation workflow. |
|
||||
| `WedgeDetector` | **Demand-aware**: flips a driver to Faulted only when `(hasPendingWork AND noProgressIn > threshold)`. `hasPendingWork` derives from non-zero Polly bulkhead depth OR ≥1 active MonitoredItem OR ≥1 queued historian read. Idle + subscription-only drivers stay Healthy. |
|
||||
| `DriverTypeRegistry` | Each driver type registers its `DriverTier` {A, B, C}. Tier C drivers must advertise their out-of-process topology; the registry enforces invariants (Tier C has a `Proxy` + `Host` pair). |
|
||||
| `Driver.Galaxy.Proxy/Supervisor/` | **Retains** existing `CircuitBreaker` + `Backoff` — they guard IPC respawn (decision #68), different concern from the per-call Polly layer. Only `HeartbeatMonitor` is referenced downstream (IPC liveness). |
|
||||
| `OtOpcUa.Server` → Minimal API endpoints on `http://+:4841` | `/healthz` = process alive + (config DB reachable OR `UsingStaleConfig=true`). `/readyz` = ANDed driver health; state-machine per `DriverState`: `Unknown`/`Initializing` → 503, `Healthy` → 200, `Degraded` → 200 + `{degradedDrivers: [...]}` in body, `Faulted` → 503. JSON body always reports per-instance detail. |
|
||||
| Serilog configuration | Centralize enrichers in `OtOpcUa.Server/Observability/LogContextEnricher.cs`. Every capability call runs inside a `LogContext.PushProperty` scope with {DriverInstanceId, DriverType, CapabilityName, CorrelationId (UA RequestHandle or internal GUID)}. Sink config stays rolling-file per CLAUDE.md; JSON sink added alongside plain-text (switchable via `Serilog:WriteJson` appsetting). |
|
||||
| `Configuration` project | Add `LiteDbConfigCache` adapter. **Generation-sealed snapshots**: `sp_PublishGeneration` writes `<cache-root>/<cluster>/<generationId>.db` as a read-only sealed file. Reads serve the last-known-sealed generation; mixed-generation reads are impossible. Write path bypasses cache + fails hard on DB outage. Pipeline: timeout (2 s) → retry (3×, jittered) → fallback-to-sealed-snapshot. |
|
||||
| `DriverHostStatus` vs. `DriverInstanceResilienceStatus` | New separate entity `DriverInstanceResilienceStatus { DriverInstanceId, HostName, LastCircuitBreakerOpenUtc, ConsecutiveFailures, CurrentBulkheadDepth, LastRecycleUtc, BaselineFootprintBytes }`. `DriverHostStatus` keeps per-host connectivity only; Admin `/hosts` joins both for display. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| Driver wire protocols | Resilience is a server-side wrapper; individual drivers don't see Polly. Their existing retry logic (ModbusTcpTransport reconnect, SessionReconnectHandler) stays in place as inner layers. |
|
||||
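| Config DB schema | LiteDB cache is a read-only mirror of the central DB. The only central-schema additions are the new `DriverInstanceResilienceStatus` table and the nullable `DriverInstance.ResilienceConfig` JSON column; existing tables (including `DriverHostStatus`) are otherwise unchanged. |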
|
||||
| OPC UA wire behavior visible to clients | Health endpoints live on a separate HTTP port (4841 by convention); the OPC UA server on 4840 is unaffected. |
|
||||
| The four 2026-04-13 Galaxy stability findings | Already closed in Phase 2. Phase 6.1 *generalises* the pattern, doesn't re-fix Galaxy. |
|
||||
| Driver-layer SafeHandle usage | Existing Galaxy `SafeMxAccessHandle` + Modbus `TcpClient` disposal stay — they're driver-internal, not part of the cross-cutting layer. |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phases 0–5 exit gates cleared (or explicitly deferred with task reference)
|
||||
- [ ] `driver-stability.md` §2–4 re-read; decisions #63–73 + #34–36 re-skimmed
|
||||
- [ ] Polly v8 NuGet available (`Microsoft.Extensions.Resilience` + `Polly.Core`) — verify package restore before task breakdown
|
||||
- [ ] LiteDB 5.x NuGet confirmed MIT + actively maintained
|
||||
- [ ] Existing drivers catalogued: Galaxy.Proxy, Modbus, S7, OpcUaClient — confirm test counts baseline so the resilience layer doesn't regress any
|
||||
- [ ] Serilog configuration inventory: locate every `Log.ForContext` call site that will need `LogContext` rewrap
|
||||
- [ ] Admin `/hosts` page's current `DriverHostStatus` consumption reviewed so that joining in the new `DriverInstanceResilienceStatus` entity doesn't break it
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — Resilience layer (1 week)
|
||||
|
||||
1. **A.1** Add `Polly.Core` + `Microsoft.Extensions.Resilience` to `Core`. Build `DriverResiliencePipelineBuilder` — key on `(DriverInstanceId, HostName)`; composes Timeout → (Retry when the capability allows it; skipped for Write unless `[WriteIdempotent]`) → CircuitBreaker → Bulkhead. Per-capability policy map documented in `DriverResilienceOptions.CapabilityPolicies`.
|
||||
2. **A.2** `DriverResilienceOptions` record bound from `DriverInstance.ResilienceConfig` JSON column (new nullable). **Per-tier × per-capability** defaults: Tier A (OpcUaClient, S7) Read 3 retries/2 s/5-failure-breaker, Write 0 retries/2 s/5-failure-breaker; Tier B (Modbus) Read 3/4 s/5, Write 0/4 s/5; Tier C (Galaxy) Read 1 retry/10 s/no-kill, Write 0/10 s/no-kill. Idempotent writes can opt into Read-shaped retry via the attribute.
|
||||
3. **A.3** `CapabilityInvoker<TCapability, TResult>` wraps every method on the capability interfaces (`IReadable.ReadAsync`, `IWritable.WriteAsync`, `ITagDiscovery.DiscoverAsync`, `ISubscribable.SubscribeAsync/UnsubscribeAsync`, `IHostConnectivityProbe` probe loop, `IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync`, `IHistoryProvider.ReadRawAsync/ReadProcessedAsync/ReadAtTimeAsync/ReadEventsAsync`). Existing server-side dispatch routes through it.
|
||||
4. **A.4** **Retain** `Driver.Galaxy.Proxy/Supervisor/CircuitBreaker.cs` + `Backoff.cs` — they guard IPC process respawn (decision #68), orthogonal to the per-call Polly layer. Only `HeartbeatMonitor` is consumed outside the supervisor.
|
||||
5. **A.5** Unit tests: per-policy, per-composition. Negative integration tests: (a) Modbus FlakeyTransport fails 3× on Read, succeeds on the 4th attempt — invoker surfaces success (stays inside the Tier B default of 3 retries from A.2 and below the 5-failure breaker threshold); (b) Modbus FlakeyTransport fails 1× on Write with `[WriteIdempotent]=false` — invoker surfaces failure without retry (no duplicate pulse); (c) Modbus FlakeyTransport fails 1× on Write with `[WriteIdempotent]=true` — invoker retries. Bench: no-op overhead < 1%.
|
||||
6. **A.6** `WriteIdempotentAttribute` in `Core.Abstractions`. Modbus/S7/OpcUaClient tag-definition records pick it up; invoker reads via reflection once at driver init.
|
||||
|
||||
### Stream B — Tier A/B/C stability runtime — split into MemoryTracking + MemoryRecycle (1 week)
|
||||
|
||||
1. **B.1** `Core.Abstractions` → `DriverTier` enum {A, B, C}. Extend `DriverTypeRegistry` to require `DriverTier` at registration. Existing driver types stamped (Galaxy = C, Modbus = B, S7 = B, OpcUaClient = A).
|
||||
2. **B.2** **`MemoryTracking`** (all tiers) lifted from `Driver.Galaxy.Host/MemoryWatchdog.cs`. Captures `BaselineFootprintBytes` as the median of first 5 min of `IDriver.GetMemoryFootprint()` samples post-`InitializeAsync`. Applies **decision #70 hybrid formula**: `soft = max(multiplier × baseline, baseline + floor)`; Tier A multiplier=3, floor=50 MB; Tier B multiplier=3, floor=100 MB; Tier C multiplier=2, floor=500 MB. Soft breach → log + `DriverInstanceResilienceStatus.CurrentFootprint` tick; never kills. Hard = 2 × soft.
|
||||
3. **B.3** **`MemoryRecycle`** (Tier C only per decisions #73-74). Hard-breach on a Tier C driver triggers `ScheduledRecycleScheduler.RequestRecycleNow(driverInstanceId)`; scheduler proxies to `Driver.Galaxy.Proxy/Supervisor/` which restarts the Host process. Tier A/B hard-breach logs a promotion-to-Tier-C recommendation; **never auto-kills** the in-process driver.
|
||||
4. **B.4** **`ScheduledRecycleScheduler`** per decision #67: Tier C driver instances opt-in to a weekly recycle at a configured cron. Tier A/B scheduled recycle deferred to a later phase paired with Tier-C escalation.
|
||||
5. **B.5** **`WedgeDetector`** demand-aware: `if (state==Healthy && hasPendingWork && noProgressIn > WedgeThreshold) → force ReinitializeAsync`. `hasPendingWork` = (bulkhead depth > 0) OR (active monitored items > 0) OR (queued historian-read count > 0). `WedgeThreshold` default 5 × PublishingInterval, min 60 s. Idle driver stays Healthy.
|
||||
6. **B.6** Tests: tracking unit tests drive synthetic allocation against a fake `GetMemoryFootprint`; recycle tests use a mock supervisor; wedge tests include the false-fault cases — idle subscriber, slow historian backfill, write-only burst.
|
||||
|
||||
### Stream C — Health endpoints + structured logging (4 days)
|
||||
|
||||
1. **C.1** `OtOpcUa.Server/Observability/HealthEndpoints.cs` — Minimal API on a second Kestrel binding (default `http://+:4841`). `/healthz` reports process uptime + config-DB reachability (or cache-warm). `/readyz` enumerates `DriverInstance` rows + reports each driver's `DriverHealth.State`; returns 503 if ANY driver is Faulted. JSON body per `docs/v2/acl-design.md` §"Operator Dashboards" shape.
|
||||
2. **C.2** `LogContextEnricher` installed at Serilog config time. Every driver-capability call site wraps its body in `using (LogContext.PushProperty("DriverInstanceId", id)) using (LogContext.PushProperty("CorrelationId", correlationId))`. Correlation IDs: reuse OPC UA `RequestHeader.RequestHandle` when in-flight; otherwise generate `Guid.NewGuid().ToString("N")[..12]`.
|
||||
3. **C.3** Add JSON-formatted Serilog sink alongside the existing rolling-file plain-text sink so SIEMs (Splunk, Datadog) can ingest without a regex parser. Sink switchable via `Serilog:WriteJson` appsetting.
|
||||
4. **C.4** Integration test: boot server, issue Modbus read, assert log line contains `DriverInstanceId` + `CorrelationId` structured fields.
|
||||
|
||||
### Stream D — Config DB LiteDB fallback — generation-sealed snapshots (1 week)
|
||||
|
||||
1. **D.1** `LiteDbConfigCache` adapter backed by **sealed generation snapshots**: each successful `sp_PublishGeneration` writes `<cache-root>/<clusterId>/<generationId>.db` as read-only after commit. The adapter maintains a `CurrentSealedGenerationId` pointer updated atomically on successful publish. Mixed-generation reads are **impossible** — every read served from the cache serves one coherent sealed generation.
|
||||
2. **D.2** Write-path queries (draft save, publish) bypass the cache entirely and fail hard on DB outage. Read-path queries (DriverInstance enumeration, LdapGroupRoleMapping, cluster + namespace metadata) go through the pipeline: timeout 2 s → retry 3× jittered → fallback to the current sealed snapshot.
|
||||
3. **D.3** `UsingStaleConfig` flag flips true when a read fell back to the sealed snapshot; cleared on the next successful DB round-trip. Surfaced on `/healthz` body and Admin `/hosts`.
|
||||
4. **D.4** Tests: (a) SQL-container kill mid-operation — read returns sealed snapshot, `UsingStaleConfig=true`, driver stays Healthy; (b) mixed-generation guard — attempt to serve partial generation by corrupting a snapshot file mid-read → adapter fails closed rather than serving mixed data; (c) first-boot-no-snapshot + DB-down case — with no sealed snapshot to fall back on and the central DB unreachable, the adapter refuses to start and the driver fails `InitializeAsync` with a clear config-DB-required error.
|
||||
|
||||
### Stream E — Admin `/hosts` page refresh (3 days)
|
||||
|
||||
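1. **E.1** Add the new `DriverInstanceResilienceStatus` entity (kept separate from `DriverHostStatus`, which retains per-host connectivity only) to carry the Stream A/B resilience columns. Generate EF migration.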
|
||||
2. **E.2** `Admin/FleetStatusHub` SignalR hub pushes `LastCircuitBreakerOpenUtc` + `CurrentBulkheadDepth` + `LastRecycleUtc` on change.
|
||||
3. **E.3** `/hosts` Blazor page renders new columns; red badge if `ConsecutiveFailures > breakerThreshold / 2`.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **Invoker coverage**: every method on `IReadable` / `IWritable` / `ITagDiscovery` / `ISubscribable` / `IHostConnectivityProbe` / `IAlarmSource` / `IHistoryProvider` in the server dispatch layer routes through `CapabilityInvoker`. Enforce via a Roslyn analyzer (error-level; warning-first is rejected — the compliance check is the gate).
|
||||
- [ ] **Write-retry guard**: writes without `[WriteIdempotent]` never get retried. Unit-test the invoker path asserts zero retry attempts.
|
||||
- [ ] **Pipeline isolation**: pipeline key is `(DriverInstanceId, HostName)`. Integration test with two Modbus hosts under one instance — failing host A does not open the breaker for host B.
|
||||
- [ ] **Tier registry**: every driver type registered in `DriverTypeRegistry` has a non-null `Tier`. Unit test walks the registry + asserts no gaps. Tier C registrations must declare their out-of-process topology.
|
||||
- [ ] **MemoryTracking never kills**: soft/hard breach tests on a Tier A/B driver log + surface without terminating the process.
|
||||
- [ ] **MemoryRecycle Tier C only**: hard breach on a Tier A driver never invokes the supervisor; on Tier C it does.
|
||||
- [ ] **Wedge demand-aware**: test suite includes idle-subscription-only, slow-historian-backfill, and write-only-burst cases — driver stays Healthy.
|
||||
- [ ] **Galaxy supervisor preserved**: `Driver.Galaxy.Proxy/Supervisor/CircuitBreaker.cs` + `Backoff.cs` still present + still invoked on Host crash.
|
||||
- [ ] **Health state machine**: `/healthz` + `/readyz` respond within 500 ms for every `DriverState`; state-machine table in this doc drives the test matrix.
|
||||
- [ ] **Structured log**: CI grep asserts at least one log line per capability call has `"DriverInstanceId"` + `"CorrelationId"` JSON fields.
|
||||
- [ ] **Generation-sealed cache**: integration tests cover (a) SQL-kill mid-operation serves last-sealed snapshot; (b) mixed-generation corruption fails closed; (c) first-boot no-snapshot + DB-down → `InitializeAsync` fails with clear error.
|
||||
- [ ] No regression in existing test suites — `dotnet test ZB.MOM.WW.OtOpcUa.slnx` count equal-or-greater than pre-Phase-6.1 baseline.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| Polly pipeline adds per-request latency on hot path | Medium | Medium | Benchmark Stream A.5 before merging; 1 % overhead budget; inline hot path short-circuits when retry count = 0 |
|
||||
| LiteDB cache diverges from central DB | Medium | High | Stale-data banner in Admin UI; `UsingStaleConfig` flag surfaced on `/healthz` and Admin `/hosts`; cache refresh on every successful DB round-trip; 24-hour synthetic warning |
|
||||
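| Tier watchdog false-positive-kills a legitimate batch load | Low | High | Soft/hard threshold split; soft only logs; hard triggers recycle on Tier C only (a Tier A/B hard breach logs a promotion-to-Tier-C recommendation and never auto-kills); thresholds configurable per-instance |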
|
||||
| Wedge detector races with slow-but-healthy drivers | Medium | High | Minimum 60 s threshold; detector only activates if driver claims `Healthy`; add circuit-breaker feedback so rapid oscillation trips instead of thrashing |
|
||||
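| Roslyn analyzer breaks external driver authors | Low | Medium | Analyzer ships at error level in Phase 6.1 (the compliance check is the gate; warning-first was rejected). The rule targets capability-interface call sites in the server dispatch layer, so driver implementations themselves are not flagged; document the `CapabilityInvoker` routing requirement for anyone extending the dispatch layer |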
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: Polly shared pipeline + per-tier defaults + driver-capability invoker + tests
|
||||
- [ ] Stream B: Tier registry + generalised watchdog + scheduled recycle + wedge detector
|
||||
- [ ] Stream C: `/healthz` + `/readyz` + structured logging + JSON Serilog sink
|
||||
- [ ] Stream D: LiteDB cache + Polly fallback in Configuration
|
||||
- [ ] Stream E: Admin `/hosts` page refresh
|
||||
- [ ] Cross-cutting: `phase-6-1-compliance.ps1` exits 0; full solution `dotnet test` passes; exit-gate doc recorded
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, thread `019da489-e317-7aa1-ab1f-6335e0be2447`)
|
||||
|
||||
Plan substantially rewritten before implementation to address these findings. Each entry: severity · verdict · adjustment.
|
||||
|
||||
1. **Crit · ACCEPT** — Auto-retry collides with decisions #44/#45 (no auto-write-retry; opt-in via `WriteIdempotent` + CAS). Pipeline now **capability-specific**: Read/HistoryRead/Discover/Probe/Alarm-subscribe all get retries; **Write does not** unless the tag metadata carries `WriteIdempotent=true`. New `WriteIdempotentAttribute` surfaces on `ModbusTagDefinition` / `S7TagDefinition` / etc.
|
||||
2. **Crit · ACCEPT** — "One pipeline per driver instance" breaks decision #35's per-device isolation. **Change**: pipeline key is `(DriverInstanceId, HostName)` not just `DriverInstanceId`. One dead PLC behind a multi-device Modbus driver no longer opens the breaker for healthy siblings.
|
||||
3. **Crit · ACCEPT** — Memory watchdog + scheduled recycle at Tier A/B breaches decisions #73/#74 (process-kill protections are Tier-C-only). **Change**: Stream B splits into two — `MemoryTracking` (all tiers, soft/hard thresholds log + surface to Admin `/hosts`; never kills) and `MemoryRecycle` (Tier C only, requires out-of-process topology). Tier A/B overrun paths escalate to Tier C via a future PR, not auto-kill.
|
||||
4. **High · ACCEPT** — Removing Galaxy's hand-rolled `CircuitBreaker` drops decision #68 host-supervision crash-loop protection. **Change**: keep `Driver.Galaxy.Proxy/Supervisor/CircuitBreaker.cs` + `Backoff.cs` — they guard the IPC *process* re-spawn, not the per-call data path. Data-path Polly is an orthogonal layer.
|
||||
5. **High · ACCEPT** — Roslyn analyzer targeting `IDriver` misses the hot paths (`IReadable.ReadAsync`, `IWritable.WriteAsync`, `ISubscribable.SubscribeAsync` etc.). **Change**: analyzer rule now matches every method on the capability interfaces; compliance doc enumerates the full call-site list.
|
||||
6. **High · ACCEPT** — `/healthz` + `/readyz` under-specified for degraded-running. **Change**: add a state-matrix sub-section explicitly covering `Unknown` (pre-init: `/readyz` 503), `Initializing` (503), `Healthy` (200), `Degraded` (200 with JSON body flagging the degraded driver), `Faulted` (503), plus cached-config-serving (`/healthz` returns 200 + `UsingStaleConfig: true` in JSON body). `/readyz` aggregates across drivers: any single driver in `Unknown`, `Initializing`, or `Faulted` yields 503; otherwise 200.
|
||||
7. **High · ACCEPT** — `WedgeDetector` based on "no successful Read" false-fires on write-only subscriptions + idle systems. **Change**: wedge criteria now `(hasPendingWork AND noProgressIn > threshold)` where `hasPendingWork` comes from the Polly bulkhead depth, the active MonitoredItem count, or the queued historian-read count. Idle driver stays Healthy.
|
||||
8. **High · ACCEPT** — LiteDB cache serving mixed-generation reads breaks publish atomicity. **Change**: cache is snapshot-per-generation. Each published generation writes a sealed snapshot into `<cache-root>/<cluster>/<generationId>.db`; reads serve the last-known-sealed generation and never mix. Central DB outage during a *publish* means that publish fails (write path doesn't use cache); reads continue from the prior sealed snapshot.
|
||||
9. **Med · ACCEPT** — `DriverHostStatus` schema conflates per-host connectivity with per-driver-instance resilience counters. **Change**: new `DriverInstanceResilienceStatus` table separate from `DriverHostStatus`. Admin `/hosts` joins both for display.
|
||||
10. **Med · ACCEPT** — Compliance says analyzer-error; risks say analyzer-warning. **Change**: phase 6.1 ships at **error** level (this phase is the gate); warning-mode option removed.
|
||||
11. **Med · ACCEPT** — Hardcoded per-tier MB bands ignore decision #70's `max(multiplier × baseline, baseline + floor)` formula with observed-baseline capture. **Change**: watchdog captures baseline at post-init plateau (median of first 5 min GetMemoryFootprint samples) + applies the hybrid formula. Tier constants now encode the multiplier + floor, not raw MB.
|
||||
12. **Med · ACCEPT** — Tests mostly cover happy path. **Change**: Stream A.5 adds negative tests for duplicate-write-replay-under-timeout; Stream B.6 adds false-wedge-on-idle-subscription + false-wedge-on-slow-historian-backfill + false-wedge-on-write-only-burst; Stream D.4 adds mixed-generation (corrupted-snapshot) cache test + first-boot-no-snapshot cache test.
|
||||
|
||||
147
docs/v2/implementation/phase-6-2-authorization-runtime.md
Normal file
147
docs/v2/implementation/phase-6-2-authorization-runtime.md
Normal file
@@ -0,0 +1,147 @@
|
||||
# Phase 6.2 — Authorization Runtime (ACL + LDAP grants)
|
||||
|
||||
> **Status**: DRAFT — the v2 `plan.md` decision #129 + `acl-design.md` specify a 6-level permission-trie evaluator with `NodePermissions` bitmask grants, but no runtime evaluator exists. ACL tables are schematized but unread by the data path.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-2-authorization-runtime`
|
||||
> **Estimated duration**: 2.5 weeks
|
||||
> **Predecessor**: Phase 6.1 (Resilience & Observability) — reuses the Polly pipeline for ACL-cache refresh retries
|
||||
> **Successor**: Phase 6.3 (Redundancy)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Wire ACL enforcement on every OPC UA Read / Write / Subscribe / Call path + LDAP group → admin role grants that the v2 plan specified but never ran. End-state: a user's effective permissions resolve through a per-session permission-trie over the 6-level `Cluster / Namespace / UnsArea / UnsLine / Equipment / Tag` hierarchy, cached per session, invalidated on generation-apply + LDAP group expiry.
|
||||
|
||||
Closes these gaps:
|
||||
|
||||
1. **Data-path ACL enforcement** — `NodeAcl` table + `NodePermissions` flags shipped; `NodeAclService.cs` present as a CRUD surface; no code consults ACLs at `Read`/`Write` time. OPC UA server answers everything to everyone.
|
||||
2. **`LdapGroupRoleMapping` for cluster-scoped admin grants** — decision #105 shipped as the *design*; admin roles are hardcoded (`FleetAdmin` / `ConfigEditor` / `ReadOnly`) with no cluster-scoping and no LDAP-to-grant table. Decision #105 explicitly lifts this from v2.1 into v2.0.
|
||||
3. **Explicit Deny pathway** — deferred to v2.1 (decision #129 note). Phase 6.2 ships *grants only*; `Deny` stays out.
|
||||
4. **Admin UI ACL grant editor** — `AclsTab.razor` exists but edits the now-unused `NodeAcl` table; needs to wire to the runtime evaluator + the new `LdapGroupRoleMapping` table.
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
**Architectural separation** (critical for correctness): `LdapGroupRoleMapping` is **control-plane only** — it maps LDAP groups to Admin UI roles (`FleetAdmin` / `ConfigEditor` / `ReadOnly`) and cluster scopes for Admin access. **It is NOT consulted by the OPC UA data-path evaluator.** The data-path evaluator reads `NodeAcl` rows joined directly against the session's **resolved LDAP group memberships**. The two concerns share zero runtime code path.
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `Configuration` project | New entity `LdapGroupRoleMapping { Id, LdapGroup, Role, ClusterId? (nullable = system-wide), IsSystemWide, GeneratedAtUtc }`. **Consumed only by Admin UI role routing.** Migration. Admin CRUD. |
|
||||
| `Core` → new `Core.Authorization` sub-namespace | `IPermissionEvaluator.Authorize(IEnumerable<Claim> identity, OpcUaOperation op, NodeId nodeId) → AuthorizationDecision`. `op` covers every OPC UA surface: Browse, Read, Write, HistoryRead, HistoryUpdate, CreateMonitoredItems, TransferSubscriptions, Call, Acknowledge, Confirm, Shelve. Result is tri-state (internal model distinguishes `Allow` / `NotGranted` / `Denied` + carries matched-grant provenance). Phase 6.2 only produces `Allow` + `NotGranted`; v2.1 Deny lands without API break. |
|
||||
| `PermissionTrieBuilder` | Builds trie from `NodeAcl` rows joined against **resolved LDAP group memberships**, keyed on 6-level scope hierarchy for Equipment namespaces. **SystemPlatform namespaces (Galaxy)** use a `FolderSegment` scope level between Namespace and Tag, populated from `Tag.FolderPath` segments, so folder subtree authorization works on Galaxy trees the same way UNS works on Equipment trees. Trie node carries `ScopeKind` enum. |
|
||||
| `PermissionTrieCache` + freshness | One trie per `(ClusterId, GenerationId)`. Invalidated on `sp_PublishGeneration` via in-process event bus AND generation-ID check on hot path — every authz call looks up `CurrentGenerationId` (Polly-wrapped, sub-second cache); a Backup that cached a stale generation detects the mismatch + forces re-load. **Redundancy-safe**. |
|
||||
| `UserAuthorizationState` freshness | Cached per session BUT bounded by `MembershipFreshnessInterval` (default **15 min**). Past that, the next hot-path authz call re-resolves LDAP group memberships via `LdapGroupService`. Failure to re-resolve (LDAP unreachable) → **fail-closed**: evaluator returns `NotGranted` for every call until memberships refresh successfully. Decoupled from Phase 6.1's availability-oriented 24h cache. |
|
||||
| `AuthCacheMaxStaleness` | Separate from Phase 6.1's `UsingStaleConfig` window. Default 5 min — beyond that, authz fails closed regardless of Phase 6.1 cache warmth. |
|
||||
| OPC UA server dispatch — all enforcement surfaces | `DriverNodeManager` wires evaluator on: **Browse + TranslateBrowsePathsToNodeIds** (ancestors implicitly visible if any descendant has a grant; denied ancestors filter from results), **Read** (per-attribute StatusCode `BadUserAccessDenied` in mixed-authorization batches; batch never poisons), **Write** (uses `NodePermissions.WriteOperate/Tune/Configure` based on driver `SecurityClassification`), **HistoryRead** (uses `NodePermissions.HistoryRead` — **distinct** flag, not Read), **HistoryUpdate** (`NodePermissions.HistoryUpdate`), **CreateMonitoredItems** (per-`MonitoredItemCreateResult` denial), **TransferSubscriptions** (re-evaluates items on transfer), **Call** (`NodePermissions.MethodCall`), **Acknowledge/Confirm/Shelve** (per-alarm flags). |
|
||||
| Subscription re-authorization | Each `MonitoredItem` is stamped with `(AuthGenerationId, MembershipVersion)` at create time. On every Publish, items with a stamp mismatching the session's current `(AuthGenerationId, MembershipVersion)` get re-evaluated; revoked items drop to `BadUserAccessDenied` within one publish cycle. Unchanged items stay fast-path. |
|
||||
| `LdapAuthService` | On cookie-auth success: resolves LDAP group memberships; loads matching `LdapGroupRoleMapping` rows → role claims + cluster-scope claims (control plane); stores `UserAuthorizationState.LdapGroups` on the session for the data-plane evaluator. |
|
||||
| `ValidatedNodeAclAuthoringService` | Replaces CRUD-only `NodeAclService` for authoring. Validates (LDAP group exists, scope exists in current or target draft, grant shape is valid, no duplicate `(LdapGroup, Scope)` pair). Admin UI writes only through it. |
|
||||
| Admin UI `AclsTab.razor` | Writes via `ValidatedNodeAclAuthoringService`. Adds Probe-This-Permission row that runs the real evaluator against a chosen `(LDAP group, node, operation)` and shows `Allow` / `NotGranted` + matched-grant provenance. |
|
||||
| Admin UI new tab `RoleGrantsTab.razor` | CRUD over `LdapGroupRoleMapping`. Per-cluster + system-wide grants. FleetAdmin only. **Documentation explicit** that this only affects Admin UI access, not OPC UA data plane. |
|
||||
| Audit log | Every Grant/Revoke/Publish on `LdapGroupRoleMapping` or `NodeAcl` writes an `AuditLog` row with old/new state + user. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| OPC UA authn | Already done (PR 19 LDAP user identity + Basic256Sha256 profile). Phase 6.2 is authorization only. |
|
||||
| Explicit `Deny` grants | Decision #129 note explicitly defers to v2.1. Default-deny + additive grants only. |
|
||||
| Driver-side `SecurityClassification` metadata | Drivers keep reporting `Operate` / `ViewOnly` / etc. — the evaluator uses them as *part* of the decision but doesn't replace them. |
|
||||
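| Galaxy namespace (SystemPlatform kind) | UNS levels don't apply; evaluator treats Galaxy nodes as `Cluster → Namespace → FolderSegment(s) → Tag` (skips UnsArea/UnsLine/Equipment; `FolderSegment` levels are populated from `Tag.FolderPath`, per §Scope — What Changes). |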
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phase 6.1 merged (reuse `Core.Resilience` Polly pipeline for the ACL cache-refresh retries)
|
||||
- [ ] `acl-design.md` re-read in full
|
||||
- [ ] Decision log #105, #129, corrections-doc B1 re-skimmed
|
||||
- [ ] Existing `NodeAcl` + `NodePermissions` flag enum audited; confirm bitmask flags match `acl-design.md` table
|
||||
- [ ] Existing `LdapAuthService` group-resolution code path traced end-to-end — confirm it already queries group memberships (we only need the caller to consume the result)
|
||||
- [ ] Test DB scenarios catalogued: two clusters, three LDAP groups per cluster, mixed grant shapes; captured as seed-data fixtures
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — `LdapGroupRoleMapping` table + migration (3 days)
|
||||
|
||||
1. **A.1** Entity + EF Core migration. Columns per §Scope table. Unique constraint on `(LdapGroup, ClusterId)` with null-tolerant comparer for the system-wide case. Index on `LdapGroup` for the hot-path lookup on auth.
|
||||
2. **A.2** `ILdapGroupRoleMappingService` CRUD. Wrap in the Phase 6.1 Polly pipeline (timeout → retry → fallback-to-cache).
|
||||
3. **A.3** Seed-data migration: preserve the current hardcoded `FleetAdmin` / `ConfigEditor` / `ReadOnly` mappings by seeding rows for the existing LDAP groups the dev box uses (`cn=fleet-admin,…`, `cn=config-editor,…`, `cn=read-only,…`). Operationally a no-op for existing deployments.
|
||||
|
||||
### Stream B — Permission-trie evaluator (1 week)
|
||||
|
||||
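1. **B.1** `IPermissionEvaluator.Authorize(IEnumerable<Claim> identity, OpcUaOperation op, NodeId nodeId)` — returns `AuthorizationDecision` (`Allow` / `NotGranted` / `Denied` + matched-grant provenance), matching the §Scope table. Phase 6.2 produces only `Allow` / `NotGranted`; `Denied` exists in the type but is never produced, so v2.1 Deny lands without an API break.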
|
||||
2. **B.2** `PermissionTrieBuilder` builds the trie from `NodeAcl` rows — matched against the session's **resolved LDAP group memberships**, never `LdapGroupRoleMapping` (control-plane only) — joined to the current generation's `UnsArea` + `UnsLine` + `Equipment` + `Tag` tables. One trie per `(ClusterId, GenerationId)` so rollback doesn't smear permissions across generations.
|
||||
3. **B.3** Trie node structure: `{ Level: enum, ScopeId: Guid, AllowedPermissions: NodePermissions, ChildrenByLevel: Dictionary<Guid, TrieNode> }`. Evaluation walks from Cluster → Namespace → UnsArea → UnsLine → Equipment → Tag, ORing allowed permissions at each level. Additive semantics: a grant at Cluster level cascades to every descendant tag.
|
||||
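4. **B.4** `PermissionTrieCache` service scoped as singleton; exposes `GetTrieAsync(ClusterId, ct)` that returns the current-generation trie. Invalidated on `sp_PublishGeneration` via an in-process event bus; also on TTL expiry (24 h safety net). Implementation note: the in-process event does not reach the redundant peer, so every hot-path authz call also compares the cached trie's `GenerationId` against `CurrentGenerationId` (Polly-wrapped, sub-second cache) and forces a re-load on mismatch.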
|
||||
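5. **B.5** Per-session cached evaluator: OPC UA Session authentication produces `UserAuthorizationState { ClusterId, LdapGroups[], Trie }`; cached on the session until session close, generation-apply, or `MembershipFreshnessInterval` expiry (default 15 min), after which the next hot-path authz call re-resolves LDAP group memberships and fails closed if LDAP is unreachable.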
|
||||
6. **B.6** Unit tests: trie-walk theory covering (a) Cluster-level grant cascades to tags, (b) Equipment-level grant doesn't leak to sibling Equipment, (c) multi-group union, (d) no-grant → deny, (e) Galaxy nodes skip UnsArea/UnsLine levels.
|
||||
|
||||
### Stream C — OPC UA server dispatch wiring (6 days, widened)
|
||||
|
||||
1. **C.1** `DriverNodeManager.Read` — evaluator consulted per `ReadValueId` with `OpcUaOperation.Read`. Denied attributes get `BadUserAccessDenied` per-item; batch never poisons. Integration test covers mixed-authorization batch (3 authorized + 2 denied → 3 Good values + 2 Bad StatusCodes, request completes).
|
||||
2. **C.2** `DriverNodeManager.Write` — evaluator chooses `NodePermissions.WriteOperate` / `WriteTune` / `WriteConfigure` based on the driver-reported `SecurityClassification`.
|
||||
3. **C.3** `DriverNodeManager.HistoryRead` — **uses `NodePermissions.HistoryRead`**, which is a **distinct flag** from Read. Test: user with Read but not HistoryRead can read live values but gets `BadUserAccessDenied` on `HistoryRead`.
|
||||
4. **C.4** `DriverNodeManager.HistoryUpdate` — uses `NodePermissions.HistoryUpdate`.
|
||||
5. **C.5** `DriverNodeManager.CreateMonitoredItems` — per-`MonitoredItemCreateResult` denial in mixed-authorization batch; partial success path per OPC UA Part 4. Each created item stamped `(AuthGenerationId, MembershipVersion)`.
|
||||
6. **C.6** `DriverNodeManager.TransferSubscriptions` — on reconnect, re-evaluate every transferred `MonitoredItem` against the session's current auth state. Stale-stamp items drop to `BadUserAccessDenied`.
|
||||
7. **C.7** **Browse + TranslateBrowsePathsToNodeIds** — evaluator called with `OpcUaOperation.Browse`. Ancestor visibility implied when any descendant has a grant (per `acl-design.md` §Browse). Denied ancestors filter from browse results — the UA browser sees a hierarchy truncated at the denied ancestor rather than an inconsistent child-without-parent view.
|
||||
8. **C.8** `DriverNodeManager.Call` — `NodePermissions.MethodCall`.
|
||||
9. **C.9** Alarm actions (Acknowledge / Confirm / Shelve) — per-alarm `NodePermissions.AlarmAck` / `AlarmConfirm` / `AlarmShelve`.
|
||||
10. **C.10** Publish path — for each `MonitoredItem` with a mismatched `(AuthGenerationId, MembershipVersion)` stamp, re-evaluate. Unchanged items stay fast-path; changes happen at next publish cycle.
|
||||
11. **C.11** Integration tests: three-user seed with different memberships; matrix covers every operation in §Scope. Mixed-batch tests for Read + CreateMonitoredItems.
|
||||
|
||||
### Stream D — Admin UI refresh (4 days)
|
||||
|
||||
1. **D.1** `RoleGrantsTab.razor` — FleetAdmin-gated CRUD on `LdapGroupRoleMapping`. Per-cluster dropdown + system-wide checkbox. Validation: LDAP group must exist in the dev LDAP (GLAuth) before saving — best-effort probe with graceful degradation.
|
||||
2. **D.2** `AclsTab.razor` rewrites its edit path to write through the new `ValidatedNodeAclAuthoringService` (replacing direct use of the CRUD-only `NodeAclService`). Adds a "Probe this permission" row: choose `(LDAP group, node, operation)` → shows `Allow` / `NotGranted` + the reason (which grant matched).
|
||||
3. **D.3** Draft-generation diff viewer now includes an ACL section: "X grants added, Y grants removed, Z grants changed."
|
||||
4. **D.4** SignalR notification: `PermissionTrieCache` invalidation on `sp_PublishGeneration` pushes to Admin UI so operators see "this cluster's permissions were just updated" within 2 s.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **Control/data-plane separation**: `LdapGroupRoleMapping` consumed only by Admin UI; the data-path evaluator has zero references to it. Enforced via a project-reference audit (Admin project references the mapping service; `Core.Authorization` does not).
|
||||
- [ ] **Every operation wired**: Browse, Read, Write, HistoryRead, HistoryUpdate, CreateMonitoredItems, TransferSubscriptions, Call, Acknowledge, Confirm, Shelve all consult the evaluator. Integration test matrix covers every operation × allow/deny.
|
||||
- [ ] **HistoryRead uses its own flag**: test "user with Read + no HistoryRead gets `BadUserAccessDenied` on HistoryRead".
|
||||
- [ ] **Mixed-batch semantics**: Read of 5 nodes (3 allowed + 2 denied) returns 3 Good + 2 `BadUserAccessDenied` per-`ReadValueId`; CreateMonitoredItems equivalent.
|
||||
- [ ] **Browse ancestor visibility**: user with a grant only on a deep equipment node can browse the path to it (ancestors implied); denied ancestors filter from browse results otherwise.
|
||||
- [ ] **Galaxy FolderSegment coverage**: a grant on a Galaxy folder subtree cascades to its tags; sibling folders are unaffected. Trie test covers this.
|
||||
- [ ] **Subscription re-authorization**: integration test — create item, revoke grant via draft+publish, next publish cycle the item returns `BadUserAccessDenied` (not silently still-notifying).
|
||||
- [ ] **Membership freshness**: test — 15 min MembershipFreshnessInterval elapses on a long-lived session + LDAP now unreachable → authz fails closed on the next request until LDAP recovers.
|
||||
- [ ] **Auth cache fail-closed**: test — Phase 6.1 cache serves stale config for 6 min; authz evaluator refuses all calls after 5 min regardless.
|
||||
- [ ] **Trie invariants**: `PermissionTrieBuilder` is idempotent (build twice with identical inputs → equal tries).
|
||||
- [ ] **Additive grants + cluster isolation**: cluster-grant cascades; cross-cluster leakage impossible.
|
||||
- [ ] **Redundancy-safe invalidation**: integration test — two nodes, a publish on one, authorize a request on the other before in-process event propagates → generation-mismatch forces re-load, no stale decision.
|
||||
- [ ] **Authoring validation**: `AclsTab` cannot save a `(LdapGroup, Scope)` pair that already exists in the draft; operator sees the validation error pre-save.
|
||||
- [ ] **AuthorizationDecision shape stability**: API surface exposes `Allow` + `NotGranted` only; `Denied` variant exists in the type but is never produced; v2.1 can add Deny without API break.
|
||||
- [ ] No regression in driver test counts.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| ACL evaluator latency on per-read hot path | Medium | High | Trie lookup is O(depth) = O(6); session-cached UserAuthorizationState avoids per-Read trie rebuild; benchmark in Stream B.6 |
|
||||
| Trie cache stale after a rollback | Medium | High | `sp_PublishGeneration` + `sp_RollbackGeneration` both emit the invalidation event; trie keyed on `(ClusterId, GenerationId)` so rollback fetches the prior trie cleanly |
|
||||
| `BadUserAccessDenied` returns expose sensitive browse-name metadata | Low | Medium | Server returns only the status code + NodeId; no message leak per OPC UA Part 4 §7.34 guidance |
|
||||
| LdapGroupRoleMapping migration breaks existing deployments | Low | High | Seed-migration preserves the hardcoded groups' effective grants verbatim; smoke test exercises the post-migration fleet admin login |
|
||||
| Deny semantics accidentally ship (would break `acl-design.md` defer) | Low | Medium | `AuthorizationDecision` carries a `Denied` variant, but the Phase 6.2 evaluator only ever produces `Allow` / `NotGranted`; a compliance check asserts `Denied` is never produced, and enabling Deny is a v2.1 ticket |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: `LdapGroupRoleMapping` entity + migration + CRUD + seed
|
||||
- [ ] Stream B: evaluator + trie builder + cache + per-session state + unit tests
|
||||
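- [ ] Stream C: OPC UA dispatch wiring on Browse/TranslateBrowsePathsToNodeIds/Read/Write/HistoryRead/HistoryUpdate/CreateMonitoredItems/TransferSubscriptions/Call/Alarm (Acknowledge/Confirm/Shelve) paths + publish-time re-authorization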
|
||||
- [ ] Stream D: Admin UI `RoleGrantsTab` + `AclsTab` refresh + SignalR invalidation
|
||||
- [ ] `phase-6-2-compliance.ps1` exits 0; exit-gate doc recorded
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, thread `019da48d-0d2b-7171-aed2-fc05f1f39ca3`)
|
||||
|
||||
1. **Crit · ACCEPT** — Trie must not conflate `LdapGroupRoleMapping` (control-plane admin claims per decision #105) with data-plane ACLs (decision #129). **Change**: `LdapGroupRoleMapping` is consumed only by the Admin UI role router. Data-plane trie reads `NodeAcl` rows joined against the session's **resolved LDAP groups**, never admin roles. Stream B.2 updated.
|
||||
2. **Crit · ACCEPT** — Cached `UserAuthorizationState` survives LDAP group changes because memberships only refresh at cookie-auth. **Change**: add `MembershipFreshnessInterval` (default 15 min); past that, next hot-path authz call forces group re-resolution (fail-closed if LDAP unreachable). Session-close-wins on config-rollback.
|
||||
3. **High · ACCEPT** — Node-local invalidation doesn't extend across redundant pair. **Change**: trie keyed on `(ClusterId, GenerationId)`; hot-path authz looks up `CurrentGenerationId` from the shared config DB (Polly-wrapped + sub-second cache). A Backup that read stale generation gets a mismatched trie → forces re-load. Implementation note added to Stream B.4.
|
||||
4. **High · ACCEPT** — Browse enforcement missing. **Change**: new Stream C.7 (`Browse + TranslateBrowsePathsToNodeIds` enforcement). Ancestor visibility implied when any descendant has a grant; denied ancestors filter from browse results per `acl-design.md` §Browse.
|
||||
5. **High · ACCEPT** — `HistoryRead` should use `NodePermissions.HistoryRead` bit, not `Read`. **Change**: Stream C.3 revised; separate unit test asserts `Read+no-HistoryRead` denies HistoryRead while allowing current-value reads.
|
||||
6. **High · ACCEPT** — Galaxy shallow-path (Cluster→Namespace→Tag) loses folder hierarchy authorization. **Change**: SystemPlatform namespaces use a `FolderSegment` scope-level between Namespace and Tag, populated from `Tag.FolderPath`; UNS-kind namespaces keep the 6-level hierarchy. Trie supports both via `ScopeKind` on each node.
|
||||
7. **High · ACCEPT** — Subscription re-authorization policy unresolved between create-time-only (fast, wrong on revoke) and per-publish (slow). **Change**: stamp each `MonitoredItem` with `(AuthGenerationId, MembershipVersion)`; re-evaluate on Publish only when either version changed. Revoked items drop to `BadUserAccessDenied` within one publish cycle.
|
||||
8. **Med · ACCEPT** — Mixed-authorization batch `Read` / `CreateMonitoredItems` service-result semantics underspecified. **Change**: Stream C.1 (Read) + Stream C.5 (CreateMonitoredItems) explicitly test per-`ReadValueId` + per-`MonitoredItemCreateResult` denial in mixed batches, with the C.11 matrix covering both; batch never collapses to a coarse failure.
|
||||
9. **Med · ACCEPT** — Missing surfaces: `Method.Call`, `HistoryUpdate`, event filter on subscriptions, subscription-transfer on reconnect, alarm-ack. **Change**: scope expanded — every OPC UA authorization surface enumerated in Stream C: Read, Write, HistoryRead, HistoryUpdate, CreateMonitoredItems, TransferSubscriptions, Call, Acknowledge/Confirm/Shelve, Browse, TranslateBrowsePathsToNodeIds.
|
||||
10. **Med · ACCEPT** — `bool` evaluator bakes in grant-only semantics; collides with v2.1 Deny. **Change**: internal model uses `AuthorizationDecision { Allow | NotGranted | Denied, IReadOnlyList<MatchedGrant> Provenance }`. Phase 6.2 maps `Denied` → never produced; UI + audit log use the full record so v2.1 Deny lands without API break.
|
||||
11. **Med · ACCEPT** — 6.1 cache fallback is availability-oriented; applying it to auth is correctness-dangerous. **Change**: auth-specific staleness budget `AuthCacheMaxStaleness` (default 5 min, not 24 h). Past that, hot-path evaluator fails closed on cached reads; all authorization calls return `NotGranted` until fresh data lands. Documented in risks + compliance.
|
||||
12. **Low · ACCEPT** — Existing `NodeAclService` is raw CRUD. **Change**: new `ValidatedNodeAclAuthoringService` enforces scope-uniqueness + draft/publish invariants + rejects invalid (LDAP group, scope) pairs; Admin UI writes through it only. Stream D.2 adjusted.
|
||||
|
||||
150
docs/v2/implementation/phase-6-3-redundancy-runtime.md
Normal file
150
docs/v2/implementation/phase-6-3-redundancy-runtime.md
Normal file
@@ -0,0 +1,150 @@
|
||||
# Phase 6.3 — Redundancy Runtime
|
||||
|
||||
> **Status**: DRAFT — `CLAUDE.md` + `docs/Redundancy.md` describe a non-transparent warm/hot redundancy model with unique ApplicationUris, `RedundancySupport` advertisement, `ServerUriArray`, and dynamic `ServiceLevel`. Entities (`ServerCluster`, `ClusterNode`, `RedundancyRole`, `RedundancyMode`) exist; the runtime behavior (actual `ServiceLevel` number computation, mid-apply dip, `ServerUriArray` broadcast) is not wired.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-3-redundancy-runtime`
|
||||
> **Estimated duration**: 2 weeks
|
||||
> **Predecessor**: Phase 6.2 (Authorization) — reuses the Phase 6.1 health endpoints for cluster-peer probing
|
||||
> **Successor**: Phase 6.4 (Admin UI completion)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Land the non-transparent redundancy protocol end-to-end: two `OtOpcUa.Server` instances in a `ServerCluster` each expose a live `ServiceLevel` node whose value reflects that instance's suitability to serve traffic, advertise each other via `ServerUriArray`, and transition role (Primary ↔ Backup) based on health + operator intent.
|
||||
|
||||
Closes these gaps:
|
||||
|
||||
1. **Dynamic `ServiceLevel`** — OPC UA Part 5 §6.3.34 specifies a Byte (0..255) that clients poll to pick the healthiest server. Our server publishes it as a static value today.
|
||||
2. **`ServerUriArray` broadcast** — Part 4 specifies that every node in a redundant pair should advertise its peers' ApplicationUris. Currently advertises only its own.
|
||||
3. **Primary / Backup role coordination** — entities carry `RedundancyRole` but the runtime doesn't read it; no peer health probing; no role-transfer on primary failure.
|
||||
4. **Mid-apply dip** — decision-level expectation that a server mid-generation-apply should report a *lower* ServiceLevel so clients cut over to the peer during the apply window. Not implemented.
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `OtOpcUa.Server` → new `Server.Redundancy` sub-namespace | `RedundancyCoordinator` singleton. Resolves the current node's `ClusterNode` row at startup, loads peers, runs **two-layer peer health probe**: (a) `/healthz` every 2 s as the fast-fail (inherits Phase 6.1 semantics — HTTP + DB/cache healthy); (b) `UaHealthProbe` every 10 s — opens a lightweight OPC UA client session to the peer + reads its `ServiceLevel` node + verifies endpoint serves data. Authority decisions use UaHealthProbe; `/healthz` is used only to avoid wasting UA probes when peer is obviously down. |
|
||||
| Publish-generation fencing | Topology + role decisions are stamped with a monotonic `ConfigGenerationId` from the shared config DB. Coordinator re-reads topology via CAS on `(ClusterId, ExpectedGeneration)` → new row; peers reject state propagated from a lower generation. Prevents split-publish races. |
|
||||
| `InvalidTopology` runtime state | If both nodes detect >1 Primary AFTER startup (config-DB drift during a publish), both self-demote to ServiceLevel 2 until convergence. Neither node serves authoritatively; clients pick the healthier alternative or reconnect later. |
|
||||
| OPC UA server root | `ServiceLevel` variable node becomes a `BaseDataVariable` whose value updates on `RedundancyCoordinator` state change. `ServerUriArray` array variable includes **self + peers** in stable deterministic ordering (decision per OPC UA Part 4 §6.6.2.2). `RedundancySupport` stays static (set from `RedundancyMode` at startup); `Transparent` mode validated pre-publish, not rejected at startup. |
|
||||
| `RedundancyCoordinator` computation | **8-state ServiceLevel matrix** — avoids OPC UA Part 5 §6.3.34 collision (`0=Maintenance`, `1=NoData`). Operator-declared maintenance only = **0**. Unreachable / Faulted = **1**. In-range operational states occupy **2..255**: Authoritative-Primary = **255**; Isolated-Primary (peer unreachable, self serving) = **230**; Primary-Mid-Apply = **200**; Recovering-Primary (post-fault, dwell not met) = **180**; Authoritative-Backup = **100**; Isolated-Backup (primary unreachable, "take over if asked") = **80**; Backup-Mid-Apply = **50**; Recovering-Backup = **30**; `InvalidTopology` (runtime detects >1 Primary) = **2** (detected-inconsistency band — below normal operation). Full matrix documented in `docs/Redundancy.md` update. |
|
||||
| Role transition | Split-brain avoidance: role is *declared* in the shared config DB (`ClusterNode.RedundancyRole`), not elected at runtime. An operator flips the row (or a failover script does). Coordinator only reads; never writes. |
|
||||
| `sp_PublishGeneration` hook | Uses named **apply leases** keyed to `(ConfigGenerationId, PublishRequestId)`. `await using var lease = coordinator.BeginApplyLease(...)`. Disposal on any exit path (success, exception, cancellation) decrements. Watchdog auto-closes any lease older than `ApplyMaxDuration` (default 10 min) → ServiceLevel can't stick at mid-apply. Pre-publish validator rejects unsupported `RedundancyMode` (e.g. `Transparent`) with a clear error so runtime never sees an invalid state. |
|
||||
| Admin UI `/cluster/{id}` page | New `RedundancyTab.razor` — shows current node's role + ServiceLevel + peer reachability. FleetAdmin can trigger a role-swap by editing `ClusterNode.RedundancyRole` + publishing a draft. |
|
||||
| Metrics | New OpenTelemetry metrics: `ot_opcua_service_level{cluster,node}`, `ot_opcua_peer_reachable{cluster,node,peer}`, `ot_opcua_apply_in_progress{cluster,node}`. Sink via Phase 6.1 observability layer. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| OPC UA authn / authz | Phases 6.2 + prior. Redundancy is orthogonal. |
|
||||
| Driver layer | Drivers aren't redundancy-aware; they run on each node independently against the same equipment. The server layer handles the ServiceLevel story. |
|
||||
| Automatic failover / election | Explicitly out of scope. Non-transparent = client picks which server to use via ServiceLevel + ServerUriArray. We do NOT ship consensus, leader election, or automatic promotion. Operator-driven failover is the v2.0 model per decision #79–85. |
|
||||
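| Transparent redundancy (`RedundancySupport=Transparent`) | Not supported. A publish that requests it is rejected by the `sp_PublishGeneration` pre-publish validator with a clear error; the last-good config stays active and the runtime never sees the mode. |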
|
||||
| Historian redundancy | Galaxy Historian's own redundancy (two historians on two CPUs) is out of scope. The Galaxy driver talks to whichever historian is reachable from its node. |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phase 6.1 merged (uses `/healthz` for peer probing)
|
||||
- [ ] `CLAUDE.md` §Redundancy + `docs/Redundancy.md` re-read
|
||||
- [ ] Decisions #79–85 re-skimmed
|
||||
- [ ] `ServerCluster`/`ClusterNode`/`RedundancyRole`/`RedundancyMode` entities + existing migration reviewed
|
||||
- [ ] OPC UA Part 4 §Redundancy + Part 5 §6.3.34 (ServiceLevel) re-skimmed
|
||||
- [ ] Dev box has two OtOpcUa.Server instances configured against the same cluster — one designated Primary, one Backup — for integration testing
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — Cluster topology loader (3 days)
|
||||
|
||||
1. **A.1** `RedundancyCoordinator` startup path: reads `ClusterNode` row for the current node (identified by `appsettings.json` `Cluster:NodeId`), reads the cluster's peer list, validates invariants (no duplicate `ApplicationUri`, at most one `Primary` per cluster if `RedundancyMode.WarmActive`, at most two nodes total in v2.0 per decision #83).
|
||||
2. **A.2** Topology subscription — coordinator re-reads on `sp_PublishGeneration` confirmation so an operator role-swap takes effect after publish (no process restart needed).
|
||||
3. **A.3** Tests: two-node cluster seed, one-node cluster seed (degenerate), duplicate-uri rejection.
|
||||
|
||||
### Stream B — Peer health probing + ServiceLevel computation (6 days, widened)
|
||||
|
||||
1. **B.1** `PeerHttpProbeLoop` per peer at 2 s — calls peer's `/healthz`, 1 s timeout, exponential backoff on sustained failure. Used as fast-fail.
|
||||
2. **B.2** `PeerUaProbeLoop` per peer at 10 s — opens an OPC UA client session to the peer (reuses Phase 5 `Driver.OpcUaClient` stack), reads peer's `ServiceLevel` node + verifies endpoint serves data. Short-circuit: if HTTP probe is failing, skip UA probe (no wasted sessions).
|
||||
3. **B.3** `ServiceLevelCalculator.Compute(role, selfHealth, peerHttpHealthy, peerUaHealthy, applyInProgress, recoveryDwellMet, topologyValid) → byte`. 8-state matrix per §Scope. `topologyValid=false` forces InvalidTopology = 2 regardless of other inputs.
|
||||
4. **B.4** `RecoveryStateManager`: after a `Faulted → Healthy` transition, hold the node in the `Recovering` band (180 Primary / 30 Backup) for `RecoveryDwellTime` (default 60 s) AND require one positive publish witness (successful `Read` on a reference node) before entering the Authoritative band.
|
||||
5. **B.5** Calculator reacts to inputs via `IObserver` so changes immediately push to the OPC UA `ServiceLevel` node.
|
||||
6. **B.6** Tests: **64-case matrix** covering role × self-health × peer-http × peer-ua × apply × recovery × topology. Specific cases flagged: Primary-with-unreachable-peer-serves-at-230 (authority retained); Backup-with-unreachable-primary-escalates-to-80 (not auto-promote); InvalidTopology demotes both nodes; Recovering dwell + publish-witness blocks premature return to 255.
|
||||
|
||||
### Stream C — OPC UA node wiring (3 days)
|
||||
|
||||
1. **C.1** `ServiceLevel` variable node created under `ServerStatus` at server startup. Type `Byte`, AccessLevel = CurrentRead only. Subscribe to `ServiceLevelCalculator` observable; push updates via `DataChangeNotification`.
|
||||
2. **C.2** `ServerUriArray` variable node under `ServerCapabilities`. Array of `String`, **includes self + peers** with deterministic ordering (self first). Updates on topology change. Compliance test asserts local-plus-peer membership.
|
||||
3. **C.3** `RedundancySupport` variable — static at startup from `RedundancyMode`. Values: `None`, `Cold`, `Warm`, `WarmActive`, `Hot`. Unsupported values (`Transparent`, `HotAndMirrored`) are rejected **pre-publish** by validator — runtime never sees them.
|
||||
4. **C.4** Client.CLI cutover test: connect to primary, read `ServiceLevel` → 255; pause primary apply → 200; unreachable peer while apply in progress → 200 (apply dominates peer-unreachable per matrix); client sees peer via `ServerUriArray`; fail primary → client reconnects to peer at 80 (isolated-backup band).
|
||||
|
||||
### Stream D — Apply-window integration (3 days)
|
||||
|
||||
1. **D.1** `sp_PublishGeneration` caller wraps the apply in `await using var lease = coordinator.BeginApplyLease(generationId, publishRequestId)`. Lease keyed to `(ConfigGenerationId, PublishRequestId)` so concurrent publishes stay isolated. Disposal decrements on every exit path.
|
||||
2. **D.2** `ApplyLeaseWatchdog` auto-closes leases older than `ApplyMaxDuration` (default 10 min) so a crashed publisher can't pin the node at mid-apply.
|
||||
3. **D.3** Pre-publish validator in `sp_PublishGeneration` rejects unsupported `RedundancyMode` values (`Transparent`, `HotAndMirrored`) with a clear error message — runtime never sees an invalid mode.
|
||||
4. **D.4** Tests: (a) mid-apply client subscribes → sees ServiceLevel drop → sees restore; (b) lease leak via `ThreadAbort` / cancellation → watchdog closes; (c) publish rejected for `Transparent` → operator-actionable error.
|
||||
|
||||
### Stream E — Admin UI + metrics (3 days)
|
||||
|
||||
1. **E.1** `RedundancyTab.razor` under `/cluster/{id}/redundancy`. Shows each node's role, current ServiceLevel (with band label per 8-state matrix), peer reachability (HTTP + UA probe separately), last apply timestamp. Role-swap button posts a draft edit on `ClusterNode.RedundancyRole`; publish applies.
|
||||
2. **E.2** OpenTelemetry meter export: `ot_opcua_service_level{cluster,node}` gauge + `ot_opcua_peer_reachable{cluster,node,peer,kind=http|ua}` + `ot_opcua_apply_in_progress{cluster,node}` + `ot_opcua_topology_valid{cluster}`. Sink via Phase 6.1 observability.
|
||||
3. **E.3** SignalR push: `FleetStatusHub` broadcasts ServiceLevel changes so the Admin UI updates within ~1 s of the coordinator observing a peer flip.
|
||||
|
||||
### Stream F — Client-interoperability matrix (3 days, new)
|
||||
|
||||
1. **F.1** Validate ServiceLevel-driven cutover against **Ignition 8.1 + 8.3**, **Kepware KEPServerEX 6.x**, **Aveva OI Gateway 2020R2 + 2023R1**. For each: configure the client with both endpoints, verify it honors `ServiceLevel` + `ServerUriArray` during primary failover.
|
||||
2. **F.2** Clients that don't honour the standards (doc field — may include Kepware and OI Gateway per Codex review) get an explicit compatibility-matrix entry: "requires manual backup-endpoint config / vendor-specific redundancy primitives". Documented in `docs/Redundancy.md`.
|
||||
3. **F.3** Galaxy MXAccess failover test — boot Galaxy.Proxy on both nodes, kill Primary, assert Galaxy consumer reconnects to Backup within `(SessionTimeout + KeepAliveInterval × 3)`. Document required session-timeout config in `docs/Redundancy.md`.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **OPC UA band compliance**: `0=Maintenance` reserved, `1=NoData` reserved. Operational states in 2..255 per 8-state matrix.
|
||||
- [ ] **Authoritative-Primary** ServiceLevel = 255.
|
||||
- [ ] **Isolated-Primary** (peer unreachable, self serving) = 230 — Primary retains authority.
|
||||
- [ ] **Primary-Mid-Apply** = 200.
|
||||
- [ ] **Recovering-Primary** = 180 with dwell + publish witness enforced.
|
||||
- [ ] **Authoritative-Backup** = 100.
|
||||
- [ ] **Isolated-Backup** (primary unreachable) = 80 — does NOT auto-promote.
|
||||
- [ ] **InvalidTopology** = 2 — both nodes self-demote when >1 Primary is detected at runtime.
|
||||
- [ ] **ServerUriArray** returns self + peer URIs, self first.
|
||||
- [ ] **UaHealthProbe authority**: integration test — peer returns HTTP 200 but OPC UA endpoint unreachable → coordinator treats peer as UA-unhealthy; peer is not a valid authority source.
|
||||
- [ ] **Apply-lease disposal**: leases close on exception, cancellation, and watchdog timeout; ServiceLevel never sticks at mid-apply band.
|
||||
- [ ] **Transparent-mode rejection**: attempting to publish `RedundancyMode=Transparent` is blocked at `sp_PublishGeneration`; runtime never sees an invalid mode.
|
||||
- [ ] **Role transition via operator publish**: FleetAdmin swaps `RedundancyRole` in a draft, publishes; both nodes re-read topology on publish confirmation + flip ServiceLevel — no restart.
|
||||
- [ ] **Client.CLI cutover**: with primary halted, Client.CLI that was connected to primary sees primary drop + reconnects to backup via `ServerUriArray`.
|
||||
- [ ] **Client interoperability matrix** (Stream F): Ignition 8.1 + 8.3 honour ServiceLevel; Kepware + Aveva OI Gateway findings documented.
|
||||
- [ ] **Galaxy MXAccess failover**: end-to-end test — primary kill → Galaxy consumer reconnects to backup within session-timeout budget.
|
||||
- [ ] No regression in existing driver test suites; no regression in `/healthz` reachability under redundancy load.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| Split-brain from operator race (both nodes marked Primary) | Low | High | Coordinator rejects startup if its cluster has >1 Primary row; logs + fails fast. If >1 Primary appears at runtime, both nodes enter `InvalidTopology` (ServiceLevel 2) until convergence. Document as a publish-time validation in `sp_PublishGeneration`. |
|
||||
| ServiceLevel thrashing on flaky peer | Medium | Medium | 2 s probe interval + 3-sample smoothing window; only declares a peer unreachable after 3 consecutive failed probes |
|
||||
| Client ignores ServiceLevel and stays on broken primary | Medium | Medium | Documented in `docs/Redundancy.md` — non-transparent redundancy requires client cooperation. `ServiceLevel` compliance of Ignition, Kepware, and Aveva OI Gateway is unverified until the Stream F interoperability matrix runs; clients that ignore it are documented as requiring explicit backup-endpoint config. Unit-test the advertised values; field behavior is the client's responsibility |
|
||||
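| Apply-lease leaks on exception or cancellation | Low | High | `BeginApplyLease` returns a lease disposed via `await using`, which enforces disposal on every exit path; `ApplyLeaseWatchdog` force-closes leases older than `ApplyMaxDuration`; unit test for exception-in-apply path |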
|
||||
| `HttpClient` probe leaks sockets | Low | Medium | Single shared `HttpClient` per coordinator (not per-probe); timeouts tight to avoid keeping connections open during peer downtime |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: topology loader + tests
|
||||
- [ ] Stream B: peer probe + ServiceLevel calculator + 64-case matrix tests
|
||||
- [ ] Stream C: ServiceLevel / ServerUriArray / RedundancySupport node wiring + Client.CLI smoke test
|
||||
- [ ] Stream D: apply-lease integration + `ApplyLeaseWatchdog` + pre-publish `RedundancyMode` validator
|
||||
- [ ] Stream E: Admin `RedundancyTab` + OpenTelemetry metrics + SignalR push
- [ ] Stream F: client-interoperability matrix + Galaxy MXAccess failover test
|
||||
- [ ] `phase-6-3-compliance.ps1` exits 0; exit-gate doc; `docs/Redundancy.md` updated with the ServiceLevel matrix
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, thread `019da490-3fa0-7340-98b8-cceeca802550`)
|
||||
|
||||
1. **Crit · ACCEPT** — No publish-generation fencing enables split-publish advertising both as authoritative. **Change**: coordinator CAS on a monotonic `ConfigGenerationId`; every topology decision is generation-stamped; peers reject state propagated from a lower generation.
|
||||
2. **Crit · ACCEPT** — `>1 Primary` at startup covered but runtime containment missing when invalid topology appears later (mid-apply race). **Change**: add runtime `InvalidTopology` state — both nodes self-demote to ServiceLevel 2 (the "detected inconsistency" band, below normal operation) until convergence.
|
||||
3. **High · ACCEPT** — `0 = Faulted` collides with OPC UA Part 5 §6.3.34 semantics where 0 means **Maintenance** and 1 means NoData. **Change**: reserve **0** for operator-declared maintenance-mode only; Faulted/unreachable uses **1** (NoData); in-range operational states occupy 2..255.
|
||||
4. **High · ACCEPT** — Matrix collapses distinct operational states onto the same value. **Change**: matrix expanded to Authoritative-Primary=255, Isolated-Primary=230 (peer unreachable — still serving), Primary-Mid-Apply=200, Recovering-Primary=180, Authoritative-Backup=100, Isolated-Backup=80 (primary unreachable — "take over if asked"), Backup-Mid-Apply=50, Recovering-Backup=30.
|
||||
5. **High · ACCEPT** — `/healthz` from 6.1 is HTTP-healthy but doesn't guarantee OPC UA data plane. **Change**: add a redundancy-specific probe `UaHealthProbe` — issues a `ReadAsync(ServiceLevel)` against the peer's OPC UA endpoint via a lightweight client session. `/healthz` remains the fast-fail; the UA probe is the authority signal.
|
||||
6. **High · ACCEPT** — `ServerUriArray` must include self + peers, not peers only. **Change**: array contains `[self.ApplicationUri, peer.ApplicationUri]` in stable deterministic ordering; compliance test asserts local-plus-peer membership.
|
||||
7. **Med · ACCEPT** — No `Faulted → Recovering → Healthy` path. **Change**: add `Recovering` state with min dwell time (60 s default) + positive publish witness (one successful Read on a reference node) before returning to Healthy. Thrash-prevention.
|
||||
8. **Med · ACCEPT** — Topology change during in-flight probe undefined. **Change**: every probe task tagged with `ConfigGenerationId` at dispatch; obsolete results discarded; in-flight probes cancelled on topology reload.
|
||||
9. **Med · ACCEPT** — Apply-window counter race on exception/cancellation/async ownership. **Change**: apply-window is a named lease keyed to `(ConfigGenerationId, PublishRequestId)` with disposal enforced via `await using`; watchdog detects leased-but-abandoned and force-closes after `ApplyMaxDuration` (default 10 min).
|
||||
10. **High · ACCEPT** — Ignition + Kepware + Aveva OI Gateway `ServiceLevel` compliance is unverified. **Change**: risk elevated to High; add Stream F (new) — build an interop matrix: validate against Ignition 8.1/8.3, Kepware KEPServerEX 6.x, Aveva OI Gateway 2020R2 + 2023R1. Document per-client cutover behaviour. Field deployments get a documented compatibility table; clients that ignore ServiceLevel documented as requiring explicit backup-endpoint config.
|
||||
11. **Med · ACCEPT** — Galaxy MXAccess re-session on Primary death not in acceptance. **Change**: Stream F adds an end-to-end failover smoke test that boots Galaxy.Proxy on both nodes, kills Primary, asserts Galaxy consumer reconnects to Backup within `(SessionTimeout + KeepAliveInterval × 3)` budget. `docs/Redundancy.md` updated with required session timeouts.
|
||||
12. **Med · ACCEPT** — Transparent-mode startup rejection is outage-prone. **Change**: `sp_PublishGeneration` validates `RedundancyMode` pre-publish — unsupported values reject the publish attempt with a clear validation error; runtime never sees an unsupported mode. Last-good config stays active.
|
||||
|
||||
134
docs/v2/implementation/phase-6-4-admin-ui-completion.md
Normal file
134
docs/v2/implementation/phase-6-4-admin-ui-completion.md
Normal file
@@ -0,0 +1,134 @@
|
||||
# Phase 6.4 — Admin UI Completion
|
||||
|
||||
> **Status**: DRAFT — Phase 1 Stream E shipped the Admin scaffold + core pages; several feature-completeness items from its completion checklist (`phase-1-configuration-and-admin-scaffold.md` §Stream E) never landed. This phase closes them.
|
||||
>
|
||||
> **Branch**: `v2/phase-6-4-admin-ui-completion`
|
||||
> **Estimated duration**: 2 weeks
|
||||
> **Predecessor**: Phase 6.3 (Redundancy runtime) — reuses the `/cluster/{id}` page layout for the new tabs
|
||||
> **Successor**: v2 release-readiness capstone (Task #121)
|
||||
|
||||
## Phase Objective
|
||||
|
||||
Close the Admin UI feature-completeness checklist that Phase 1 Stream E exit gate left open. Each item below is an existing `phase-1-configuration-and-admin-scaffold.md` completion-checklist entry that is currently unchecked.
|
||||
|
||||
Gaps to close:
|
||||
|
||||
1. **UNS Structure tab drag/move with impact preview** — decision #115 + `admin-ui.md` §"UNS". Current state: list-only render; no drag reorder; no "X lines / Y equipment impacted" preview.
|
||||
2. **Equipment CSV import + 5-identifier search** — decision #95 + #117. Current state: basic form; no CSV parser; search indexes only ZTag.
|
||||
3. **Draft-generation diff viewer** — enhance existing `DiffViewer.razor` to show generation-diff not just staged-edit diff; highlight ACL grant changes (lands after Phase 6.2).
|
||||
4. **`_base` equipment-class Identification fields exposure** — decision #138–139. Columns exist on `Equipment`; no Admin UI field group; no address-space exposure of the OPC 40010 sub-folder.
|
||||
|
||||
## Scope — What Changes
|
||||
|
||||
| Concern | Change |
|
||||
|---------|--------|
|
||||
| `Admin/Pages/UnsTab.razor` | Tree component with drag-drop using **`MudBlazor.TreeView` + `MudBlazor.DropTarget`** (existing transitive dep — no new third-party package). Native HTML5 DnD rejected because virtualization + DnD on 500+ nodes doesn't combine reliably. Each drag fires a "Compute Impact" call carrying a `DraftRevisionToken`; modal preview ("Moving Line 'Oven-2' from 'Packaging' to 'Assembly' will re-home 14 equipment + re-parent 237 tags"). **Confirm step re-checks the token** and rejects with a `409 Conflict / refresh-required` modal if the draft advanced between preview and commit. |
|
||||
| `Admin/Services/UnsImpactAnalyzer.cs` | New service. Given a move-operation (line move, area rename, line merge), computes cascade counts + `DraftRevisionToken` at preview time. Pure-function shape; testable in isolation. |
|
||||
| `Admin/Pages/EquipmentTab.razor` | Add CSV-import button → modal with file picker + dry-run preview. **Identifier search** uses the canonical decision #117 set: `ZTag / MachineCode / SAPID / EquipmentId / EquipmentUuid`. Typeahead probes each column with a ranking query (exact match score 100 → prefix 50 → opt-in LIKE 20; published > draft tie-break). Result row shows which field matched via trailing badge. |
|
||||
| `Admin/Services/EquipmentCsvImporter.cs` | New service. CSV header row must start with `# OtOpcUaCsv v1` (version marker — future shape changes bump the version). Columns: `ZTag, MachineCode, SAPID, EquipmentId, EquipmentUuid, Name, UnsAreaName, UnsLineName, Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. Parser rejects unknown columns + blank required fields + duplicate ZTags + missing UnsLines. |
|
||||
| **Staged-import table** `EquipmentImportBatch` | New entity `{ Id, CreatedAtUtc, CreatedBy, RowsStaged, RowsAccepted, RowsRejected, FinalisedAtUtc? }` + child `EquipmentImportRow` records. Import writes rows in chunks to the staging table (not to `Equipment`). `FinaliseImportBatch` is the atomic finalize step that applies all accepted rows to `Equipment` + `ExternalIdReservation` in one transaction — short + bounded regardless of input size. Rollback = drop the batch row; `Equipment` never partially mutates. |
|
||||
| `Admin/Pages/DraftEditor.razor` + `DiffViewer.razor` | Diff viewer refactored into a base component + section plugins: `StructuralDiffSection`, `EquipmentDiffSection`, `TagDiffSection`, `AclDiffSection` (Phase 6.2), `RedundancyDiffSection` (Phase 6.3), `IdentificationDiffSection`. Each section has a **1000-row hard cap**; over-cap renders an aggregate summary + "Load full diff" button streaming 500-row pages via SignalR. Subtree-rename diffs (decision #115 bulk restructure) surface as summary only by default. |
|
||||
| `Admin/Components/IdentificationFields.razor` | New component. Renders the OPC 40010 field set **per decision #139**: `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. `ProductInstanceUri / DeviceRevision / MonthOfConstruction` dropped from this phase — they need a separate decision-log widening. |
|
||||
| `OtOpcUa.Server/OpcUa/DriverNodeManager` — Equipment folder build | When an `Equipment` row has non-null Identification fields, the server adds an `Identification` sub-folder under the Equipment node containing one variable per non-null field. **ACL binding**: the sub-folder + variables inherit the `Equipment` scope's grants from Phase 6.2's trie — no new scope level added. Documented in `acl-design.md` cross-reference update. |
|
||||
|
||||
## Scope — What Does NOT Change
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| Admin UI visual language | Bootstrap 5 / cookie auth / sidebar layout unchanged — consistency with ScadaLink design reference. |
|
||||
| LDAP auth flow | Already shipped in Phase 1. Phase 6.4 is additive UI only. |
|
||||
| Core abstractions / driver layer | Admin UI changes don't touch drivers. |
|
||||
| Equipment-class *template schema validation* | Still deferred (decision #112 — schemas repo not landed). We expose the Identification fields but don't validate against a template hierarchy. |
|
||||
| Drag/move to *other clusters* | Out of scope — equipment is cluster-scoped per decision #82. Cross-cluster migration is a different workflow. |
|
||||
|
||||
## Entry Gate Checklist
|
||||
|
||||
- [ ] Phase 6.2 merged (ACL grants are part of the new diff viewer sections)
|
||||
- [ ] Phase 6.3 merged (redundancy-role changes are part of the diff viewer)
|
||||
- [ ] `phase-1-configuration-and-admin-scaffold.md` §Stream E completion checklist re-read — confirm these are the remaining items
|
||||
- [ ] `admin-ui.md` re-skimmed for screen layouts
|
||||
- [ ] Existing `EquipmentTab.razor` / `UnsTab.razor` / `DraftEditor.razor` diff'd against what ships today so the edits are additive not destructive
|
||||
- [ ] Dev Galaxy available for OPC 40010 exposure smoke testing
|
||||
|
||||
## Task Breakdown
|
||||
|
||||
### Stream A — UNS drag/reorder + impact preview (5 days)
|
||||
|
||||
1. **A.1** 1000-node synthetic seed fixture. Drag-latency bench against `MudBlazor.TreeView` + `MudBlazor.DropTarget` — commit to the component if latency budget (100 ms drag-enter feedback) holds; fall back to flat-list reorder UI (Area/Line dropdowns) with loss of visual drag affordance otherwise.
|
||||
2. **A.2** `UnsImpactAnalyzer` service. Inputs: `(DraftGenerationId, MoveOperation, DraftRevisionToken)`. Outputs: `ImpactPreview { AffectedEquipmentCount, AffectedTagCount, CascadeWarnings[], DraftRevisionToken }`. Pure-function shape; testable in isolation.
|
||||
3. **A.3** Modal preview wired to `UnsImpactAnalyzer`. **Confirm** re-reads the current draft revision + compares against the preview's token; if the draft advanced (another operator saved a different edit), show a `409 Conflict / refresh-required` modal rather than silently overwriting.
|
||||
4. **A.4** Cross-cluster drop attempts: target disabled + toast "Equipment is cluster-scoped (decision #82). To move across clusters, use Export → Import on the Cluster detail page." Plus help link.
|
||||
5. **A.5** Playwright (or equivalent) smoke test: drag a line across areas, assert modal shows right counts, assert draft row reflects the move; concurrent-edit test runs two sessions + asserts the later Confirm hits the 409.
|
||||
|
||||
### Stream B — Equipment CSV import + 5-identifier search (5 days)
|
||||
|
||||
1. **B.1** `EquipmentCsvImporter`. Strict RFC 4180 parser (per decision #95). Header row validation: first line must match `# OtOpcUaCsv v1` — future versions fork parser versions. Required columns: `ZTag, MachineCode, SAPID, EquipmentId, EquipmentUuid, Name, UnsAreaName, UnsLineName`. Optional: `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. Parser rejects unknown columns + blank required fields + duplicate ZTags.
|
||||
2. **B.2** `EquipmentImportBatch` + `EquipmentImportRow` staging tables (migration). Import writes preview rows to staging via chunked inserts; staging never blocks `Equipment` or `ExternalIdReservation`. Preview query reads staging + validates each row against the current `Equipment` state + `ExternalIdReservation` freshness.
|
||||
3. **B.3** `ImportPreview` UI — per-row accept/reject table. Reject reasons: "ZTag already exists in draft", "ExternalIdReservation conflict with Cluster X", "UnsLineName not found in draft UNS tree", etc. Operator reviews + clicks "Commit".
|
||||
4. **B.4** `FinaliseImportBatch` — atomic finalize. One EF transaction applies accepted rows to `Equipment` + `ExternalIdReservation`; duration bounded regardless of input size (the atomic step is a bulk insert, not row-by-row). Rollback = drop batch row via `DropImportBatch`; `Equipment` never partially mutates.
|
||||
5. **B.5** Five-identifier search. Rank SQL: exact match any identifier = score 100, prefix match = 50, LIKE-fuzzy (opt-in via `?fuzzy=true`) = 20; tie-break `published > draft` then `RowVersion DESC`. Typeahead shows which field matched via trailing badge.
|
||||
6. **B.6** Smoke tests: 100-row CSV with 10 conflicts (5 ZTag dupes, 3 reservation clashes, 2 missing UnsLines); 10k-row perf test asserting finalize txn < 30 s; concurrent import + external `ExternalIdReservation` insert test asserts retryable-conflict handling.
|
||||
|
||||
### Stream C — Diff viewer enhancements (4 days)
|
||||
|
||||
1. **C.1** Refactor `DiffViewer.razor` into a base component + section plugins. Plugins: `StructuralDiffSection` (UNS tree), `EquipmentDiffSection`, `TagDiffSection`, `AclDiffSection` (Phase 6.2), `RedundancyDiffSection` (Phase 6.3), `IdentificationDiffSection`.
|
||||
2. **C.2** Each section renders collapsed by default; counts + top-line summary always visible. **1000-row hard cap** per section — over-cap sections render aggregate summary (e.g. "237 equipment re-parented from Packaging to Assembly") with a "Load full diff" button that streams 500-row pages via SignalR.
|
||||
3. **C.3** Subtree-rename diffs (decision #115 bulk restructure) surface as summary only by default regardless of row count.
|
||||
4. **C.4** Tests: seed two generations with deliberate diffs; assert every section reports the right counts + top-line summary + hard-cap behavior.
|
||||
|
||||
### Stream D — OPC 40010 Identification exposure (3 days)
|
||||
|
||||
1. **D.1** `IdentificationFields.razor` component. Renders the **9 decision #139 fields**: `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. Labelled inputs; nullable columns show empty input; required-field validation on commit only.
|
||||
2. **D.2** `DriverNodeManager` equipment-folder builder — after building the equipment node, inspect the 9 Identification columns; if any non-null, add an `Identification` sub-folder with variable-per-non-null-field. ACL binding: sub-folder + variables inherit the **same `ScopeId` as the Equipment node** (Phase 6.2's trie treats them as part of the Equipment scope — no new scope level).
|
||||
3. **D.3** Address-space smoke test via Client.CLI: browse an equipment node, assert `Identification` sub-folder present when columns are set, absent when all null, variables match the field values.
|
||||
4. **D.4** ACL integration test: a user with Equipment-level grant reads the `Identification` variables without needing a separate grant; a user without the Equipment grant gets `BadUserAccessDenied` on both the Equipment node + its Identification variables.
|
||||
|
||||
## Compliance Checks (run at exit gate)
|
||||
|
||||
- [ ] **UNS drag/move**: drag a line across areas; modal preview shows correct impacted-equipment + impacted-tag counts.
|
||||
- [ ] **Concurrent-edit safety**: two-session test — session B saves a draft edit after session A opened the preview; session A's Confirm returns `409 Conflict / refresh-required` instead of overwriting.
|
||||
- [ ] **Cross-cluster drop**: dropping equipment across cluster boundaries is disabled + shows actionable toast pointing to Export/Import workflow.
|
||||
- [ ] **1000-node tree**: drag operations on a 1000-node seed maintain < 100 ms drag-enter feedback.
|
||||
- [ ] **CSV header version**: file missing `# OtOpcUaCsv v1` first line is rejected pre-parse.
|
||||
- [ ] **CSV canonical identifier set**: columns match decision #117 (ZTag / MachineCode / SAPID / EquipmentId / EquipmentUuid); drift from the earlier draft surfaces as a test failure.
|
||||
- [ ] **Staged-import atomicity**: `FinaliseImportBatch` transaction bounded < 30 s for a 10k-row import; pre-finalize stagings visible only to the importing user; rollback via `DropImportBatch`.
|
||||
- [ ] **Concurrent import + external reservation**: concurrent test — third party inserts to `ExternalIdReservation` mid-finalize; finalize retries with conflict handling; no corruption.
|
||||
- [ ] **5-identifier search ranking**: exact matches outrank prefix matches; published outranks draft for equal scores.
|
||||
- [ ] **Diff viewer section caps**: 2000-row subtree-rename diff renders as summary only; "Load full diff" streams in pages.
|
||||
- [ ] **OPC 40010 field list match**: rendered field group matches decision #139 exactly; no extra fields.
|
||||
- [ ] **OPC 40010 exposure**: Client.CLI browse shows `Identification` sub-folder when equipment has non-null columns; absent when all null.
|
||||
- [ ] **ACL inheritance for Identification**: integration test — Equipment-grant user reads Identification; no-grant user gets `BadUserAccessDenied` on both.
|
||||
- [ ] **Visual parity reviewer**: named role (`FleetAdmin` user, not the implementation lead) compares side-by-side against `admin-ui.md` §Visual-Design reference panels; signoff artefact is a checked-in screenshot set under `docs/v2/visual-compliance/phase-6-4/`.
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|------|:----------:|:------:|------------|
|
||||
| UNS drag-drop janky on large trees (>500 nodes) | Medium | Medium | Virtualize the tree component; default-collapse nested areas; test with a synthetic 1000-equipment seed |
|
||||
| CSV import performance on 10k-row imports | Medium | Medium | Stream-parse rather than load-into-memory; preview renders in batches of 100; staging is chunked-EF-insert with progress bar; finalize is one atomic bulk apply (decision #160) |
|
||||
| Diff viewer becomes unwieldy with many sections | Low | Medium | Each section collapsed by default; top-line summary row always shown; Phase 6.4 caps at 6 sections |
|
||||
| OPC 40010 sub-folder accidentally exposes NULL/empty identification columns as empty-string variables | Low | Low | Column null-check in the builder; drop variables whose DB value is null |
|
||||
| 5-identifier search pulls full table | Medium | Medium | Indexes on each of ZTag/MachineCode/SAPID/EquipmentId/EquipmentUuid (decision #117 canonical set); search query uses a UNION of 5 indexed lookups; falls back to LIKE only on explicit operator opt-in |
|
||||
|
||||
## Completion Checklist
|
||||
|
||||
- [ ] Stream A: `UnsImpactAnalyzer` + drag-drop tree + modal preview + Playwright smoke
|
||||
- [ ] Stream B: `EquipmentCsvImporter` + preview modal + 5-identifier search + conflict-rollback test
|
||||
- [ ] Stream C: `DiffViewer` refactor + 6 section plugins + 2-generation diff test
|
||||
- [ ] Stream D: `IdentificationFields.razor` + address-space builder change + Client.CLI browse test
|
||||
- [ ] Visual-compliance reviewer signoff
|
||||
- [ ] Full solution `dotnet test` passes; `phase-6-4-compliance.ps1` exits 0; exit-gate doc
|
||||
|
||||
## Adversarial Review — 2026-04-19 (Codex, via `codex-rescue` subagent)
|
||||
|
||||
1. **Crit · ACCEPT** — Stale UNS impact preview can overwrite concurrent draft edits. **Change**: each preview carries a `DraftRevisionToken`; `Confirm` compares against the current draft + rejects with a `409 Conflict / refresh-required` modal if any draft edit landed since the preview was generated. Stream A.3 updated.
|
||||
2. **High · ACCEPT** — CSV import atomicity is internally contradictory (single EF transaction vs. chunked inserts). **Change**: one explicit model — staged-import table (`EquipmentImportBatch { Id, CreatedAtUtc, RowsStaged, RowsAccepted, RowsRejected }`) receives rows in chunks; final `FinaliseImportBatch` is atomic over `Equipment` + `ExternalIdReservation`. Rollback is "drop the batch row" — the real Equipment table is never partially mutated.
|
||||
3. **Crit · ACCEPT** — Identifier contract rewrite mis-cites decisions. **Change**: revert to the `admin-ui.md` + decision #117 canonical set — `ZTag / MachineCode / SAPID / EquipmentId / EquipmentUuid`. CSV header follows that set verbatim. Introduce a separate decision entry for versioned CSV header shape before adding any new column; CSV header row must start with `# OtOpcUaCsv v1` so future shape changes are unambiguous.
|
||||
4. **Med · ACCEPT** — Search ordering undefined. **Change**: rank SQL — exact match on any identifier scores 100; prefix match 50; LIKE-fuzzy 20; published > draft tie-breaker; `ORDER BY score DESC, RowVersion DESC`. Typeahead shows which field matched via trailing badge.
|
||||
5. **High · ACCEPT** — HTML5 DnD on virtualized tree is aspirational. **Change**: Stream A.2 rewritten — commits to **`MudBlazor.TreeView` + `MudBlazor.DropTarget`** (already a transitive dep via the existing Admin UI). Build a 1000-node synthetic seed in A.1 + validate drag-latency budget before implementing impact preview. If MudBlazor can't hit the budget, fall back to a flat-list reorder UI with Area/Line dropdowns (loss of visual drag affordance but unblocks the feature).
|
||||
6. **Med · ACCEPT** — Collapsed-by-default doesn't handle generation-sized diffs. **Change**: each diff section has a hard row cap (1000 by default). Over-cap sections render an aggregate summary + "Load full diff" button that streams via SignalR in 500-row pages. Decision #115 subtree renames surface as a "N equipment re-parented under X → Y" summary instead of row-by-row.
|
||||
7. **High · ACCEPT** — OPC 40010 field list doesn't match decision #139. **Change**: field group realigned to `Manufacturer, Model, SerialNumber, HardwareRevision, SoftwareRevision, YearOfConstruction, AssetLocation, ManufacturerUri, DeviceManualUri`. `ProductInstanceUri / DeviceRevision / MonthOfConstruction` dropped from Phase 6.4 — they belong to a future OPC 40010 widening decision.
|
||||
8. **High · ACCEPT** — `Identification` subtree unreconciled with ACL hierarchy (Phase 6.2 6-level scope). **Change**: address-space builder creates the Identification sub-folder under the Equipment node **with the same ScopeId as Equipment** — no new scope level. ACL evaluator treats `…/Equipment/Identification/X` as inheriting the `Equipment` scope's grants. Documented in Phase 6.2's `acl-design.md` cross-reference update.
|
||||
9. **Low · ACCEPT** — Visual-review gate names nonexistent reviewer role. **Change**: rubric defined — a named "Admin UX reviewer" (role `FleetAdmin` user, not the implementation lead) compares side-by-side screenshots against the `admin-ui.md` §Visual-Design reference panels; signoff artefact is a checked-in screenshot set under `docs/v2/visual-compliance/phase-6-4/`.
|
||||
10. **Med · ACCEPT** — Cross-cluster drag/drop lacks loud failure path. **Change**: on drop across cluster boundary, disable the drop target + show a toast "Equipment is cluster-scoped (decision #82). To move across clusters, use the Export → Import workflow on the Cluster detail page." Plus a help link. Tested in Stream A.4.
|
||||
|
||||
@@ -909,6 +909,26 @@ Each step leaves the system runnable. The generic extraction is effectively free
|
||||
| 140 | Enterprise shortname = `zb` (UNS level-1 segment) | Closes corrections-doc D4. Matches the existing `ZB.MOM.WW.*` namespace prefix used throughout the codebase; short by design since this segment appears in every equipment path (`zb/warsaw-west/bldg-3/line-2/cnc-mill-05/RunState`); operators already say "ZB" colloquially. Admin UI cluster-create form default-prefills `zb` for the Enterprise field. Production deployments use it directly from cluster-create | 2026-04-17 |
|
||||
| 141 | Tier 3 (AppServer IO) cutover is feasible — AVEVA's OI Gateway supports arbitrary upstream OPC UA servers as a documented pattern | Closes corrections-doc E2 with **GREEN-YELLOW** verdict. Multiple AVEVA partners (Software Toolbox, InSource) have published working integrations against four different non-AVEVA upstream servers (TOP Server, OPC Router, OmniServer, Cogent DataHub). No re-architecting of OtOpcUa required. Path: `OPC UA node → OI Gateway → SuiteLink → $DDESuiteLinkDIObject → AppServer attribute`. Recommended AppServer floor: System Platform 2023 R2 Patch 01. Two integrator-burden risks tracked: validation/GxP paperwork (no AVEVA blueprint exists for non-AVEVA upstream servers in Part 11 deployments) and unpublished scale benchmarks (in-house benchmark required before cutover scheduling). See `aveva-system-platform-io-research.md` | 2026-04-17 |
|
||||
| 142 | Phase 1 acceptance includes an end-to-end AppServer-via-OI-Gateway smoke test against OtOpcUa | Catches AppServer-specific quirks (cert exchange via reject-and-trust workflow, endpoint URL must NOT include `/discovery` suffix per Inductive Automation forum failure mode, service-account install required because OI Gateway under SYSTEM cannot connect to remote OPC servers, `Basic256Sha256` + `SignAndEncrypt` + LDAP-username token combination must work end-to-end) early — well before the Year 3 tier-3 cutover schedule. Adds one task to `phase-1-configuration-and-admin-scaffold.md` Stream E (Admin smoke test) | 2026-04-17 |
|
||||
| 143 | Polly per-capability policy — Read / HistoryRead / Discover / Probe / Alarm-subscribe auto-retry; Write does NOT auto-retry unless the tag metadata carries `[WriteIdempotent]` | Decisions #44-45 forbid auto-retry on Write because a timed-out write can succeed on the device + be replayed by the pipeline, duplicating pulses / alarm acks / counter increments / recipe-step advances. Per-capability policy in the shared Polly layer makes the retry safety story explicit; `WriteIdempotentAttribute` on tag definitions is the opt-in surface | 2026-04-19 |
|
||||
| 144 | Polly pipeline key = `(DriverInstanceId, HostName)`, not DriverInstanceId alone | Decision #35 requires per-device isolation. One dead PLC behind a multi-device Modbus driver must NOT open the circuit breaker for healthy sibling hosts. Per-instance pipelines would poison every device behind one bad endpoint | 2026-04-19 |
|
||||
| 145 | Tier A/B/C runtime enforcement splits into `MemoryTracking` (all tiers — soft/hard thresholds log + surface, NEVER kill) and `MemoryRecycle` (Tier C only — requires out-of-process topology). Tier A/B hard-breach logs a promotion-to-Tier-C recommendation; the runtime never auto-kills an in-process driver | Decisions #73-74 reserve process-kill protections for Tier C. An in-process Tier A/B "recycle" would kill every OPC UA session + every other in-proc driver for one leaky instance, blast-radius worse than the leak | 2026-04-19 |
|
||||
| 146 | Memory watchdog uses the hybrid formula `soft = max(multiplier × baseline, baseline + floor)`, with baseline captured as the median of the first 5 min of `GetMemoryFootprint()` samples post-InitializeAsync. Tier-specific constants: A multiplier=3 floor=50 MB, B multiplier=3 floor=100 MB, C multiplier=2 floor=500 MB. Hard = 2 × soft | Codex adversarial review on the Phase 6.1 plan flagged that hardcoded per-tier MB bands diverge from decision #70's specified formula. Static bands false-trigger on small-footprint drivers + miss meaningful growth on large ones. Observed-baseline + hybrid formula recovers the original intent | 2026-04-19 |
|
||||
| 147 | `WedgeDetector` uses demand-aware criteria `(state==Healthy AND hasPendingWork AND noProgressIn > threshold)`. `hasPendingWork` = (Polly bulkhead depth > 0) OR (active MonitoredItem count > 0) OR (queued historian read count > 0). Idle + subscription-only + write-only-burst drivers stay Healthy without false-fault | Previous "no successful Read in N intervals" formulation flipped legitimate idle subscribers, slow historian backfills, and write-heavy drivers to Faulted. The demand-aware check only fires when the driver claims work is outstanding | 2026-04-19 |
|
||||
| 148 | LiteDB config cache is **generation-sealed**: `sp_PublishGeneration` writes `<cache-root>/<cluster>/<generationId>.db` as a read-only sealed file; cache reads serve the last-known-sealed generation. Mixed-generation reads are impossible | Prior "refresh on every successful query" cache could serve LDAP role mapping from one generation alongside UNS topology from another, producing impossible states. Sealed-snapshot invariant keeps cache-served reads coherent with a real published state | 2026-04-19 |
|
||||
| 149 | `AuthorizationDecision { Allow \| NotGranted \| Denied, IReadOnlyList<MatchedGrant> Provenance }` — tri-state internal model. Phase 6.2 only produces `Allow` + `NotGranted` (grant-only semantics per decision #129); v2.1 Deny widens without API break | bool return would collapse `no-matching-grant` and `explicit-deny` into the same runtime state + UI explanation; provenance record is needed for the audit log anyway. Making the shape tri-state from Phase 6.2 avoids a breaking change in v2.1 | 2026-04-19 |
|
||||
| 150 | Data-plane ACL evaluator consumes `NodeAcl` rows joined against the session's resolved LDAP group memberships. `LdapGroupRoleMapping` (decision #105) is control-plane only — routes LDAP groups to Admin UI roles. Zero runtime overlap between the two | Codex adversarial review flagged that Phase 6.2 draft conflated the two — building the data-plane trie from `LdapGroupRoleMapping` would let a user inherit tag permissions from an admin-role claim path never intended as a data-path grant | 2026-04-19 |
|
||||
| 151 | `UserAuthorizationState` cached per session but bounded by `MembershipFreshnessInterval` (default 15 min). Past that interval the next hot-path authz call re-resolves LDAP group memberships; failure to re-resolve (LDAP unreachable) → fail-closed (evaluator returns `NotGranted` until memberships refresh successfully) | Previous design cached memberships until session close, so a user removed from a privileged LDAP group could keep authorized access for hours. Bounded freshness + fail-closed covers the revoke-takes-effect story | 2026-04-19 |
|
||||
| 152 | Auth cache has its own staleness budget `AuthCacheMaxStaleness` (default 5 min), independent of decision #36's availability-oriented config cache (24 h). Past 5 min on authorization data, evaluator fails closed regardless of whether the underlying config is still serving from cache | Availability-oriented caches trade correctness for uptime. Authorization data is correctness-sensitive — stale ACLs silently extend revoked access. Auth-specific budget keeps the two concerns from colliding | 2026-04-19 |
|
||||
| 153 | MonitoredItem carries `(AuthGenerationId, MembershipVersion)` stamp at create time. On every Publish, items with a mismatching stamp re-evaluate; unchanged items stay fast-path. Revoked items drop to `BadUserAccessDenied` within one publish cycle | Create-time-only authorization leaves revoked users receiving data forever; per-publish re-authorization at 100 ms cadence across 50 groups × 6 levels is too expensive. Stamp-then-reevaluate-on-change balances correctness with cost | 2026-04-19 |
|
||||
| 154 | ServiceLevel reserves `0` for operator-declared maintenance only; `1` = NoData (unreachable / Faulted); operational states occupy `2..255` in an 8-state matrix (Authoritative-Primary=255, Isolated-Primary=230, Primary-Mid-Apply=200, Recovering-Primary=180, Authoritative-Backup=100, Isolated-Backup=80, Backup-Mid-Apply=50, Recovering-Backup=30) plus `InvalidTopology=2` | OPC UA Part 5 §6.3.34 defines `0=Maintenance` + `1=NoData`; using `0` for our Faulted case collides with spec + triggers spec-compliant clients to enter maintenance-mode cutover. Expanded 8-state matrix covers operational states the 5-state original collapsed together (e.g. Isolated-Primary vs Primary-Mid-Apply were both 200) | 2026-04-19 |
|
||||
| 155 | `ServerUriArray` includes self + peers (self first, deterministic ordering), per OPC UA Part 4 §6.6.2.2 | Previous design excluded self from the array — spec violation + clients lose the ability to map server identities consistently during failover | 2026-04-19 |
|
||||
| 156 | Redundancy peer health uses a two-layer probe: `/healthz` (2 s) as fast-fail + `UaHealthProbe` (10 s, opens OPC UA client session to peer + reads its `ServiceLevel` node) as the authority signal. HTTP-healthy ≠ UA-authoritative | `/healthz` returns 200 whenever HTTP + config DB/cache is healthy — but a peer can be HTTP-healthy with a broken OPC UA endpoint or a stuck subscription publisher. Using HTTP alone would advertise authority against servers that can't actually publish data | 2026-04-19 |
|
||||
| 157 | Publish-generation fencing — coordinator CAS on a monotonic `ConfigGenerationId`; every topology + role decision is generation-stamped; peers reject state propagated from a lower generation. Runtime `InvalidTopology` state (both self-demote to ServiceLevel 2) when >1 Primary detected post-startup | Operator race publishing two drafts with different roles can produce two locally-valid views; without fencing + runtime containment both nodes can serve as Primary until manual intervention | 2026-04-19 |
|
||||
| 158 | Apply-window uses named leases keyed to `(ConfigGenerationId, PublishRequestId)` via `await using`. `ApplyLeaseWatchdog` auto-closes any lease older than `ApplyMaxDuration` (default 10 min) | Simple `IDisposable`-counter design leaks on cancellation / async-ownership races; a stuck positive count leaves the node permanently mid-apply. Generation-keyed leases + watchdog bound worst case | 2026-04-19 |
|
||||
| 159 | CSV import header row must start with `# OtOpcUaCsv v1` (version marker). Future shape changes bump the version; parser forks per version. Canonical identifier columns follow decision #117: `ZTag, MachineCode, SAPID, EquipmentId, EquipmentUuid` | Without a version marker the CSV schema has no upgrade path — adding a required column breaks every old export silently. The version prefix makes parser dispatch explicit + future-compatible | 2026-04-19 |
|
||||
| 160 | Equipment CSV import uses a staged-import pattern: `EquipmentImportBatch` + `EquipmentImportRow` tables receive chunked inserts; `FinaliseImportBatch` is one atomic transaction that applies accepted rows to `Equipment` + `ExternalIdReservation`. Rollback = drop the batch row; `Equipment` never partially mutates | 10k-row single-transaction import holds locks too long; chunked direct writes lose all-or-nothing rollback. Staging + atomic finalize bounds transaction duration + preserves rollback semantics | 2026-04-19 |
|
||||
| 161 | UNS drag-reorder impact preview carries a `DraftRevisionToken`; Confirm re-checks against the current draft + returns `409 Conflict / refresh-required` if the draft advanced between preview and commit | Without concurrency control, two operators editing the same draft can overwrite each other's changes silently. Draft-revision token + 409 response makes the race visible + forces refresh | 2026-04-19 |
|
||||
| 162 | OPC 40010 Identification sub-folder exposed under each equipment node inherits the Equipment scope's ACL grants — the ACL trie does NOT add a new scope level for Identification | Adding a new scope level for Identification would require every grant to add a second grant for `Equipment/Identification`; inheriting the Equipment scope keeps the grant model flat + prevents operator-forgot-to-grant-Identification access surprises | 2026-04-19 |
|
||||
|
||||
## Reference Documents
|
||||
|
||||
|
||||
79
scripts/compliance/phase-6-1-compliance.ps1
Normal file
79
scripts/compliance/phase-6-1-compliance.ps1
Normal file
@@ -0,0 +1,79 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.1 exit-gate compliance check — stub. Each `Assert-*` helper prints a
|
||||
color-coded result line; `Assert-Fail` also counts a failure. Any failure = exit 1.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.1 (Resilience & Observability runtime) completion. Checks
|
||||
enumerated in `docs/v2/implementation/phase-6-1-resilience-and-observability.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.1 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-1-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
# Treat errors as terminating, and start the script-scoped failure counter at zero.
# Assert-Fail increments the counter; the final summary reads it to pick the exit code.
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
# Prints a yellow [TODO] line for a check that has no real implementation yet.
#   $Check              - description of the compliance check.
#   $ImplementationTask - plan task expected to replace this TODO with a real check.
# Only writes to the host; it never changes $script:failures, so a TODO cannot fail the gate.
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
# Prints a green [PASS] line for a check that succeeded.
#   $Check - description of the compliance check.
# Only writes to the host; the failure counter is left untouched.
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
# Prints a red [FAIL] line and records the failure.
#   $Check  - description of the compliance check.
#   $Reason - why the check failed (appended after the dash).
# Does not throw: it increments $script:failures so the remaining checks still run
# and the summary at the end of the script turns the count into a non-zero exit.
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.1 compliance — Resilience & Observability runtime ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — Resilience layer"
|
||||
Assert-Todo "Invoker coverage — every capability-interface method routes through CapabilityInvoker (analyzer error-level)" "Stream A.3"
|
||||
Assert-Todo "Write-retry guard — writes without [WriteIdempotent] never retry" "Stream A.5"
|
||||
# Single-quoted so the backticks around the pipeline key print literally; inside a
# double-quoted string PowerShell treats ` as the escape character and drops them.
Assert-Todo 'Pipeline isolation — `(DriverInstanceId, HostName)` key; one dead host does not open breaker for siblings' "Stream A.5"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — Tier A/B/C runtime"
|
||||
Assert-Todo "Tier registry — every driver type has non-null Tier; Tier C declares out-of-process topology" "Stream B.1"
|
||||
Assert-Todo "MemoryTracking never kills — soft/hard breach on Tier A/B logs + surfaces without terminating" "Stream B.6"
|
||||
Assert-Todo "MemoryRecycle Tier C only — hard breach on Tier A never invokes supervisor; Tier C does" "Stream B.6"
|
||||
Assert-Todo "Wedge demand-aware — idle/historic-backfill/write-only cases stay Healthy" "Stream B.6"
|
||||
Assert-Todo "Galaxy supervisor preserved — Driver.Galaxy.Proxy/Supervisor/CircuitBreaker + Backoff still present + invoked" "Stream A.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — Health + logging"
|
||||
Assert-Todo "Health state machine — /healthz + /readyz respond < 500 ms for every DriverState per matrix in plan" "Stream C.4"
|
||||
Assert-Todo "Structured log — CI grep asserts DriverInstanceId + CorrelationId JSON fields present" "Stream C.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — LiteDB cache"
|
||||
Assert-Todo "Generation-sealed snapshot — SQL kill mid-op serves last-sealed snapshot; UsingStaleConfig=true" "Stream D.4"
|
||||
Assert-Todo "Mixed-generation guard — corruption of snapshot file fails closed; no mixed reads" "Stream D.4"
|
||||
Assert-Todo "First-boot no-snapshot + DB-down — InitializeAsync fails with clear error" "Stream D.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "No test-count regression — dotnet test ZB.MOM.WW.OtOpcUa.slnx count ≥ pre-Phase-6.1 baseline" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
# Final verdict: exit 0 when no Assert-Fail was recorded, otherwise print the
# failure count and exit 1.
# NOTE(review): the success message says "all checks TODO"; it will need rewording
# once real checks replace the Assert-Todo calls.
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.1 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.1 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
81
scripts/compliance/phase-6-2-compliance.ps1
Normal file
81
scripts/compliance/phase-6-2-compliance.ps1
Normal file
@@ -0,0 +1,81 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.2 exit-gate compliance check — stub. Each `Assert-*` helper prints a
|
||||
color-coded result line; `Assert-Fail` also counts a failure. Any failure = exit 1.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.2 (Authorization runtime) completion. Checks enumerated
|
||||
in `docs/v2/implementation/phase-6-2-authorization-runtime.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.2 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-2-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
# Treat errors as terminating, and start the script-scoped failure counter at zero.
# Assert-Fail increments the counter; the final summary reads it to pick the exit code.
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
# Prints a yellow [TODO] line for a check that has no real implementation yet.
#   $Check              - description of the compliance check.
#   $ImplementationTask - plan task expected to replace this TODO with a real check.
# Only writes to the host; it never changes $script:failures, so a TODO cannot fail the gate.
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
# Prints a green [PASS] line for a check that succeeded.
#   $Check - description of the compliance check.
# Only writes to the host; the failure counter is left untouched.
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
# Prints a red [FAIL] line and records the failure.
#   $Check  - description of the compliance check.
#   $Reason - why the check failed (appended after the dash).
# Does not throw: it increments $script:failures so the remaining checks still run
# and the summary at the end of the script turns the count into a non-zero exit.
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.2 compliance — Authorization runtime ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — LdapGroupRoleMapping (control plane)"
|
||||
Assert-Todo "Control/data-plane separation — Core.Authorization has zero refs to LdapGroupRoleMapping" "Stream A.2"
|
||||
Assert-Todo "Authoring validation — AclsTab rejects duplicate (LdapGroup, Scope) pre-save" "Stream A.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — Evaluator + trie + cache"
|
||||
Assert-Todo "Trie invariants — PermissionTrieBuilder idempotent (build twice == equal)" "Stream B.1"
|
||||
Assert-Todo "Additive grants + cluster isolation — cross-cluster leakage impossible" "Stream B.1"
|
||||
Assert-Todo "Galaxy FolderSegment coverage — folder-subtree grant cascades; siblings unaffected" "Stream B.2"
|
||||
Assert-Todo "Redundancy-safe invalidation — generation-mismatch forces trie re-load on peer" "Stream B.4"
|
||||
Assert-Todo "Membership freshness — 15 min interval elapsed + LDAP down = fail-closed" "Stream B.5"
|
||||
Assert-Todo "Auth cache fail-closed — 5 min AuthCacheMaxStaleness exceeded = NotGranted" "Stream B.5"
|
||||
Assert-Todo "AuthorizationDecision shape — Allow + NotGranted only; Denied variant exists unused" "Stream B.6"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — OPC UA operation wiring"
|
||||
Assert-Todo "Every operation wired — Browse/Read/Write/HistoryRead/HistoryUpdate/CreateMonitoredItems/TransferSubscriptions/Call/Ack/Confirm/Shelve" "Stream C.1-C.7"
|
||||
Assert-Todo "HistoryRead uses its own flag — Read+no-HistoryRead denies HistoryRead" "Stream C.3"
|
||||
Assert-Todo "Mixed-batch semantics — 3 allowed + 2 denied returns per-item status, no coarse failure" "Stream C.6"
|
||||
Assert-Todo "Browse ancestor visibility — deep grant implies ancestor browse; denied ancestors filter" "Stream C.7"
|
||||
Assert-Todo "Subscription re-authorization — revoked grant surfaces BadUserAccessDenied in one publish" "Stream C.5"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — Admin UI + SignalR invalidation"
|
||||
Assert-Todo "SignalR invalidation — sp_PublishGeneration pushes PermissionTrieCache invalidate < 2 s" "Stream D.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "No test-count regression — dotnet test ZB.MOM.WW.OtOpcUa.slnx count ≥ pre-Phase-6.2 baseline" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
# Final verdict: exit 0 when no Assert-Fail was recorded, otherwise print the
# failure count and exit 1.
# NOTE(review): the success message says "all checks TODO"; it will need rewording
# once real checks replace the Assert-Todo calls.
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.2 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.2 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
85
scripts/compliance/phase-6-3-compliance.ps1
Normal file
85
scripts/compliance/phase-6-3-compliance.ps1
Normal file
@@ -0,0 +1,85 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Phase 6.3 exit-gate compliance check — stub. Each `Assert-*` helper prints a
|
||||
color-coded result line; `Assert-Fail` also counts a failure. Any failure = exit 1.
|
||||
|
||||
.DESCRIPTION
|
||||
Validates Phase 6.3 (Redundancy runtime) completion. Checks enumerated in
|
||||
`docs/v2/implementation/phase-6-3-redundancy-runtime.md`
|
||||
§"Compliance Checks (run at exit gate)".
|
||||
|
||||
Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
|
||||
Each implementation task in Phase 6.3 is responsible for replacing its TODO
|
||||
with a real check before closing that task.
|
||||
|
||||
.NOTES
|
||||
Usage: pwsh ./scripts/compliance/phase-6-3-compliance.ps1
|
||||
Exit: 0 = all checks passed (or are still TODO); non-zero = explicit fail
|
||||
#>
|
||||
[CmdletBinding()]
|
||||
param()
|
||||
|
||||
# Treat errors as terminating, and start the script-scoped failure counter at zero.
# Assert-Fail increments the counter as failures are reported.
$ErrorActionPreference = 'Stop'
|
||||
$script:failures = 0
|
||||
|
||||
# Prints a yellow [TODO] line for a check that has no real implementation yet.
#   $Check              - description of the compliance check.
#   $ImplementationTask - plan task expected to replace this TODO with a real check.
# Only writes to the host; it never changes $script:failures, so a TODO cannot fail the gate.
function Assert-Todo {
|
||||
param([string]$Check, [string]$ImplementationTask)
|
||||
Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
|
||||
}
|
||||
|
||||
# Prints a green [PASS] line for a check that succeeded.
#   $Check - description of the compliance check.
# Only writes to the host; the failure counter is left untouched.
function Assert-Pass {
|
||||
param([string]$Check)
|
||||
Write-Host " [PASS] $Check" -ForegroundColor Green
|
||||
}
|
||||
|
||||
function Assert-Fail {
|
||||
param([string]$Check, [string]$Reason)
|
||||
Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
|
||||
$script:failures++
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "=== Phase 6.3 compliance — Redundancy runtime ===" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
Write-Host "Stream A — Topology loader"
|
||||
Assert-Todo "Transparent-mode rejection — sp_PublishGeneration blocks RedundancyMode=Transparent" "Stream A.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream B — Peer probe + ServiceLevel calculator"
|
||||
Assert-Todo "OPC UA band compliance — 0=Maintenance / 1=NoData reserved; operational 2..255" "Stream B.2"
|
||||
Assert-Todo "Authoritative-Primary ServiceLevel = 255" "Stream B.2"
|
||||
Assert-Todo "Isolated-Primary (peer unreachable, self serving) = 230" "Stream B.2"
|
||||
Assert-Todo "Primary-Mid-Apply = 200" "Stream B.2"
|
||||
Assert-Todo "Recovering-Primary = 180 with dwell + publish witness enforced" "Stream B.2"
|
||||
Assert-Todo "Authoritative-Backup = 100" "Stream B.2"
|
||||
Assert-Todo "Isolated-Backup (primary unreachable) = 80 — no auto-promote" "Stream B.2"
|
||||
Assert-Todo "InvalidTopology = 2 — >1 Primary self-demotes both nodes" "Stream B.2"
|
||||
Assert-Todo "UaHealthProbe authority — HTTP-200 + UA-down peer treated as UA-unhealthy" "Stream B.1"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream C — OPC UA node wiring"
|
||||
Assert-Todo "ServerUriArray — returns self + peer URIs, self first" "Stream C.2"
|
||||
Assert-Todo "Client.CLI cutover — primary halt triggers reconnect to backup via ServerUriArray" "Stream C.4"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream D — Apply-lease + publish fencing"
|
||||
Assert-Todo "Apply-lease disposal — leases close on exception, cancellation, watchdog timeout" "Stream D.2"
|
||||
Assert-Todo "Role transition via operator publish — no restart; both nodes flip ServiceLevel on publish confirm" "Stream D.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Stream F — Interop matrix"
|
||||
Assert-Todo "Client interoperability matrix — Ignition 8.1/8.3 / Kepware / Aveva OI Gateway findings documented" "Stream F.1-F.2"
|
||||
Assert-Todo "Galaxy MXAccess failover — primary kill; Galaxy consumer reconnects within session-timeout budget" "Stream F.3"
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "Cross-cutting"
|
||||
Assert-Todo "No regression in driver test suites; /healthz reachable under redundancy load" "Final exit-gate"
|
||||
|
||||
Write-Host ""
|
||||
if ($script:failures -eq 0) {
|
||||
Write-Host "Phase 6.3 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
|
||||
exit 0
|
||||
}
|
||||
Write-Host "Phase 6.3 compliance: $script:failures FAIL(s)" -ForegroundColor Red
|
||||
exit 1
|
||||
83
scripts/compliance/phase-6-4-compliance.ps1
Normal file
83
scripts/compliance/phase-6-4-compliance.ps1
Normal file
@@ -0,0 +1,83 @@
|
||||
<#
.SYNOPSIS
    Phase 6.4 exit-gate compliance check — stub. Each `Assert-*` helper prints one
    colour-coded result line; `Assert-Fail` additionally counts a failure.
    Non-zero exit = at least one recorded failure.

.DESCRIPTION
    Validates Phase 6.4 (Admin UI completion). Checks enumerated in
    `docs/v2/implementation/phase-6-4-admin-ui-completion.md`
    §"Compliance Checks (run at exit gate)".

    Current status: SCAFFOLD. Every check writes a TODO line and does NOT throw.
    Each implementation task in Phase 6.4 is responsible for replacing its TODO
    with a real check before closing that task.

    The helpers themselves never throw. Because $ErrorActionPreference is 'Stop',
    an unhandled error raised inside a future real check still terminates the script.

.NOTES
    Usage: pwsh ./scripts/compliance/phase-6-4-compliance.ps1
    Exit:  0 = all checks passed (or are still TODO); non-zero = explicit fail
#>
[CmdletBinding()]
param()

$ErrorActionPreference = 'Stop'

# Script-scoped tallies so the helper functions below can update them.
$script:failures = 0
$script:passes = 0
$script:todos = 0

# Records a check that has not been implemented yet. Never fails the run.
function Assert-Todo {
    param([string]$Check, [string]$ImplementationTask)
    Write-Host " [TODO] $Check (implement during $ImplementationTask)" -ForegroundColor Yellow
    $script:todos++
}

# Records a check that was evaluated and passed.
function Assert-Pass {
    param([string]$Check)
    Write-Host " [PASS] $Check" -ForegroundColor Green
    $script:passes++
}

# Records a failed check; the run continues so every failure is listed before exit.
function Assert-Fail {
    param([string]$Check, [string]$Reason)
    Write-Host " [FAIL] $Check — $Reason" -ForegroundColor Red
    $script:failures++
}

Write-Host ""
Write-Host "=== Phase 6.4 compliance — Admin UI completion ===" -ForegroundColor Cyan
Write-Host ""

Write-Host "Stream A — UNS drag/move + impact preview"
Assert-Todo "UNS drag/move — drag line across areas; modal shows correct impacted-equipment + tag counts" "Stream A.2"
Assert-Todo "Concurrent-edit safety — session B saves draft mid-preview; session A Confirm returns 409" "Stream A.3 (DraftRevisionToken)"
Assert-Todo "Cross-cluster drop disabled — actionable toast points to Export/Import" "Stream A.2"
Assert-Todo "1000-node tree — drag-enter feedback < 100 ms" "Stream A.4"

Write-Host ""
Write-Host "Stream B — CSV import + staged-import + 5-identifier search"
Assert-Todo "CSV header version — file missing '# OtOpcUaCsv v1' rejected pre-parse" "Stream B.1"
Assert-Todo "CSV canonical identifier set — columns match decision #117 exactly" "Stream B.1"
Assert-Todo "Staged-import atomicity — 10k-row FinaliseImportBatch < 30 s; user-scoped visibility; DropImportBatch rollback" "Stream B.3"
Assert-Todo "Concurrent import + external reservation — finalize retries with conflict handling; no corruption" "Stream B.3"
Assert-Todo "5-identifier search ranking — exact > prefix; published > draft for equal scores" "Stream B.4"

Write-Host ""
Write-Host "Stream C — DiffViewer sections"
Assert-Todo "Diff viewer section caps — 2000-row subtree-rename summary-only; 'Load full diff' paginates" "Stream C.2"

Write-Host ""
Write-Host "Stream D — Identification (OPC 40010)"
Assert-Todo "OPC 40010 field list match — rendered fields match decision #139 exactly; no extras" "Stream D.1"
Assert-Todo "OPC 40010 exposure — Identification sub-folder shows when non-null; absent when all null" "Stream D.3"
Assert-Todo "ACL inheritance for Identification — Equipment-grant reads; no-grant denies both" "Stream D.4"

Write-Host ""
Write-Host "Visual compliance"
Assert-Todo "Visual parity reviewer — FleetAdmin signoff vs admin-ui.md §Visual-Design; screenshot set checked in under docs/v2/visual-compliance/phase-6-4/" "Visual review"

Write-Host ""
Write-Host "Cross-cutting"
Assert-Todo "Full solution dotnet test passes; no test-count regression vs pre-Phase-6.4 baseline" "Final exit-gate"

Write-Host ""
if ($script:failures -eq 0) {
    if ($script:passes -eq 0) {
        # Nothing has been implemented yet: keep the original scaffold banner verbatim.
        Write-Host "Phase 6.4 compliance: scaffold-mode PASS (all checks TODO)" -ForegroundColor Green
    }
    else {
        # Real checks exist, so report what actually ran instead of claiming "all TODO".
        Write-Host "Phase 6.4 compliance: PASS ($script:passes passed, $script:todos still TODO)" -ForegroundColor Green
    }
    exit 0
}

Write-Host "Phase 6.4 compliance: $script:failures FAIL(s)" -ForegroundColor Red
exit 1
|
||||
@@ -25,6 +25,14 @@ namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
/// OPC UA <c>AlarmConditionState</c> when true. Defaults to false so existing non-Galaxy
|
||||
/// drivers aren't forced to flow a flag they don't produce.
|
||||
/// </param>
|
||||
/// <param name="WriteIdempotent">
|
||||
/// True when a timed-out or failed write to this attribute is safe to replay. Per
|
||||
/// <c>docs/v2/plan.md</c> decisions #44, #45, #143 — writes are NOT auto-retried by default
|
||||
/// because replaying a pulse / alarm-ack / counter-increment / recipe-step advance can
|
||||
/// duplicate field actions. Drivers flag only tags whose semantics make retry safe
|
||||
/// (holding registers with level-set values, set-point writes to analog tags) — the
|
||||
/// capability invoker respects this flag when deciding whether to apply Polly retry.
|
||||
/// </param>
|
||||
public sealed record DriverAttributeInfo(
|
||||
string FullName,
|
||||
DriverDataType DriverDataType,
|
||||
@@ -32,4 +40,5 @@ public sealed record DriverAttributeInfo(
|
||||
uint? ArrayDim,
|
||||
SecurityClassification SecurityClass,
|
||||
bool IsHistorized,
|
||||
bool IsAlarm = false);
|
||||
bool IsAlarm = false,
|
||||
bool WriteIdempotent = false);
|
||||
|
||||
42
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverCapability.cs
Normal file
42
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverCapability.cs
Normal file
@@ -0,0 +1,42 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
/// The driver-capability call sites that Phase 6.1 wraps in a resilience pipeline. Every member
/// stands for one method — or one closely-related group of methods — on the
/// <c>Core.Abstractions</c> capability interfaces: <see cref="IReadable"/>, <see cref="IWritable"/>,
/// <see cref="ITagDiscovery"/>, <see cref="ISubscribable"/>, <see cref="IHostConnectivityProbe"/>,
/// <see cref="IAlarmSource"/> and <see cref="IHistoryProvider"/>.
/// </summary>
/// <remarks>
/// Retry behaviour follows <c>docs/v2/plan.md</c> decision #143 (per-capability retry policy).
/// Read, HistoryRead, Discover, Subscribe, Probe and AlarmSubscribe auto-retry. <see cref="Write"/>
/// is not retried unless the tag-definition carries <see cref="WriteIdempotentAttribute"/>.
/// Alarm-acknowledge follows write semantics for retries: an alarm-ack is not idempotent at the
/// plant-floor acknowledgement level, even if the OPC UA spec permits re-issue.
/// </remarks>
public enum DriverCapability
{
    /// <summary>Batch <see cref="IReadable.ReadAsync"/>. Retried by default.</summary>
    Read = 0,

    /// <summary>
    /// Batch <see cref="IWritable.WriteAsync"/>. Not retried unless the tag is
    /// <see cref="WriteIdempotentAttribute">idempotent</see>.
    /// </summary>
    Write = 1,

    /// <summary><see cref="ITagDiscovery.DiscoverAsync"/>. Retried by default.</summary>
    Discover = 2,

    /// <summary><see cref="ISubscribable.SubscribeAsync"/> and unsubscribe. Retried by default.</summary>
    Subscribe = 3,

    /// <summary><see cref="IHostConnectivityProbe"/> probe loop. Retried by default.</summary>
    Probe = 4,

    /// <summary><see cref="IAlarmSource.SubscribeAlarmsAsync"/>. Retried by default.</summary>
    AlarmSubscribe = 5,

    /// <summary>
    /// <see cref="IAlarmSource.AcknowledgeAsync"/>. Never retried — an ack is a write-shaped
    /// operation (decision #143).
    /// </summary>
    AlarmAcknowledge = 6,

    /// <summary><see cref="IHistoryProvider"/> reads (Raw/Processed/AtTime/Events). Retried by default.</summary>
    HistoryRead = 7,
}
|
||||
34
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTier.cs
Normal file
34
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTier.cs
Normal file
@@ -0,0 +1,34 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
/// Stability tier assigned to a driver type. The tier selects which cross-cutting runtime
/// protections apply: per-tier retry defaults, memory-tracking thresholds, and whether the
/// driver is supervised out-of-process with process-level recycle.
/// </summary>
/// <remarks>
/// See <c>docs/v2/driver-stability.md</c> §2-4 and <c>docs/v2/plan.md</c> decisions #63-74.
///
/// <list type="bullet">
///   <item><b>A</b> — managed, known-good SDK; low blast radius. In-process. Fast retries.
///     Examples: OPC UA Client (OPCFoundation stack), S7 (S7NetPlus).</item>
///   <item><b>B</b> — native or semi-trusted SDK with an in-process footprint. Examples: Modbus.</item>
///   <item><b>C</b> — unmanaged SDK with COM/STA constraints, leak risk, or other out-of-process
///     requirements. Must run as a separate Host process behind a Proxy with a supervisor that
///     can recycle the process on hard-breach. Example: Galaxy (MXAccess COM).</item>
/// </list>
///
/// <para>Process-kill protections (<c>MemoryRecycle</c>, <c>ScheduledRecycleScheduler</c>) are
/// Tier C only per decisions #73-74 and #145. Killing an in-process Tier A/B driver would also
/// kill every OPC UA session and every co-hosted driver — a blast radius worse than the leak.</para>
/// </remarks>
public enum DriverTier
{
    /// <summary>Managed SDK; runs in-process with a low blast radius.</summary>
    A = 0,

    /// <summary>Native or semi-trusted SDK; runs in-process.</summary>
    B = 1,

    /// <summary>Unmanaged SDK; must run out-of-process behind Proxy + Host + Supervisor.</summary>
    C = 2,
}
|
||||
@@ -69,12 +69,20 @@ public sealed class DriverTypeRegistry
|
||||
/// <param name="DriverConfigJsonSchema">JSON Schema (Draft 2020-12) the driver's <c>DriverConfig</c> column must validate against.</param>
|
||||
/// <param name="DeviceConfigJsonSchema">JSON Schema for <c>DeviceConfig</c> (multi-device drivers); null if the driver has no device layer.</param>
|
||||
/// <param name="TagConfigJsonSchema">JSON Schema for <c>TagConfig</c>; required for every driver since every driver has tags.</param>
|
||||
/// <param name="Tier">
|
||||
/// Stability tier per <c>docs/v2/driver-stability.md</c> §2-4 and <c>docs/v2/plan.md</c>
|
||||
/// decisions #63-74. Drives the shared resilience pipeline defaults
|
||||
/// (<see cref="Tier"/> × capability → <c>CapabilityPolicy</c>), the <c>MemoryTracking</c>
|
||||
/// hybrid-formula constants, and whether process-level <c>MemoryRecycle</c> / scheduled-
|
||||
/// recycle protections apply (Tier C only). Every registered driver type must declare one.
|
||||
/// </param>
|
||||
public sealed record DriverTypeMetadata(
|
||||
string TypeName,
|
||||
NamespaceKindCompatibility AllowedNamespaceKinds,
|
||||
string DriverConfigJsonSchema,
|
||||
string? DeviceConfigJsonSchema,
|
||||
string TagConfigJsonSchema);
|
||||
string TagConfigJsonSchema,
|
||||
DriverTier Tier);
|
||||
|
||||
/// <summary>Bitmask of namespace kinds a driver type may populate. Per decision #111.</summary>
|
||||
[Flags]
|
||||
|
||||
26
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs
Normal file
26
src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs
Normal file
@@ -0,0 +1,26 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Process-level supervisor contract that a Tier C driver's out-of-process topology provides
|
||||
/// (e.g. <c>Driver.Galaxy.Proxy/Supervisor/</c>). Its job is to restart the Host process when a
|
||||
/// hard fault is detected (memory breach, wedge, scheduled recycle window).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #68, #73-74, and #145. Tier A/B drivers do NOT have
|
||||
/// a supervisor because they run in-process — recycling would kill every OPC UA session and
|
||||
/// every co-hosted driver. The Core.Stability layer only invokes this interface for Tier C
|
||||
/// instances after asserting the tier via <see cref="DriverTypeMetadata.Tier"/>.
|
||||
/// </remarks>
|
||||
public interface IDriverSupervisor
|
||||
{
|
||||
/// <summary>Identifier of the driver instance this supervisor governs.</summary>
|
||||
string DriverInstanceId { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Ask the supervisor to recycle (terminate + restart) the Host process. Implementations
|
||||
/// are expected to be idempotent under repeat calls during an in-flight recycle.
|
||||
/// </summary>
|
||||
/// <param name="reason">Human-readable reason — flows into the supervisor's logs.</param>
|
||||
/// <param name="cancellationToken">Cancels the recycle request; an in-flight restart is not interrupted.</param>
/// <returns>
/// A task representing the recycle request. NOTE(review): this contract does not state whether
/// the task completes when the request is accepted or when the Host is running again — confirm
/// against the implementations.
/// </returns>
|
||||
Task RecycleAsync(string reason, CancellationToken cancellationToken);
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
/// <summary>
/// Marks a tag-definition record as safe for automatic retry when
/// <see cref="IWritable.WriteAsync"/> fails. Without this attribute writes are <b>not</b>
/// retried: a write that timed out may already have taken effect at the device, and replaying
/// pulses, alarm acks, counter increments, or recipe-step advances can duplicate irreversible
/// field actions.
/// </summary>
/// <remarks>
/// See <c>docs/v2/plan.md</c> decisions #44, #45, and #143. Intended for tag-definition POCOs
/// (e.g. <c>ModbusTagDefinition</c>, <c>S7TagDefinition</c>, OPC UA client tag rows), either on a
/// property or on the record/type itself. The <c>CapabilityInvoker</c> in
/// <c>ZB.MOM.WW.OtOpcUa.Core.Resilience</c> reads this attribute via reflection once at
/// driver-init time and caches the result, so there is no per-write reflection cost.
/// This is a pure marker: it carries no data.
/// </remarks>
[AttributeUsage(
    AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Property,
    Inherited = true,
    AllowMultiple = false)]
public sealed class WriteIdempotentAttribute : Attribute
{
}
|
||||
@@ -0,0 +1,86 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
/// <summary>
/// Domain-layer health aggregation for Phase 6.1 Stream C. These are pure functions over the
/// driver fleet: they take each driver's <see cref="DriverState"/> and fold them into a single
/// <see cref="ReadinessVerdict"/>, which the endpoint layer translates to an HTTP status code.
/// </summary>
/// <remarks>
/// State matrix per <c>docs/v2/implementation/phase-6-1-resilience-and-observability.md</c>
/// §Stream C.1:
/// <list type="bullet">
///   <item><see cref="DriverState.Unknown"/> / <see cref="DriverState.Initializing"/>
///     → /readyz 503 (not yet ready).</item>
///   <item><see cref="DriverState.Healthy"/> → /readyz 200.</item>
///   <item><see cref="DriverState.Degraded"/> / <see cref="DriverState.Reconnecting"/>
///     → /readyz 200 with flagged driver IDs.</item>
///   <item><see cref="DriverState.Faulted"/> → /readyz 503.</item>
/// </list>
/// Fleet-wide precedence, highest first: any Faulted → Faulted; any Unknown/Initializing →
/// NotReady; any Degraded/Reconnecting → Degraded; otherwise Healthy. An empty fleet is Healthy
/// (nothing to degrade).
/// </remarks>
public static class DriverHealthReport
{
    /// <summary>Compute the fleet-wide readiness verdict from per-driver states.</summary>
    /// <param name="drivers">One snapshot per driver instance; must not be <c>null</c>.</param>
    /// <returns>The highest-precedence verdict present in <paramref name="drivers"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="drivers"/> is <c>null</c>.</exception>
    public static ReadinessVerdict Aggregate(IReadOnlyList<DriverHealthSnapshot> drivers)
    {
        ArgumentNullException.ThrowIfNull(drivers);
        if (drivers.Count == 0) return ReadinessVerdict.Healthy;

        var sawNotReady = false;
        var sawDegraded = false;

        foreach (var snapshot in drivers)
        {
            switch (snapshot.State)
            {
                // Faulted outranks everything, so the scan can stop at the first one.
                case DriverState.Faulted:
                    return ReadinessVerdict.Faulted;

                case DriverState.Unknown:
                case DriverState.Initializing:
                    sawNotReady = true;
                    break;

                // Reconnecting = driver alive but not serving live data; report as Degraded so
                // /readyz stays 200 (the fleet can still serve cached / last-good data) while
                // operators see the affected driver in the body.
                case DriverState.Degraded:
                case DriverState.Reconnecting:
                    sawDegraded = true;
                    break;
            }
        }

        if (sawNotReady) return ReadinessVerdict.NotReady;
        if (sawDegraded) return ReadinessVerdict.Degraded;
        return ReadinessVerdict.Healthy;
    }

    /// <summary>
    /// Translate a <see cref="ReadinessVerdict"/> into the HTTP status the /readyz endpoint
    /// should return per the Stream C.1 state matrix.
    /// </summary>
    /// <param name="verdict">Verdict produced by <see cref="Aggregate"/>.</param>
    /// <returns>
    /// 200 for Healthy and Degraded, 503 for NotReady and Faulted, and 500 for any value outside
    /// the declared enum members.
    /// </returns>
    public static int HttpStatus(ReadinessVerdict verdict)
    {
        switch (verdict)
        {
            case ReadinessVerdict.Healthy:
            case ReadinessVerdict.Degraded:
                return 200;

            case ReadinessVerdict.NotReady:
            case ReadinessVerdict.Faulted:
                return 503;

            default:
                return 500;
        }
    }
}
|
||||
|
||||
/// <summary>Per-driver snapshot fed into <see cref="DriverHealthReport.Aggregate"/>.</summary>
/// <remarks>
/// Only <paramref name="State"/> is consulted by <see cref="DriverHealthReport.Aggregate"/>; the
/// other members are carried alongside it for whoever renders the result.
/// </remarks>
|
||||
/// <param name="DriverInstanceId">Driver instance identifier (from <c>IDriver.DriverInstanceId</c>).</param>
|
||||
/// <param name="State">Current <see cref="DriverState"/> from <c>IDriver.GetHealth</c>.</param>
|
||||
/// <param name="DetailMessage">Optional driver-supplied detail (e.g. "primary PLC unreachable"); defaults to <c>null</c>.</param>
|
||||
public sealed record DriverHealthSnapshot(
|
||||
string DriverInstanceId,
|
||||
DriverState State,
|
||||
string? DetailMessage = null);
|
||||
|
||||
/// <summary>
/// Fleet-wide readiness, derived from the individual driver states by
/// <see cref="DriverHealthReport.Aggregate"/>.
/// </summary>
public enum ReadinessVerdict
{
    /// <summary>Every driver is Healthy, or the fleet is empty.</summary>
    Healthy = 0,

    /// <summary>One or more drivers are Degraded; none are Faulted or NotReady.</summary>
    Degraded = 1,

    /// <summary>One or more drivers are Unknown or Initializing; none are Faulted.</summary>
    NotReady = 2,

    /// <summary>One or more drivers are Faulted.</summary>
    Faulted = 3,
}
|
||||
@@ -0,0 +1,53 @@
|
||||
using Serilog.Context;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
/// <summary>
/// Thin helper over Serilog's <see cref="LogContext"/> that pushes the structured properties
/// every capability call should carry: DriverInstanceId, DriverType, CapabilityName and
/// CorrelationId. Wrap the call-site body in a <c>using</c> block; <c>Log.Information</c> /
/// <c>Log.Warning</c> calls made inside it pick the properties up automatically via the
/// Serilog enricher chain.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/implementation/phase-6-1-resilience-and-observability.md</c> §Stream C.2.
/// The correlation ID should be the OPC UA <c>RequestHeader.RequestHandle</c> when a request is
/// in flight, and a short random GUID otherwise. Callers pass whichever they have.
/// </remarks>
public static class LogContextEnricher
{
    /// <summary>Push the capability-call property set. Dispose the returned scope to pop it.</summary>
    /// <param name="driverInstanceId">Value for the <c>DriverInstanceId</c> property; must not be null or whitespace.</param>
    /// <param name="driverType">Value for the <c>DriverType</c> property; must not be null or whitespace.</param>
    /// <param name="capability">Capability being invoked; logged by its enum member name.</param>
    /// <param name="correlationId">Value for the <c>CorrelationId</c> property; must not be null or whitespace.</param>
    /// <returns>A scope that removes all four properties when disposed.</returns>
    /// <exception cref="ArgumentException">Any of the string arguments is null, empty, or whitespace.</exception>
    public static IDisposable Push(string driverInstanceId, string driverType, DriverCapability capability, string correlationId)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(driverInstanceId);
        ArgumentException.ThrowIfNullOrWhiteSpace(driverType);
        ArgumentException.ThrowIfNullOrWhiteSpace(correlationId);

        // Array elements are evaluated in order, so the push order is fixed.
        IDisposable[] pushed =
        {
            LogContext.PushProperty("DriverInstanceId", driverInstanceId),
            LogContext.PushProperty("DriverType", driverType),
            LogContext.PushProperty("CapabilityName", capability.ToString()),
            LogContext.PushProperty("CorrelationId", correlationId),
        };

        return new CompositeScope(pushed);
    }

    /// <summary>
    /// Produce a short correlation ID for calls that have no OPC UA RequestHandle: the first
    /// 12 hex characters of a fresh GUID — long enough for log correlation, short enough to
    /// scan visually.
    /// </summary>
    public static string NewCorrelationId()
    {
        var hex = Guid.NewGuid().ToString("N");
        return hex.Substring(0, 12);
    }

    /// <summary>Bundles several pushed-property scopes into a single disposable.</summary>
    private sealed class CompositeScope : IDisposable
    {
        private readonly IDisposable[] _inner;

        public CompositeScope(params IDisposable[] inner)
        {
            _inner = inner;
        }

        public void Dispose()
        {
            // Pop last-pushed first, matching Serilog's stack semantics.
            var index = _inner.Length;
            while (index-- > 0)
                _inner[index].Dispose();
        }
    }
}
|
||||
120
src/ZB.MOM.WW.OtOpcUa.Core/Resilience/CapabilityInvoker.cs
Normal file
120
src/ZB.MOM.WW.OtOpcUa.Core/Resilience/CapabilityInvoker.cs
Normal file
@@ -0,0 +1,120 @@
|
||||
using Polly;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
/// Executes driver-capability calls through a shared Polly pipeline. One invoker per
/// <c>(DriverInstance, IDriver)</c> pair; the underlying <see cref="DriverResiliencePipelineBuilder"/>
/// is process-singleton so all invokers share its cache.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decisions #143-144 and Phase 6.1 Stream A.3. The server's dispatch
/// layer routes every capability call (<c>IReadable.ReadAsync</c>, <c>IWritable.WriteAsync</c>,
/// <c>ITagDiscovery.DiscoverAsync</c>, <c>ISubscribable.SubscribeAsync/UnsubscribeAsync</c>,
/// <c>IHostConnectivityProbe</c> probe loop, <c>IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync</c>,
/// and all four <c>IHistoryProvider</c> reads) through this invoker.
/// <para>
/// Every execution is wrapped in a <see cref="LogContextEnricher.Push"/> scope carrying the
/// driver instance ID, driver type, capability and a freshly generated correlation ID.
/// </para>
/// </remarks>
public sealed class CapabilityInvoker
{
    private readonly DriverResiliencePipelineBuilder _builder;
    private readonly string _driverInstanceId;
    private readonly string _driverType;
    private readonly Func<DriverResilienceOptions> _optionsAccessor;

    /// <summary>
    /// Construct an invoker for one driver instance.
    /// </summary>
    /// <param name="builder">Shared, process-singleton pipeline builder.</param>
    /// <param name="driverInstanceId">
    /// The <c>DriverInstance.Id</c> column value. Not validated here; a null or blank value
    /// surfaces as an <see cref="ArgumentException"/> from <see cref="LogContextEnricher.Push"/>
    /// on the first execute call.
    /// </param>
    /// <param name="optionsAccessor">
    /// Snapshot accessor for the current resilience options. Invoked per call so Admin-edit +
    /// pipeline-invalidate can take effect without restarting the invoker.
    /// </param>
    /// <param name="driverType">Driver type name for structured-log enrichment (e.g. <c>"Modbus"</c>).</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="builder"/> or <paramref name="optionsAccessor"/> is <c>null</c>.
    /// </exception>
    public CapabilityInvoker(
        DriverResiliencePipelineBuilder builder,
        string driverInstanceId,
        Func<DriverResilienceOptions> optionsAccessor,
        string driverType = "Unknown")
    {
        ArgumentNullException.ThrowIfNull(builder);
        ArgumentNullException.ThrowIfNull(optionsAccessor);

        _builder = builder;
        _driverInstanceId = driverInstanceId;
        _driverType = driverType;
        _optionsAccessor = optionsAccessor;
    }

    /// <summary>Execute a capability call returning a value, honoring the per-capability pipeline.</summary>
    /// <typeparam name="TResult">Return type of the underlying driver call.</typeparam>
    /// <param name="capability">Capability whose policy selects the pipeline.</param>
    /// <param name="hostName">Host key passed to the pipeline builder together with the driver instance ID.</param>
    /// <param name="callSite">The driver call to run inside the pipeline.</param>
    /// <param name="cancellationToken">Token flowed into the pipeline and on to <paramref name="callSite"/>.</param>
    /// <returns>Whatever <paramref name="callSite"/> returns once the pipeline lets it through.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="callSite"/> is <c>null</c>.</exception>
    public async ValueTask<TResult> ExecuteAsync<TResult>(
        DriverCapability capability,
        string hostName,
        Func<CancellationToken, ValueTask<TResult>> callSite,
        CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(callSite);

        var pipeline = ResolvePipeline(capability, hostName);
        using (LogContextEnricher.Push(_driverInstanceId, _driverType, capability, LogContextEnricher.NewCorrelationId()))
        {
            return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>Execute a void-returning capability call, honoring the per-capability pipeline.</summary>
    /// <param name="capability">Capability whose policy selects the pipeline.</param>
    /// <param name="hostName">Host key passed to the pipeline builder together with the driver instance ID.</param>
    /// <param name="callSite">The driver call to run inside the pipeline.</param>
    /// <param name="cancellationToken">Token flowed into the pipeline and on to <paramref name="callSite"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="callSite"/> is <c>null</c>.</exception>
    public async ValueTask ExecuteAsync(
        DriverCapability capability,
        string hostName,
        Func<CancellationToken, ValueTask> callSite,
        CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(callSite);

        var pipeline = ResolvePipeline(capability, hostName);
        using (LogContextEnricher.Push(_driverInstanceId, _driverType, capability, LogContextEnricher.NewCorrelationId()))
        {
            await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Execute a <see cref="DriverCapability.Write"/> call honoring <see cref="WriteIdempotentAttribute"/>
    /// semantics — if <paramref name="isIdempotent"/> is <c>false</c>, retries are disabled regardless
    /// of the tag-level configuration (the pipeline for a non-idempotent write never retries per
    /// decisions #44-45). If <c>true</c>, the call runs through the capability's pipeline which may
    /// retry when the tier configuration permits.
    /// </summary>
    /// <typeparam name="TResult">Return type of the underlying driver write.</typeparam>
    /// <param name="hostName">Host key passed to the pipeline builder together with the driver instance ID.</param>
    /// <param name="isIdempotent">Whether replaying this write is safe.</param>
    /// <param name="callSite">The driver write to run inside the pipeline.</param>
    /// <param name="cancellationToken">Token flowed into the pipeline and on to <paramref name="callSite"/>.</param>
    /// <returns>Whatever <paramref name="callSite"/> returns once the pipeline lets it through.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="callSite"/> is <c>null</c>.</exception>
    public async ValueTask<TResult> ExecuteWriteAsync<TResult>(
        string hostName,
        bool isIdempotent,
        Func<CancellationToken, ValueTask<TResult>> callSite,
        CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(callSite);

        if (isIdempotent)
            return await ExecuteAsync(DriverCapability.Write, hostName, callSite, cancellationToken).ConfigureAwait(false);

        // Take exactly ONE options snapshot. The accessor is re-evaluated on every invocation, so
        // calling it twice could combine the tier/bulkhead values of one snapshot with the Write
        // policy of another if an Admin edit lands in between.
        var options = _optionsAccessor();
        var noRetryOptions = options with
        {
            CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
            {
                [DriverCapability.Write] = options.Resolve(DriverCapability.Write) with { RetryCount = 0 },
            },
        };

        // The suffixed host key keeps the no-retry pipeline separate from the regular Write pipeline.
        // NOTE(review): this presumably also gives it its own breaker/bulkhead state inside the
        // builder — confirm against DriverResiliencePipelineBuilder.
        var pipeline = _builder.GetOrCreate(_driverInstanceId, $"{hostName}::non-idempotent", DriverCapability.Write, noRetryOptions);
        using (LogContextEnricher.Push(_driverInstanceId, _driverType, DriverCapability.Write, LogContextEnricher.NewCorrelationId()))
        {
            return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>Fetch the pipeline for this driver instance, host and capability using the current options snapshot.</summary>
    private ResiliencePipeline ResolvePipeline(DriverCapability capability, string hostName) =>
        _builder.GetOrCreate(_driverInstanceId, hostName, capability, _optionsAccessor());
}
|
||||
@@ -0,0 +1,96 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Per-tier × per-capability resilience policy configuration for a driver instance.
|
||||
/// Bound from <c>DriverInstance.ResilienceConfig</c> JSON (nullable column; null = tier defaults).
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #143 and #144.
|
||||
/// </summary>
|
||||
public sealed record DriverResilienceOptions
{
    /// <summary>Registered tier of the owning driver type; selects which default table applies.</summary>
    public required DriverTier Tier { get; init; }

    /// <summary>
    /// Explicit per-capability policies. A capability missing from this map is resolved from
    /// <see cref="GetTierDefaults(DriverTier)"/> using <see cref="Tier"/>. Empty by default.
    /// </summary>
    public IReadOnlyDictionary<DriverCapability, CapabilityPolicy> CapabilityPolicies { get; init; } =
        new Dictionary<DriverCapability, CapabilityPolicy>();

    /// <summary>Bulkhead size (maximum concurrent in-flight calls), shared by every capability. Default 32.</summary>
    public int BulkheadMaxConcurrent { get; init; } = 32;

    /// <summary>
    /// Bulkhead queue depth. Default 64. A value of zero means no queueing: overflow is meant to
    /// fail fast with <c>BulkheadRejectedException</c>.
    /// </summary>
    public int BulkheadMaxQueue { get; init; } = 64;

    /// <summary>
    /// Returns the policy that applies to <paramref name="capability"/>: the entry from
    /// <see cref="CapabilityPolicies"/> when one exists, otherwise the tier default.
    /// </summary>
    /// <param name="capability">Capability whose effective policy is wanted.</param>
    /// <returns>The configured override, or the default for <see cref="Tier"/>.</returns>
    public CapabilityPolicy Resolve(DriverCapability capability) =>
        CapabilityPolicies.TryGetValue(capability, out var configured)
            ? configured
            : GetTierDefaults(Tier)[capability];

    /// <summary>
    /// Builds the default policy table for <paramref name="tier"/> (decisions #143-144, Phase 6.1
    /// Stream A.2). In every tier <see cref="DriverCapability.Write"/> and
    /// <see cref="DriverCapability.AlarmAcknowledge"/> carry a retry count of zero, and every
    /// Tier C entry carries a breaker threshold of zero.
    /// </summary>
    /// <param name="tier">Tier whose defaults are requested.</param>
    /// <returns>A freshly allocated map with one entry per capability listed below.</returns>
    /// <exception cref="ArgumentOutOfRangeException">No table is defined for <paramref name="tier"/>.</exception>
    public static IReadOnlyDictionary<DriverCapability, CapabilityPolicy> GetTierDefaults(DriverTier tier)
    {
        switch (tier)
        {
            case DriverTier.A:
                return new Dictionary<DriverCapability, CapabilityPolicy>
                {
                    { DriverCapability.Read, new CapabilityPolicy(2, 3, 5) },
                    { DriverCapability.Write, new CapabilityPolicy(2, 0, 5) },
                    { DriverCapability.Discover, new CapabilityPolicy(30, 2, 3) },
                    { DriverCapability.Subscribe, new CapabilityPolicy(5, 3, 5) },
                    { DriverCapability.Probe, new CapabilityPolicy(2, 3, 5) },
                    { DriverCapability.AlarmSubscribe, new CapabilityPolicy(5, 3, 5) },
                    { DriverCapability.AlarmAcknowledge, new CapabilityPolicy(5, 0, 5) },
                    { DriverCapability.HistoryRead, new CapabilityPolicy(30, 2, 5) },
                };

            case DriverTier.B:
                return new Dictionary<DriverCapability, CapabilityPolicy>
                {
                    { DriverCapability.Read, new CapabilityPolicy(4, 3, 5) },
                    { DriverCapability.Write, new CapabilityPolicy(4, 0, 5) },
                    { DriverCapability.Discover, new CapabilityPolicy(60, 2, 3) },
                    { DriverCapability.Subscribe, new CapabilityPolicy(8, 3, 5) },
                    { DriverCapability.Probe, new CapabilityPolicy(4, 3, 5) },
                    { DriverCapability.AlarmSubscribe, new CapabilityPolicy(8, 3, 5) },
                    { DriverCapability.AlarmAcknowledge, new CapabilityPolicy(8, 0, 5) },
                    { DriverCapability.HistoryRead, new CapabilityPolicy(60, 2, 5) },
                };

            case DriverTier.C:
                return new Dictionary<DriverCapability, CapabilityPolicy>
                {
                    { DriverCapability.Read, new CapabilityPolicy(10, 1, 0) },
                    { DriverCapability.Write, new CapabilityPolicy(10, 0, 0) },
                    { DriverCapability.Discover, new CapabilityPolicy(120, 1, 0) },
                    { DriverCapability.Subscribe, new CapabilityPolicy(15, 1, 0) },
                    { DriverCapability.Probe, new CapabilityPolicy(10, 1, 0) },
                    { DriverCapability.AlarmSubscribe, new CapabilityPolicy(15, 1, 0) },
                    { DriverCapability.AlarmAcknowledge, new CapabilityPolicy(15, 0, 0) },
                    { DriverCapability.HistoryRead, new CapabilityPolicy(120, 1, 0) },
                };

            default:
                throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No default policy table defined for tier {tier}.");
        }
    }
}
|
||||
|
||||
/// <summary>Resilience settings applied to a single capability of a single driver instance.</summary>
/// <param name="TimeoutSeconds">Timeout, in seconds, for a call (wraps the inner Polly execution).</param>
/// <param name="RetryCount">How many retries follow the first failure; zero disables retrying.</param>
/// <param name="BreakerFailureThreshold">
/// Failure count at which the circuit breaker opens; zero disables the breaker
/// (Tier C relies on the supervisor's process-level breaker instead, per decision #68).
/// </param>
public sealed record CapabilityPolicy(
    int TimeoutSeconds,
    int RetryCount,
    int BreakerFailureThreshold);
|
||||
@@ -0,0 +1,118 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Polly;
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Retry;
|
||||
using Polly.Timeout;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
/// Builds and caches Polly resilience pipelines keyed on
/// <c>(DriverInstanceId, HostName, DriverCapability)</c>. Because the host is part of the key,
/// one dead PLC behind a multi-device driver cannot open the circuit breaker for healthy
/// sibling hosts.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decision #144 (per-device isolation). Composition from outside-in:
/// <b>Timeout → Retry (when the policy's retry count is positive) → Circuit Breaker (when the
/// policy's breaker threshold is positive)</b>. No bulkhead stage is added by this class; the
/// bulkhead settings on <see cref="DriverResilienceOptions"/> are not read here.
///
/// <para>Pipelines are cached in a <see cref="ConcurrentDictionary{TKey,TValue}"/>; the first
/// request for a key pays for one <see cref="ResiliencePipelineBuilder"/>.Build and later
/// requests are dictionary lookups.</para>
/// </remarks>
public sealed class DriverResiliencePipelineBuilder
{
    /// <summary>
    /// Smallest <c>MinimumThroughput</c> Polly's circuit breaker accepts. Lower values fail
    /// options validation, so configured thresholds below this are raised to it.
    /// </summary>
    private const int MinimumBreakerThroughput = 2;

    private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
    private readonly TimeProvider _timeProvider;

    /// <summary>Construct with the supplied clock, or <see cref="TimeProvider.System"/> when none is given.</summary>
    /// <param name="timeProvider">Clock handed to every pipeline built by this instance.</param>
    public DriverResiliencePipelineBuilder(TimeProvider? timeProvider = null)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Get or build the pipeline for a given <c>(driver instance, host, capability)</c> triple.
    /// The pipeline is built from <paramref name="options"/> only on the first request for a key;
    /// later requests return the cached instance even if different options are passed, until
    /// <see cref="Invalidate"/> drops it.
    /// </summary>
    /// <param name="driverInstanceId">DriverInstance primary key — opaque to this layer.</param>
    /// <param name="hostName">
    /// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
    /// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
    /// specific PLC so one dead PLC doesn't poison healthy siblings.
    /// </param>
    /// <param name="capability">Which capability surface is being called.</param>
    /// <param name="options">Per-driver-instance options (tier + per-capability overrides).</param>
    /// <returns>The cached or newly built pipeline for the key.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="hostName"/> is null, empty or whitespace.</exception>
    public ResiliencePipeline GetOrCreate(
        string driverInstanceId,
        string hostName,
        DriverCapability capability,
        DriverResilienceOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);
        ArgumentException.ThrowIfNullOrWhiteSpace(hostName);

        var key = new PipelineKey(driverInstanceId, hostName, capability);

        // Static lambda + explicit state argument avoids allocating a closure per lookup.
        return _pipelines.GetOrAdd(
            key,
            static (_, state) => Build(state.capability, state.options, state.timeProvider),
            (capability, options, timeProvider: _timeProvider));
    }

    /// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
    /// <param name="driverInstanceId">Instance whose cached pipelines are removed.</param>
    /// <returns>Number of cache entries this call removed.</returns>
    public int Invalidate(string driverInstanceId)
    {
        var removed = 0;
        foreach (var key in _pipelines.Keys)
        {
            if (key.DriverInstanceId == driverInstanceId && _pipelines.TryRemove(key, out _))
            {
                removed++;
            }
        }

        return removed;
    }

    /// <summary>Snapshot of the current number of cached pipelines. For diagnostics only.</summary>
    public int CachedPipelineCount => _pipelines.Count;

    /// <summary>
    /// Builds one pipeline from the effective policy for <paramref name="capability"/>:
    /// a timeout, then an optional retry, then an optional circuit breaker.
    /// </summary>
    private static ResiliencePipeline Build(
        DriverCapability capability,
        DriverResilienceOptions options,
        TimeProvider timeProvider)
    {
        var policy = options.Resolve(capability);
        var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };

        // Added first, so the timeout is the outermost strategy and bounds everything inside it.
        builder.AddTimeout(new TimeoutStrategyOptions
        {
            Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
        });

        if (policy.RetryCount > 0)
        {
            builder.AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = policy.RetryCount,
                BackoffType = DelayBackoffType.Exponential,
                UseJitter = true,
                Delay = TimeSpan.FromMilliseconds(100),
                MaxDelay = TimeSpan.FromSeconds(5),
                // Cancellation is never retried.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
            });
        }

        if (policy.BreakerFailureThreshold > 0)
        {
            builder.AddCircuitBreaker(new CircuitBreakerStrategyOptions
            {
                // Ratio 1.0 + MinimumThroughput: the breaker opens only when at least
                // MinimumThroughput calls were sampled in the window and all of them failed.
                FailureRatio = 1.0,
                // A configured threshold of 1 would fail Polly's options validation, so it is
                // raised to the smallest value Polly accepts instead of throwing at build time.
                MinimumThroughput = Math.Max(MinimumBreakerThroughput, policy.BreakerFailureThreshold),
                SamplingDuration = TimeSpan.FromSeconds(30),
                BreakDuration = TimeSpan.FromSeconds(15),
                // Cancellation does not count as a failure.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
            });
        }

        return builder.Build();
    }

    /// <summary>Cache key; value equality over all three components.</summary>
    private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
}
|
||||
65
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs
Normal file
65
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs
Normal file
@@ -0,0 +1,65 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
/// Tier C only process-recycle companion to <see cref="MemoryTracking"/>. On a
/// <see cref="MemoryTrackingAction.HardBreach"/> signal, invokes the supplied
/// <see cref="IDriverSupervisor"/> to restart the out-of-process Host.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decisions #74 and #145. Tier A/B hard-breach on an in-process
/// driver would kill every OPC UA session and every co-hosted driver, so for Tier A/B this
/// class logs a <b>promotion-to-Tier-C recommendation</b> and does NOT invoke any supervisor.
/// A future tier-migration workflow acts on the recommendation.
/// </remarks>
public sealed class MemoryRecycle
{
    private readonly DriverTier _tier;
    private readonly IDriverSupervisor? _supervisor;
    private readonly ILogger<MemoryRecycle> _logger;

    /// <summary>Create the handler for one driver.</summary>
    /// <param name="tier">Tier of the driver being watched.</param>
    /// <param name="supervisor">Supervisor used to recycle a Tier C host; may be null when none is wired.</param>
    /// <param name="logger">Diagnostic sink.</param>
    public MemoryRecycle(DriverTier tier, IDriverSupervisor? supervisor, ILogger<MemoryRecycle> logger)
    {
        _tier = tier;
        _supervisor = supervisor;
        _logger = logger;
    }

    /// <summary>
    /// Handle a <see cref="MemoryTracking"/> classification for the driver. For Tier C with a
    /// wired supervisor, <c>HardBreach</c> triggers <see cref="IDriverSupervisor.RecycleAsync"/>.
    /// Every other combination only logs (soft breaches, Tier C hard breaches without a
    /// supervisor, Tier A/B hard breaches) or does nothing (all remaining actions).
    /// </summary>
    /// <param name="action">Classification produced for the latest sample.</param>
    /// <param name="footprintBytes">Footprint of that sample, used in log output and the recycle reason.</param>
    /// <param name="cancellationToken">Forwarded to the supervisor's recycle call.</param>
    /// <returns>True when a recycle was requested; false otherwise.</returns>
    public async Task<bool> HandleAsync(MemoryTrackingAction action, long footprintBytes, CancellationToken cancellationToken)
    {
        switch (action)
        {
            case MemoryTrackingAction.SoftBreach:
                _logger.LogWarning(
                    "Memory soft-breach on driver {DriverId}: footprint={Footprint:N0} bytes, tier={Tier}. Surfaced to Admin; no action.",
                    _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes, _tier);
                return false;

            case MemoryTrackingAction.HardBreach when _tier == DriverTier.C && _supervisor is not null:
                _logger.LogError(
                    "Memory hard-breach on Tier C driver {DriverId}: footprint={Footprint:N0} bytes. Requesting supervisor recycle.",
                    _supervisor.DriverInstanceId, footprintBytes);
                await _supervisor.RecycleAsync($"Memory hard-breach: {footprintBytes} bytes", cancellationToken).ConfigureAwait(false);
                return true;

            case MemoryTrackingAction.HardBreach when _tier == DriverTier.C:
                // Only reachable with a null supervisor (the previous case took the wired one).
                // Without this branch the Tier A/B message below would recommend promoting a
                // Tier C driver to Tier C.
                _logger.LogError(
                    "Memory hard-breach on Tier C driver {DriverId}: footprint={Footprint:N0} bytes. " +
                    "No supervisor is wired, so no recycle was requested.",
                    "(unknown)", footprintBytes);
                return false;

            case MemoryTrackingAction.HardBreach:
                _logger.LogError(
                    "Memory hard-breach on Tier {Tier} in-process driver {DriverId}: footprint={Footprint:N0} bytes. " +
                    "Recommending promotion to Tier C; NOT auto-killing (decisions #74, #145).",
                    _tier, _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes);
                return false;

            default:
                return false;
        }
    }
}
|
||||
136
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs
Normal file
136
src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs
Normal file
@@ -0,0 +1,136 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
/// Tier-agnostic memory-footprint tracker. Captures a <b>baseline</b> from the samples
/// submitted during the warm-up window, then classifies each subsequent sample against a
/// hybrid soft/hard threshold per <c>docs/v2/plan.md</c> decision #146 —
/// <c>soft = max(multiplier × baseline, baseline + floor)</c>, <c>hard = 2 × soft</c>.
/// </summary>
/// <remarks>
/// <para>Per decision #145, this tracker <b>never kills a process</b>; it only returns a
/// classification. The matching process-level recycle protection lives in a separate
/// <c>MemoryRecycle</c> that activates for Tier C drivers only.</para>
///
/// <para>Baseline capture: the tracker starts in <see cref="TrackingPhase.WarmingUp"/>. The
/// window is measured from the timestamp of the first sample and lasts
/// <see cref="BaselineWindow"/> (default 5 min). The first sample at or after the end of the
/// window is included in the warm-up set, the baseline becomes the median of that set, and
/// that same sample is the first one classified. Before that point every call returns
/// <see cref="MemoryTrackingAction.Warming"/>.</para>
/// </remarks>
public sealed class MemoryTracking
{
    private readonly DriverTier _tier;
    private readonly TimeSpan _baselineWindow;
    private readonly List<long> _warmupSamples = [];
    private long _baselineBytes;
    private TrackingPhase _phase = TrackingPhase.WarmingUp;
    private DateTime? _warmupStartUtc;

    /// <summary>Tier-default multiplier/floor constants per decision #146.</summary>
    /// <param name="tier">Tier whose constants are requested.</param>
    /// <exception cref="ArgumentOutOfRangeException">No constants are defined for <paramref name="tier"/>.</exception>
    public static (int Multiplier, long FloorBytes) GetTierConstants(DriverTier tier) => tier switch
    {
        DriverTier.A => (Multiplier: 3, FloorBytes: 50L * 1024 * 1024),
        DriverTier.B => (Multiplier: 3, FloorBytes: 100L * 1024 * 1024),
        DriverTier.C => (Multiplier: 2, FloorBytes: 500L * 1024 * 1024),
        _ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No memory-tracking constants defined for tier {tier}."),
    };

    /// <summary>Window over which warm-up samples are collected to compute the baseline.</summary>
    public TimeSpan BaselineWindow => _baselineWindow;

    /// <summary>Current phase: <see cref="TrackingPhase.WarmingUp"/> or <see cref="TrackingPhase.Steady"/>.</summary>
    public TrackingPhase Phase => _phase;

    /// <summary>Captured baseline; 0 until warmup completes.</summary>
    public long BaselineBytes => _baselineBytes;

    /// <summary>Effective soft threshold (zero while warming up).</summary>
    public long SoftThresholdBytes => IsSteady ? ComputeSoft(_tier, _baselineBytes) : 0;

    /// <summary>Effective hard threshold = 2 × soft (zero while warming up).</summary>
    public long HardThresholdBytes => IsSteady ? ComputeSoft(_tier, _baselineBytes) * 2 : 0;

    // Thresholds key off the phase, not off "baseline == 0": a captured baseline of exactly 0
    // must still yield floor-based thresholds rather than thresholds of 0, which would
    // classify every sample as a hard breach.
    private bool IsSteady => _phase == TrackingPhase.Steady;

    /// <summary>Create a tracker for a driver of the given tier.</summary>
    /// <param name="tier">Tier that selects the multiplier/floor constants.</param>
    /// <param name="baselineWindow">Warm-up window; 5 minutes when null.</param>
    public MemoryTracking(DriverTier tier, TimeSpan? baselineWindow = null)
    {
        _tier = tier;
        _baselineWindow = baselineWindow ?? TimeSpan.FromMinutes(5);
    }

    /// <summary>
    /// Submit a memory-footprint sample and get back its classification. While warming up the
    /// sample is recorded and <see cref="MemoryTrackingAction.Warming"/> is returned; the
    /// first sample at or after the end of the window captures the baseline and is then
    /// classified like every later sample.
    /// </summary>
    /// <param name="footprintBytes">Measured footprint for this sample.</param>
    /// <param name="utcNow">Timestamp of the sample; the first one anchors the warm-up window.</param>
    /// <returns>The classification of this sample.</returns>
    public MemoryTrackingAction Sample(long footprintBytes, DateTime utcNow)
    {
        if (_phase == TrackingPhase.WarmingUp)
        {
            _warmupStartUtc ??= utcNow;
            _warmupSamples.Add(footprintBytes);

            if (utcNow - _warmupStartUtc.Value < _baselineWindow)
            {
                return MemoryTrackingAction.Warming;
            }

            _baselineBytes = ComputeMedian(_warmupSamples);
            _phase = TrackingPhase.Steady;

            // The samples are never read again once the baseline exists; release them so a
            // memory tracker does not itself hold on to the warm-up buffer indefinitely.
            _warmupSamples.Clear();
            _warmupSamples.TrimExcess();
        }

        if (footprintBytes >= HardThresholdBytes) return MemoryTrackingAction.HardBreach;
        if (footprintBytes >= SoftThresholdBytes) return MemoryTrackingAction.SoftBreach;
        return MemoryTrackingAction.None;
    }

    /// <summary>soft = max(multiplier × baseline, baseline + floor) using the tier's constants.</summary>
    private static long ComputeSoft(DriverTier tier, long baseline)
    {
        var (multiplier, floor) = GetTierConstants(tier);
        return Math.Max(multiplier * baseline, baseline + floor);
    }

    /// <summary>Median of a non-empty sample list; for an even count, the integer mean of the two middle values.</summary>
    private static long ComputeMedian(List<long> samples)
    {
        var sorted = samples.Order().ToArray();
        var mid = sorted.Length / 2;
        return sorted.Length % 2 == 1
            ? sorted[mid]
            : (sorted[mid - 1] + sorted[mid]) / 2;
    }
}
|
||||
|
||||
/// <summary>Lifecycle phase reported by a <see cref="MemoryTracking"/> instance.</summary>
public enum TrackingPhase
{
    /// <summary>Warm-up samples are still being gathered; no baseline exists yet.</summary>
    WarmingUp = 0,

    /// <summary>The baseline has been captured and samples are compared with the soft/hard thresholds.</summary>
    Steady = 1,
}
|
||||
|
||||
/// <summary>Per-sample classification produced by the memory tracker.</summary>
public enum MemoryTrackingAction
{
    /// <summary>No baseline yet: the sample was recorded and not compared with any threshold.</summary>
    Warming = 0,

    /// <summary>The sample is under the soft threshold.</summary>
    None = 1,

    /// <summary>The sample is at or above the soft threshold but under the hard one — log + surface, no action.</summary>
    SoftBreach = 2,

    /// <summary>
    /// The sample is at or above the hard threshold. Log + surface, and for Tier C only
    /// (through <c>MemoryRecycle</c>) ask the driver supervisor to recycle the process.
    /// A Tier A/B breach never invokes any kill path, per decisions #145 and #74.
    /// </summary>
    HardBreach = 3,
}
|
||||
@@ -0,0 +1,86 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
/// Opt-in periodic recycle for Tier C drivers, per <c>docs/v2/plan.md</c> decision #67.
/// The caller advances the scheduler through <see cref="TickAsync"/>; when the configured
/// interval has elapsed the scheduler asks the supplied <see cref="IDriverSupervisor"/> to
/// recycle the Host.
/// </summary>
/// <remarks>
/// Tier A/B drivers MUST NOT use this class — a scheduled recycle of an in-process driver would
/// kill every OPC UA session and every co-hosted driver. The constructor rejects any tier other
/// than C so that misuse fails at construction.
///
/// <para>The class owns no timer or thread: time only moves when the caller passes a timestamp
/// to <see cref="TickAsync"/>, which keeps it independent of wall-clock time in tests.</para>
/// </remarks>
public sealed class ScheduledRecycleScheduler
{
    private readonly TimeSpan _recycleInterval;
    private readonly IDriverSupervisor _supervisor;
    private readonly ILogger<ScheduledRecycleScheduler> _logger;
    private DateTime _nextRecycleUtc;

    /// <summary>
    /// Create a scheduler for a Tier C driver.
    /// </summary>
    /// <param name="tier">Driver tier; must be <see cref="DriverTier.C"/>.</param>
    /// <param name="recycleInterval">Time between recycles (e.g. 7 days); must be positive.</param>
    /// <param name="startUtc">Anchor time; the first recycle is due at <paramref name="startUtc"/> + <paramref name="recycleInterval"/>.</param>
    /// <param name="supervisor">Supervisor that performs the actual recycle.</param>
    /// <param name="logger">Diagnostic sink.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="tier"/> is not C, or <paramref name="recycleInterval"/> is zero or negative.
    /// </exception>
    public ScheduledRecycleScheduler(
        DriverTier tier,
        TimeSpan recycleInterval,
        DateTime startUtc,
        IDriverSupervisor supervisor,
        ILogger<ScheduledRecycleScheduler> logger)
    {
        if (tier != DriverTier.C)
        {
            var message =
                $"ScheduledRecycleScheduler is Tier C only (got {tier}). " +
                "In-process drivers must not use scheduled recycle; see decisions #74 and #145.";
            throw new ArgumentException(message, nameof(tier));
        }

        if (recycleInterval <= TimeSpan.Zero)
        {
            throw new ArgumentException("RecycleInterval must be positive.", nameof(recycleInterval));
        }

        _supervisor = supervisor;
        _logger = logger;
        _recycleInterval = recycleInterval;
        _nextRecycleUtc = startUtc.Add(recycleInterval);
    }

    /// <summary>UTC time at which the next recycle becomes due; moves forward by <see cref="RecycleInterval"/> after each recycle.</summary>
    public DateTime NextRecycleUtc
    {
        get { return _nextRecycleUtc; }
    }

    /// <summary>The interval supplied at construction.</summary>
    public TimeSpan RecycleInterval
    {
        get { return _recycleInterval; }
    }

    /// <summary>
    /// Advance the scheduler to <paramref name="utcNow"/>. When that time is at or after
    /// <see cref="NextRecycleUtc"/>, one recycle is requested from the supervisor and, once it
    /// completes, <see cref="NextRecycleUtc"/> moves forward by exactly one interval.
    /// </summary>
    /// <param name="utcNow">Current time as seen by the caller.</param>
    /// <param name="cancellationToken">Forwarded to the supervisor's recycle call.</param>
    /// <returns>True when a recycle was requested; false when none was due.</returns>
    public async Task<bool> TickAsync(DateTime utcNow, CancellationToken cancellationToken)
    {
        var isDue = utcNow >= _nextRecycleUtc;
        if (!isDue)
        {
            return false;
        }

        _logger.LogInformation(
            "Scheduled recycle due for Tier C driver {DriverId} at {Now:o}; advancing next to {Next:o}.",
            _supervisor.DriverInstanceId, utcNow, _nextRecycleUtc + _recycleInterval);

        await _supervisor.RecycleAsync("Scheduled periodic recycle", cancellationToken).ConfigureAwait(false);

        // Advanced only after the recycle call returns; if it throws, the schedule is unchanged.
        _nextRecycleUtc = _nextRecycleUtc.Add(_recycleInterval);
        return true;
    }

    /// <summary>Ask the supervisor for a recycle right away, leaving <see cref="NextRecycleUtc"/> untouched.</summary>
    /// <param name="reason">Reason text passed to the supervisor.</param>
    /// <param name="cancellationToken">Forwarded to the supervisor's recycle call.</param>
    public Task RequestRecycleNowAsync(string reason, CancellationToken cancellationToken)
    {
        return _supervisor.RecycleAsync(reason, cancellationToken);
    }
}
|
||||
81
src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs
Normal file
81
src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs
Normal file
@@ -0,0 +1,81 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
/// Demand-aware driver-wedge detector per <c>docs/v2/plan.md</c> decision #147.
/// A driver is reported <see cref="WedgeVerdict.Faulted"/> only when BOTH hold:
/// (a) the demand signal shows pending work, AND (b) the time since the last reported
/// progress exceeds <see cref="Threshold"/>. A driver with no pending work is never faulted.
/// </summary>
/// <remarks>
/// <para>The pending-work signal comes from the caller as a <see cref="DemandSignal"/>.
/// The detector stores nothing except its threshold; the caller owns the progress clock.</para>
///
/// <para>Default threshold per plan: <c>5 × PublishingInterval</c>, with a minimum of 60 s.
/// The concrete value is chosen per instance by the caller.</para>
/// </remarks>
public sealed class WedgeDetector
{
    /// <summary>Effective wedge-detection threshold; never less than 60 s.</summary>
    public TimeSpan Threshold { get; }

    /// <summary>Create a detector; a <paramref name="threshold"/> under 60 s is raised to 60 s.</summary>
    /// <param name="threshold">Requested no-progress threshold.</param>
    public WedgeDetector(TimeSpan threshold)
    {
        var minimum = TimeSpan.FromSeconds(60);
        Threshold = threshold >= minimum ? threshold : minimum;
    }

    /// <summary>
    /// Classify one snapshot. Each call is self-contained: nothing is remembered between calls.
    /// </summary>
    /// <param name="state">State the driver currently reports.</param>
    /// <param name="demand">Caller-supplied pending-work counters and last-progress time.</param>
    /// <param name="utcNow">Current time, compared with <c>demand.LastProgressUtc</c>.</param>
    /// <returns>
    /// <see cref="WedgeVerdict.NotApplicable"/> when <paramref name="state"/> is not Healthy;
    /// <see cref="WedgeVerdict.Idle"/> when there is no pending work; otherwise
    /// <see cref="WedgeVerdict.Faulted"/> if the time since progress is strictly greater than
    /// <see cref="Threshold"/>, else <see cref="WedgeVerdict.Healthy"/>.
    /// </returns>
    public WedgeVerdict Classify(DriverState state, DemandSignal demand, DateTime utcNow)
    {
        if (state != DriverState.Healthy)
        {
            return WedgeVerdict.NotApplicable;
        }

        if (demand.HasPendingWork)
        {
            var stalledFor = utcNow - demand.LastProgressUtc;
            return stalledFor <= Threshold ? WedgeVerdict.Healthy : WedgeVerdict.Faulted;
        }

        return WedgeVerdict.Idle;
    }
}
|
||||
|
||||
/// <summary>
/// Demand snapshot supplied by the caller. The three counters are combined with OR: if any
/// of them is positive, work is outstanding and <see cref="LastProgressUtc"/> becomes relevant.
/// </summary>
/// <param name="BulkheadDepth">Polly bulkhead depth (in-flight capability calls).</param>
/// <param name="ActiveMonitoredItems">Number of live OPC UA MonitoredItems bound to this driver.</param>
/// <param name="QueuedHistoryReads">Pending historian-read requests the driver owes the server.</param>
/// <param name="LastProgressUtc">Last time the driver reported a successful unit of work (read, subscribe-ack, publish).</param>
public readonly record struct DemandSignal(
    int BulkheadDepth,
    int ActiveMonitoredItems,
    int QueuedHistoryReads,
    DateTime LastProgressUtc)
{
    /// <summary>True when at least one of the three counters is greater than zero.</summary>
    public bool HasPendingWork
    {
        get
        {
            // The largest counter is positive exactly when any counter is positive.
            var largest = Math.Max(BulkheadDepth, Math.Max(ActiveMonitoredItems, QueuedHistoryReads));
            return largest > 0;
        }
    }
}
|
||||
|
||||
/// <summary>Result of one <see cref="WedgeDetector.Classify"/> call.</summary>
public enum WedgeVerdict
{
    /// <summary>The driver was not Healthy in the first place, so wedge detection does not apply.</summary>
    NotApplicable = 0,

    /// <summary>The driver claims Healthy and has no pending work → stays Healthy.</summary>
    Idle = 1,

    /// <summary>The driver claims Healthy, has pending work, and progressed within the threshold → stays Healthy.</summary>
    Healthy = 2,

    /// <summary>The driver claims Healthy, has pending work, and has NOT progressed within the threshold → wedged.</summary>
    Faulted = 3,
}
|
||||
@@ -16,6 +16,11 @@
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Configuration\ZB.MOM.WW.OtOpcUa.Configuration.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Polly.Core" Version="8.6.6"/>
|
||||
<PackageReference Include="Serilog" Version="4.3.0"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Core.Tests"/>
|
||||
</ItemGroup>
|
||||
|
||||
161
src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/MelsecAddress.cs
Normal file
161
src/ZB.MOM.WW.OtOpcUa.Driver.Modbus/MelsecAddress.cs
Normal file
@@ -0,0 +1,161 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus;
|
||||
|
||||
/// <summary>
/// Selects the Mitsubishi MELSEC PLC family for the address-translation helpers. GX Works and
/// the CPU manuals write bit-device addresses (X, Y) in <b>hexadecimal</b> for the Q/L/iQ-R
/// families and in <b>octal</b> for the FX and iQ-F families (the AutomationDirect
/// DirectLOGIC convention). Confusing the two is the #1 source of MELSEC driver bugs: the
/// text <c>X20</c> is decimal 32 on a Q-series CPU but decimal 16 on an FX3U, so the helper
/// needs the family to interpret it.
/// </summary>
public enum MelsecFamily
{
    /// <summary>
    /// MELSEC-Q / MELSEC-L / MELSEC iQ-R. X and Y device numbers are read as
    /// <b>hexadecimal</b>; <c>X20</c> is decimal 32.
    /// </summary>
    Q_L_iQR = 0,

    /// <summary>
    /// MELSEC-F (FX3U / FX3GE / FX3G) and MELSEC iQ-F (FX5U). X and Y device numbers are read
    /// as <b>octal</b> (as on DirectLOGIC); <c>X20</c> is decimal 16. iQ-F has a GX Works3
    /// project toggle that can switch to decimal — a site using it should configure the
    /// tag's Address directly as a decimal PDU address and bypass this helper.
    /// </summary>
    F_iQF = 1,
}
|
||||
|
||||
/// <summary>
|
||||
/// Mitsubishi MELSEC address-translation helpers for the QJ71MT91 / LJ71MT91 / RJ71EN71 /
|
||||
/// iQ-R built-in / iQ-F / FX3U-ENET-P502 Modbus modules. MELSEC does NOT hard-wire
|
||||
/// Modbus-to-device mappings like DL260 does — every site configures its own "Modbus
|
||||
/// Device Assignment Parameter" block of up to 16 entries. The helpers here cover only
|
||||
/// the <b>address-notation</b> portion of the translation (hex X20 vs octal X20 + adding
|
||||
/// the bank base); the caller is still responsible for knowing the assignment-block
|
||||
/// offset for their site.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// See <c>docs/v2/mitsubishi.md</c> §device-assignment + §X-Y-hex-trap for the full
|
||||
/// matrix and primary-source citations.
|
||||
/// </remarks>
|
||||
public static class MelsecAddress
|
||||
{
|
||||
/// <summary>
/// Convert a MELSEC X-input address (for example <c>"X0"</c> or <c>"X10"</c>) into a 0-based
/// Modbus discrete-input address. The digits are interpreted according to the PLC family's
/// notation (hex or octal) and offset by the X-range base of the Modbus Device Assignment block.
/// </summary>
/// <param name="xAddress">MELSEC X address. The <c>X</c> prefix is optional and case-insensitive.</param>
/// <param name="family">PLC family; decides whether the digits are hex or octal.</param>
/// <param name="xBankBase">
/// 0-based Modbus DI address at which the assignment block places X0.
/// Typical default on QJ71MT91 sample projects: 0. Pass the site-specific value.
/// </param>
/// <returns>The 0-based Modbus discrete-input address.</returns>
public static ushort XInputToDiscrete(string xAddress, MelsecFamily family, ushort xBankBase = 0)
{
    var deviceDigits = StripPrefix(xAddress, 'X');
    return AddFamilyOffset(xBankBase, deviceDigits, family);
}
|
||||
|
||||
/// <summary>
/// Convert a MELSEC Y-output address into a 0-based Modbus coil address. Hex/octal parsing
/// follows the same rules as <see cref="XInputToDiscrete"/>.
/// </summary>
/// <param name="yAddress">MELSEC Y address. The <c>Y</c> prefix is optional and case-insensitive.</param>
/// <param name="family">PLC family; decides whether the digits are hex or octal.</param>
/// <param name="yBankBase">0-based Modbus coil address at which the assignment block places Y0.</param>
/// <returns>The 0-based Modbus coil address.</returns>
public static ushort YOutputToCoil(string yAddress, MelsecFamily family, ushort yBankBase = 0)
{
    var deviceDigits = StripPrefix(yAddress, 'Y');
    return AddFamilyOffset(yBankBase, deviceDigits, family);
}
|
||||
|
||||
/// <summary>
/// Convert a MELSEC M-relay (internal relay) address into a 0-based Modbus coil address.
/// M-addresses are <b>decimal</b> on every MELSEC family — unlike X/Y, which are hex on
/// Q/L/iQ-R. The bank base configured in the assignment block is added to the relay number.
/// </summary>
/// <param name="mAddress">MELSEC M address. The <c>M</c> prefix is optional and case-insensitive.</param>
/// <param name="mBankBase">0-based Modbus coil address at which the assignment block places M0.</param>
/// <returns>The 0-based Modbus coil address.</returns>
/// <exception cref="ArgumentException">The text after the prefix is not a decimal number that fits in 16 bits.</exception>
/// <exception cref="OverflowException">Relay number plus base exceeds 0xFFFF.</exception>
public static ushort MRelayToCoil(string mAddress, ushort mBankBase = 0)
{
    var relayDigits = StripPrefix(mAddress, 'M');
    if (ushort.TryParse(relayDigits, out var relayNumber))
    {
        // ushort + ushort is computed as int, so the sum cannot wrap before the range check.
        int coil = mBankBase + relayNumber;
        return coil <= ushort.MaxValue
            ? (ushort)coil
            : throw new OverflowException($"M-relay {mAddress} + base {mBankBase} exceeds 0xFFFF");
    }

    throw new ArgumentException(
        $"M-relay address '{mAddress}' is not a valid decimal integer", nameof(mAddress));
}
|
||||
|
||||
/// <summary>
|
||||
/// Translate a MELSEC D-register address (data register) to a 0-based Modbus holding
|
||||
/// register address. D-addresses are <b>decimal</b>. Default assignment convention is
|
||||
/// D0 → HR 0 (pass <paramref name="dBankBase"/> = 0); sites with shifted layouts pass
|
||||
/// their configured base.
|
||||
/// </summary>
|
||||
public static ushort DRegisterToHolding(string dAddress, ushort dBankBase = 0)
|
||||
{
|
||||
var digits = StripPrefix(dAddress, 'D');
|
||||
if (!ushort.TryParse(digits, out var offset))
|
||||
throw new ArgumentException(
|
||||
$"D-register address '{dAddress}' is not a valid decimal integer", nameof(dAddress));
|
||||
var result = dBankBase + offset;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"D-register {dAddress} + base {dBankBase} exceeds 0xFFFF");
|
||||
return (ushort)result;
|
||||
}
|
||||
|
||||
private static string StripPrefix(string address, char expectedPrefix)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(address))
|
||||
throw new ArgumentException("Address must not be empty", nameof(address));
|
||||
var s = address.Trim();
|
||||
if (s.Length > 0 && char.ToUpperInvariant(s[0]) == char.ToUpperInvariant(expectedPrefix))
|
||||
s = s.Substring(1);
|
||||
if (s.Length == 0)
|
||||
throw new ArgumentException($"Address '{address}' has no digits after prefix", nameof(address));
|
||||
return s;
|
||||
}
|
||||
|
||||
private static ushort AddFamilyOffset(ushort baseAddr, string digits, MelsecFamily family)
|
||||
{
|
||||
uint offset = family switch
|
||||
{
|
||||
MelsecFamily.Q_L_iQR => ParseHex(digits),
|
||||
MelsecFamily.F_iQF => ParseOctal(digits),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(family), family, "Unknown MELSEC family"),
|
||||
};
|
||||
var result = baseAddr + offset;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Address {baseAddr}+{offset} exceeds 0xFFFF");
|
||||
return (ushort)result;
|
||||
}
|
||||
|
||||
private static uint ParseHex(string digits)
|
||||
{
|
||||
uint result = 0;
|
||||
foreach (var ch in digits)
|
||||
{
|
||||
uint nibble;
|
||||
if (ch >= '0' && ch <= '9') nibble = (uint)(ch - '0');
|
||||
else if (ch >= 'A' && ch <= 'F') nibble = (uint)(ch - 'A' + 10);
|
||||
else if (ch >= 'a' && ch <= 'f') nibble = (uint)(ch - 'a' + 10);
|
||||
else throw new ArgumentException(
|
||||
$"Address contains non-hex digit '{ch}' — Q/L/iQ-R X/Y addresses are hexadecimal",
|
||||
nameof(digits));
|
||||
result = result * 16 + nibble;
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Hex address exceeds 0xFFFF");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static uint ParseOctal(string digits)
|
||||
{
|
||||
uint result = 0;
|
||||
foreach (var ch in digits)
|
||||
{
|
||||
if (ch < '0' || ch > '7')
|
||||
throw new ArgumentException(
|
||||
$"Address contains non-octal digit '{ch}' — FX/iQ-F X/Y addresses are octal (0-7)",
|
||||
nameof(digits));
|
||||
result = result * 8 + (uint)(ch - '0');
|
||||
if (result > ushort.MaxValue)
|
||||
throw new OverflowException($"Octal address exceeds 0xFFFF");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -115,7 +115,8 @@ public sealed class ModbusDriver(ModbusDriverOptions options, string driverInsta
|
||||
ArrayDim: null,
|
||||
SecurityClass: t.Writable ? SecurityClassification.Operate : SecurityClassification.ViewOnly,
|
||||
IsHistorized: false,
|
||||
IsAlarm: false));
|
||||
IsAlarm: false,
|
||||
WriteIdempotent: t.WriteIdempotent));
|
||||
}
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
@@ -92,6 +92,14 @@ public sealed class ModbusProbeOptions
|
||||
/// AutomationDirect DirectLOGIC (DL205/DL260) and a few legacy families pack the first
|
||||
/// character in the low byte instead — see <c>docs/v2/dl205.md</c> §strings.
|
||||
/// </param>
|
||||
/// <param name="WriteIdempotent">
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #44, #45, #143 — flag a tag as safe to replay on
|
||||
/// write timeout / failure. Default <c>false</c>; writes do not auto-retry. Safe candidates:
|
||||
/// holding-register set-points for analog values and configuration registers where the same
|
||||
/// value can be written again without side-effects. Unsafe: coils that drive edge-triggered
|
||||
/// actions (pulse outputs), counter-increment addresses on PLCs that treat writes as deltas,
|
||||
/// any BCD / counter register where repeat-writes advance state.
|
||||
/// </param>
|
||||
public sealed record ModbusTagDefinition(
|
||||
string Name,
|
||||
ModbusRegion Region,
|
||||
@@ -101,7 +109,8 @@ public sealed record ModbusTagDefinition(
|
||||
ModbusByteOrder ByteOrder = ModbusByteOrder.BigEndian,
|
||||
byte BitIndex = 0,
|
||||
ushort StringLength = 0,
|
||||
ModbusStringByteOrder StringByteOrder = ModbusStringByteOrder.HighByteFirst);
|
||||
ModbusStringByteOrder StringByteOrder = ModbusStringByteOrder.HighByteFirst,
|
||||
bool WriteIdempotent = false);
|
||||
|
||||
public enum ModbusRegion { Coils, DiscreteInputs, InputRegisters, HoldingRegisters }
|
||||
|
||||
|
||||
1384
src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
Normal file
1384
src/ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient/OpcUaClientDriver.cs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,180 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient;
|
||||
|
||||
/// <summary>
/// Settings for the OPC UA Client (gateway) driver, as documented in
/// <c>docs/v2/driver-specs.md</c> §8. Bound from the <c>DriverConfig</c> JSON when the
/// driver is registered with the driver host.
/// </summary>
/// <remarks>
/// The gateway driver connects to a REMOTE OPC UA server and re-exposes that server's
/// address space through the local OtOpcUa server — the reverse of the usual "server
/// exposes PLC data" flow. Tier A (pure managed, OPC Foundation reference SDK); universal
/// protections cover it.
/// </remarks>
public sealed class OpcUaClientDriverOptions
{
    /// <summary>
    /// Single remote endpoint URL, e.g. <c>opc.tcp://plc.internal:4840</c>. Shortcut for
    /// single-endpoint deployments: equivalent to an <see cref="EndpointUrls"/> list
    /// holding just this URL. If <see cref="EndpointUrls"/> is also populated, the list
    /// takes precedence and this value is ignored.
    /// </summary>
    public string EndpointUrl { get; init; } = "opc.tcp://localhost:4840";

    /// <summary>
    /// Failover candidates, in priority order. Each URL is tried in order at
    /// <see cref="OpcUaClientDriver.InitializeAsync"/> and again after a session drop;
    /// the first one that connects is used. Typical use-case: a hot-standby server pair
    /// (primary 4840 + backup 4841) serving the same address space. Leave unset or empty
    /// to fall back to <see cref="EndpointUrl"/>.
    /// </summary>
    public IReadOnlyList<string> EndpointUrls { get; init; } = Array.Empty<string>();

    /// <summary>
    /// Connect-attempt timeout applied to each endpoint during the failover sweep. Kept
    /// short so several dead servers do not exhaust the overall init budget, but long
    /// enough for a slow TLS handshake on a healthy server. Independent of
    /// <see cref="Timeout"/>, which governs steady-state operations.
    /// </summary>
    public TimeSpan PerEndpointConnectTimeout { get; init; } = TimeSpan.FromSeconds(3);

    /// <summary>
    /// Security policy an endpoint must offer to be selected, expressed as an
    /// <see cref="OpcUaSecurityPolicy"/> value. Matched against the suffix of
    /// <c>EndpointDescription.SecurityPolicyUri</c>; the driver connects to the first
    /// endpoint whose policy name AND mode (<see cref="SecurityMode"/>) both match. With
    /// <see cref="OpcUaSecurityPolicy.None"/> the driver picks any unsecured endpoint
    /// regardless of policy string.
    /// </summary>
    public OpcUaSecurityPolicy SecurityPolicy { get; init; } = OpcUaSecurityPolicy.None;

    /// <summary>Message security mode required of the selected endpoint.</summary>
    public OpcUaSecurityMode SecurityMode { get; init; } = OpcUaSecurityMode.None;

    /// <summary>Kind of user identity presented to the remote server.</summary>
    public OpcUaAuthType AuthType { get; init; } = OpcUaAuthType.Anonymous;

    /// <summary>User name; needed only when <see cref="AuthType"/> is <see cref="OpcUaAuthType.Username"/>.</summary>
    public string? Username { get; init; }

    /// <summary>Password; needed only when <see cref="AuthType"/> is <see cref="OpcUaAuthType.Username"/>.</summary>
    public string? Password { get; init; }

    /// <summary>
    /// Path on disk to the user-identity certificate (PFX/PEM). Required when
    /// <see cref="AuthType"/> is <see cref="OpcUaAuthType.Certificate"/>. The driver loads
    /// the certificate and private key; the remote server validates it against its
    /// <c>TrustedUserCertificates</c> store to authenticate the session's user token.
    /// Leave unset to use the driver's application-instance certificate as the user
    /// token (not typical — most deployments have a separate user cert).
    /// </summary>
    public string? UserCertificatePath { get; init; }

    /// <summary>
    /// Password for <see cref="UserCertificatePath"/> when it is a protected PFX. PEM
    /// files generally keep their password on the adjacent key file, so this setting
    /// only applies to password-locked PFX.
    /// </summary>
    public string? UserCertificatePassword { get; init; }

    /// <summary>Session timeout negotiated with the server. Default 120s per driver-specs.md §8.</summary>
    public TimeSpan SessionTimeout { get; init; } = TimeSpan.FromMinutes(2);

    /// <summary>Interval between client-side keep-alives.</summary>
    public TimeSpan KeepAliveInterval { get; init; } = TimeSpan.FromSeconds(5);

    /// <summary>First delay before reconnecting after the session drops.</summary>
    public TimeSpan ReconnectPeriod { get; init; } = TimeSpan.FromSeconds(5);

    /// <summary>
    /// <c>true</c> makes the driver accept any self-signed / untrusted server
    /// certificate. Dev-only — keep <c>false</c> in production so MITM attacks against
    /// the opc.tcp channel fail closed.
    /// </summary>
    public bool AutoAcceptCertificates { get; init; }

    /// <summary>
    /// Application URI reported during session creation. It must match the
    /// subject-alt-name on the client certificate when one is used, which is why it is
    /// configurable rather than hard-coded.
    /// </summary>
    public string ApplicationUri { get; init; } = "urn:localhost:OtOpcUa:GatewayClient";

    /// <summary>
    /// Diagnostic session name sent to the remote server. It appears in the remote
    /// server's session list so operators can tell which gateway instance is calling.
    /// </summary>
    public string SessionName { get; init; } = "OtOpcUa-Gateway";

    /// <summary>Connect + per-operation timeout.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// NodeId of the subtree to mirror. <c>null</c> (the default) means
    /// <c>ObjectsFolder</c> (i=85). Point it at a narrower root to limit what the driver
    /// exposes locally — useful when the remote server has tens of thousands of nodes
    /// and only a subset is needed downstream.
    /// </summary>
    public string? BrowseRoot { get; init; }

    /// <summary>
    /// Upper bound on nodes discovered by <c>DiscoverAsync</c>. Default 10_000 — bounds
    /// memory on runaway remote servers without being so low that normal deployments
    /// hit it. On reaching the cap, discovery stops, a warning is written to the driver
    /// health surface, and the partially-discovered tree is still projected into the
    /// local address space.
    /// </summary>
    public int MaxDiscoveredNodes { get; init; } = 10_000;

    /// <summary>
    /// Maximum hierarchical browse depth. Default 10 — deep enough for realistic OPC UA
    /// information models, shallow enough that cyclic graphs cannot keep the browse
    /// running forever.
    /// </summary>
    public int MaxBrowseDepth { get; init; } = 10;
}
|
||||
|
||||
/// <summary>Message security mode requested for the OPC UA channel.</summary>
public enum OpcUaSecurityMode
{
    None = 0,
    Sign = 1,
    SignAndEncrypt = 2,
}
|
||||
|
||||
/// <summary>
/// Security policies the driver knows about. Each member corresponds to a standard
/// <c>http://opcfoundation.org/UA/SecurityPolicy#</c> URI suffix that the SDK uses for
/// endpoint matching.
/// </summary>
/// <remarks>
/// <see cref="Basic128Rsa15"/> and <see cref="Basic256"/> are <b>deprecated</b> per OPC UA
/// spec v1.04 and are kept only for brownfield interop with older servers. New
/// deployments should prefer <see cref="Basic256Sha256"/>,
/// <see cref="Aes128_Sha256_RsaOaep"/>, or <see cref="Aes256_Sha256_RsaPss"/>.
/// </remarks>
public enum OpcUaSecurityPolicy
{
    /// <summary>No security. Unsigned, unencrypted wire.</summary>
    None = 0,

    /// <summary>Deprecated (OPC UA 1.04). Retained for legacy server interop.</summary>
    Basic128Rsa15 = 1,

    /// <summary>Deprecated (OPC UA 1.04). Retained for legacy server interop.</summary>
    Basic256 = 2,

    /// <summary>Recommended baseline for current deployments.</summary>
    Basic256Sha256 = 3,

    /// <summary>Current OPC UA policy; AES-128 + SHA-256 + RSA-OAEP.</summary>
    Aes128_Sha256_RsaOaep = 4,

    /// <summary>Current OPC UA policy; AES-256 + SHA-256 + RSA-PSS.</summary>
    Aes256_Sha256_RsaPss = 5,
}
|
||||
|
||||
/// <summary>Kind of user identity the driver presents to the remote server.</summary>
public enum OpcUaAuthType
{
    Anonymous = 0,
    Username = 1,
    Certificate = 2,
}
|
||||
@@ -0,0 +1,28 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<LangVersion>latest</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<NoWarn>$(NoWarn);CS1591</NoWarn>
|
||||
<RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient</RootNamespace>
|
||||
<AssemblyName>ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient</AssemblyName>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Core.Abstractions\ZB.MOM.WW.OtOpcUa.Core.Abstractions.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Client" Version="1.5.378.106"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Configuration" Version="1.5.378.106"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests"/>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
216
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7AddressParser.cs
Normal file
216
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7AddressParser.cs
Normal file
@@ -0,0 +1,216 @@
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7;
|
||||
|
||||
/// <summary>
/// Siemens S7 memory area. The tag-address parser resolves every S7 tag string to exactly
/// one of these plus an offset. The numeric values are not intended to be the on-wire S7
/// area codes — S7.Net has its own <c>DataType</c> enum (<c>DataBlock</c>, <c>Memory</c>,
/// <c>Input</c>, <c>Output</c>, <c>Timer</c>, <c>Counter</c>), so the adapter layer translates.
/// </summary>
public enum S7Area
{
    /// <summary>DB (data block).</summary>
    DataBlock = 0,

    /// <summary>M (Merker / marker byte).</summary>
    Memory = 1,

    /// <summary>I (process-image input).</summary>
    Input = 2,

    /// <summary>Q (process-image output).</summary>
    Output = 3,

    /// <summary>T (timer).</summary>
    Timer = 4,

    /// <summary>C (counter).</summary>
    Counter = 5,
}
|
||||
|
||||
/// <summary>
/// Access width of a DB / M / I / Q address. Timers and counters are always treated as
/// opaque 16-bit values and cannot be given a size suffix.
/// </summary>
public enum S7Size
{
    /// <summary>X — single bit.</summary>
    Bit = 0,

    /// <summary>B — byte.</summary>
    Byte = 1,

    /// <summary>W — 16-bit word.</summary>
    Word = 2,

    /// <summary>D — 32-bit double word.</summary>
    DWord = 3,
}
|
||||
|
||||
/// <summary>
/// Structured result of parsing an S7 tag-address string; created by
/// <see cref="S7AddressParser.Parse"/>.
/// </summary>
/// <param name="Area">Memory area the address refers to (DB, M, I, Q, T, C).</param>
/// <param name="DbNumber">
/// Data block number. Only meaningful when <paramref name="Area"/> is
/// <see cref="S7Area.DataBlock"/>.
/// </param>
/// <param name="Size">Access width. Always <see cref="S7Size.Word"/> for Timer and Counter.</param>
/// <param name="ByteOffset">
/// Byte offset into the area for DB/M/I/Q addresses; the timer/counter number for T/C.
/// </param>
/// <param name="BitOffset">
/// Bit position 0-7 when <paramref name="Size"/> is <see cref="S7Size.Bit"/>; 0 otherwise.
/// </param>
public readonly record struct S7ParsedAddress(S7Area Area, int DbNumber, S7Size Size, int ByteOffset, int BitOffset);
|
||||
|
||||
/// <summary>
/// Parses Siemens S7 address strings into <see cref="S7ParsedAddress"/>. Accepts the
/// Siemens TIA-Portal / STEP 7 Classic syntax documented in <c>docs/v2/driver-specs.md</c> §5:
/// <list type="bullet">
/// <item><c>DB{n}.DB{X|B|W|D}{offset}[.bit]</c> — e.g. <c>DB1.DBX0.0</c>, <c>DB1.DBW0</c>, <c>DB1.DBD4</c></item>
/// <item><c>M{B|W|D}{offset}</c> or <c>M{offset}.{bit}</c> — e.g. <c>MB0</c>, <c>MW0</c>, <c>MD4</c>, <c>M0.0</c></item>
/// <item><c>I{B|W|D}{offset}</c> or <c>I{offset}.{bit}</c> — e.g. <c>IB0</c>, <c>IW0</c>, <c>ID0</c>, <c>I0.0</c></item>
/// <item><c>Q{B|W|D}{offset}</c> or <c>Q{offset}.{bit}</c> — e.g. <c>QB0</c>, <c>QW0</c>, <c>QD0</c>, <c>Q0.0</c></item>
/// <item><c>T{n}</c> — e.g. <c>T0</c>, <c>T15</c></item>
/// <item><c>C{n}</c> — e.g. <c>C0</c>, <c>C10</c></item>
/// </list>
/// Grammar is case-insensitive. Leading/trailing whitespace tolerated. Bit specifiers
/// must be 0-7; byte offsets must be non-negative; DB numbers must be >= 1.
/// </summary>
/// <remarks>
/// Parse is deliberately strict — the parser rejects syntactic garbage up-front so a bad
/// tag config fails at driver init time instead of surfacing as a misleading
/// <c>BadInternalError</c> on every Read against that tag. Every numeric field must be
/// plain ASCII digits: signs, embedded whitespace and culture-specific digit grouping are
/// rejected.
/// </remarks>
public static class S7AddressParser
{
    /// <summary>
    /// Parse an S7 address. Throws <see cref="FormatException"/> on any syntax error with
    /// the offending input echoed in the message so operators can correlate to the tag
    /// config that produced the fault.
    /// </summary>
    /// <param name="address">Address text; null, empty or blank input is rejected.</param>
    /// <returns>The parsed area, DB number, size, byte offset and bit offset.</returns>
    /// <exception cref="FormatException">The input does not match the supported grammar.</exception>
    public static S7ParsedAddress Parse(string address)
    {
        if (string.IsNullOrWhiteSpace(address))
            throw new FormatException("S7 address must not be empty");
        var s = address.Trim().ToUpperInvariant();

        // --- DB{n}.DB{X|B|W|D}{offset}[.bit] ---
        // Anything starting with "DB" can only be a data-block address (no other area
        // starts with 'D'), so it is handled entirely by the DB parser. That keeps the
        // error specific instead of a misleading "unknown area 'D'".
        if (s.StartsWith("DB", StringComparison.Ordinal))
            return ParseDataBlock(s, address);

        if (s.Length < 2)
            throw new FormatException($"S7 address '{address}' is too short to parse");

        var areaChar = s[0];
        var rest = s.Substring(1);

        switch (areaChar)
        {
            case 'M': return ParseMIQ(S7Area.Memory, rest, address);
            case 'I': return ParseMIQ(S7Area.Input, rest, address);
            case 'Q': return ParseMIQ(S7Area.Output, rest, address);
            case 'T': return ParseTimerOrCounter(S7Area.Timer, rest, address);
            case 'C': return ParseTimerOrCounter(S7Area.Counter, rest, address);
            default:
                throw new FormatException($"S7 address '{address}' starts with unknown area '{areaChar}' (expected DB/M/I/Q/T/C)");
        }
    }

    /// <summary>
    /// Try-parse variant for callers that can't afford an exception on bad input (e.g.
    /// config validation pages in the Admin UI). Returns <c>false</c> for any input that
    /// would throw from <see cref="Parse"/>.
    /// </summary>
    /// <param name="address">Address text to validate.</param>
    /// <param name="result">Parsed address on success; <c>default</c> on failure.</param>
    /// <returns><c>true</c> when <paramref name="address"/> parsed successfully.</returns>
    public static bool TryParse(string address, out S7ParsedAddress result)
    {
        try
        {
            result = Parse(address);
            return true;
        }
        catch (FormatException)
        {
            result = default;
            return false;
        }
    }

    /// <summary>
    /// Digits-only, culture-invariant integer parse. <c>NumberStyles.None</c> rejects
    /// signs and whitespace, so inputs such as <c>+1</c>, <c>-0</c> or <c>" 5"</c> fail.
    /// Also fails on an empty span and on values that overflow <see cref="int"/>.
    /// </summary>
    private static bool TryParseNumber(ReadOnlySpan<char> digits, out int value) =>
        int.TryParse(
            digits,
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out value);

    /// <summary>
    /// Parses <c>DB{n}.DB{X|B|W|D}{offset}[.bit]</c>.
    /// </summary>
    /// <param name="s">Trimmed, upper-cased address; the caller guarantees it starts with <c>DB</c>.</param>
    /// <param name="original">Address exactly as supplied, echoed in the missing-dot error.</param>
    private static S7ParsedAddress ParseDataBlock(string s, string original)
    {
        // Split on first '.': left side must be DB{n}, right side DB{X|B|W|D}{offset}[.bit]
        var dot = s.IndexOf('.');
        if (dot < 0)
            throw new FormatException(
                $"S7 DB address '{original}' must have the form DB{{n}}.DB{{X|B|W|D}}{{offset}}[.bit]");

        var head = s.Substring(0, dot);   // DB{n}
        var tail = s.Substring(dot + 1);  // DB{X|B|W|D}{offset}[.bit]

        // head always has at least the two "DB" characters; an empty number span fails the parse.
        if (!TryParseNumber(head.AsSpan(2), out var dbNumber) || dbNumber < 1)
            throw new FormatException($"S7 DB number in '{s}' must be a positive integer");

        if (!tail.StartsWith("DB", StringComparison.Ordinal) || tail.Length < 4)
            throw new FormatException($"S7 DB address tail '{tail}' must start with DB{{X|B|W|D}}");

        var sizeChar = tail[2];
        var size = sizeChar switch
        {
            'X' => S7Size.Bit,
            'B' => S7Size.Byte,
            'W' => S7Size.Word,
            'D' => S7Size.DWord,
            _ => throw new FormatException($"S7 DB size '{sizeChar}' in '{s}' must be X/B/W/D"),
        };

        // Offset digits start right after "DB" + the size letter.
        var (byteOffset, bitOffset) = ParseOffsetAndOptionalBit(tail, 3, size, s);
        return new S7ParsedAddress(S7Area.DataBlock, dbNumber, size, byteOffset, bitOffset);
    }

    /// <summary>
    /// Parses the part after the area letter of an M / I / Q address:
    /// <c>{B|W|D}{offset}</c> or <c>{offset}.{bit}</c>.
    /// </summary>
    private static S7ParsedAddress ParseMIQ(S7Area area, string rest, string original)
    {
        if (rest.Length == 0)
            throw new FormatException($"S7 address '{original}' has no offset");

        S7Size size;
        int offsetStart;
        switch (rest[0])
        {
            case 'B': size = S7Size.Byte; offsetStart = 1; break;
            case 'W': size = S7Size.Word; offsetStart = 1; break;
            case 'D': size = S7Size.DWord; offsetStart = 1; break;
            default:
                // No size prefix => bit-level address requires explicit .bit. Size stays Bit;
                // ParseOffsetAndOptionalBit will demand the dot.
                size = S7Size.Bit;
                offsetStart = 0;
                break;
        }

        var (byteOffset, bitOffset) = ParseOffsetAndOptionalBit(rest, offsetStart, size, original);
        return new S7ParsedAddress(area, DbNumber: 0, size, byteOffset, bitOffset);
    }

    /// <summary>
    /// Parses the number of a <c>T{n}</c> / <c>C{n}</c> address. The number is returned in
    /// <see cref="S7ParsedAddress.ByteOffset"/> and the size is always <see cref="S7Size.Word"/>.
    /// </summary>
    private static S7ParsedAddress ParseTimerOrCounter(S7Area area, string rest, string original)
    {
        if (rest.Length == 0)
            throw new FormatException($"S7 address '{original}' has no {area} number");
        if (!TryParseNumber(rest, out var number))
            throw new FormatException($"S7 {area} number in '{original}' must be a non-negative integer");
        return new S7ParsedAddress(area, DbNumber: 0, S7Size.Word, number, BitOffset: 0);
    }

    /// <summary>
    /// Parses <c>{offset}</c> or <c>{offset}.{bit}</c> starting at index
    /// <paramref name="start"/> of <paramref name="s"/>.
    /// </summary>
    /// <param name="s">Text containing the offset.</param>
    /// <param name="start">Index of the first offset digit.</param>
    /// <param name="size">
    /// Access width already determined by the caller; <see cref="S7Size.Bit"/> requires a
    /// bit suffix and every other size forbids one.
    /// </param>
    /// <param name="original">Address text echoed in error messages.</param>
    /// <returns>The byte offset and the bit offset (0 when there is no bit suffix).</returns>
    private static (int byteOffset, int bitOffset) ParseOffsetAndOptionalBit(
        string s, int start, S7Size size, string original)
    {
        var offsetEnd = start;
        while (offsetEnd < s.Length && s[offsetEnd] >= '0' && s[offsetEnd] <= '9')
            offsetEnd++;
        if (offsetEnd == start)
            throw new FormatException($"S7 address '{original}' has no byte-offset digits");

        // The span holds ASCII digits only, so the parse can fail only on int overflow.
        if (!TryParseNumber(s.AsSpan(start, offsetEnd - start), out var byteOffset))
            throw new FormatException($"S7 byte offset in '{original}' must be non-negative");

        // No bit-suffix: done unless size is Bit, which requires one.
        if (offsetEnd == s.Length)
        {
            if (size == S7Size.Bit)
                throw new FormatException($"S7 address '{original}' needs a .{{bit}} suffix for bit access");
            return (byteOffset, 0);
        }

        if (s[offsetEnd] != '.')
            throw new FormatException($"S7 address '{original}' has unexpected character after offset");

        if (size != S7Size.Bit)
            throw new FormatException($"S7 address '{original}' has a bit suffix but the size is {size} — bit access needs X (DB) or no size prefix (M/I/Q)");

        // Everything after the dot must be the bit number; trailing junk fails the parse.
        if (!TryParseNumber(s.AsSpan(offsetEnd + 1), out var bitOffset) || bitOffset > 7)
            throw new FormatException($"S7 bit offset in '{original}' must be 0-7");

        return (byteOffset, bitOffset);
    }
}
|
||||
514
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7Driver.cs
Normal file
514
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7Driver.cs
Normal file
@@ -0,0 +1,514 @@
|
||||
using S7.Net;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7;
|
||||
|
||||
/// <summary>
|
||||
/// Siemens S7 native driver — speaks S7comm over ISO-on-TCP (port 102) via the S7netplus
|
||||
/// library. First implementation of <see cref="IDriver"/> for an in-process .NET Standard
|
||||
/// PLC protocol that is NOT Modbus, validating that the v2 driver-capability interfaces
|
||||
/// generalize beyond Modbus + Galaxy.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
/// PR 62 shipped the scaffold: <see cref="IDriver"/> only (Initialize / Reinitialize /
/// Shutdown / GetHealth). The class now also implements <see cref="ITagDiscovery"/>,
/// <see cref="IReadable"/>, <see cref="IWritable"/>, <see cref="ISubscribable"/> and
/// <see cref="IHostConnectivityProbe"/>, which were planned for PRs 63-65 on top of
/// the address parser (PR 63).
/// </para>
|
||||
/// <para>
|
||||
/// <b>Single-connection policy</b>: S7netplus documented pattern is one
|
||||
/// <c>Plc</c> instance per PLC, serialized with a <see cref="SemaphoreSlim"/>.
|
||||
/// Parallelising reads against a single S7 CPU doesn't help — the CPU scans the
|
||||
/// communication mailbox at most once per cycle (2-10 ms) and queues concurrent
|
||||
/// requests wire-side anyway. Multiple client-side connections just waste the CPU's
|
||||
/// 8-64 connection-resource budget.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class S7Driver(S7DriverOptions options, string driverInstanceId)
|
||||
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IHostConnectivityProbe, IDisposable, IAsyncDisposable
|
||||
{
|
||||
// ---- ISubscribable + IHostConnectivityProbe state ----
|
||||
|
||||
private readonly System.Collections.Concurrent.ConcurrentDictionary<long, SubscriptionState> _subscriptions = new();
|
||||
private long _nextSubscriptionId;
|
||||
private readonly object _probeLock = new();
|
||||
private HostState _hostState = HostState.Unknown;
|
||||
private DateTime _hostStateChangedUtc = DateTime.UtcNow;
|
||||
private CancellationTokenSource? _probeCts;
|
||||
|
||||
public event EventHandler<DataChangeEventArgs>? OnDataChange;
|
||||
public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;
|
||||
|
||||
/// <summary>OPC UA StatusCode used when the tag name isn't in the driver's tag map.</summary>
|
||||
private const uint StatusBadNodeIdUnknown = 0x80340000u;
|
||||
/// <summary>OPC UA StatusCode used when the tag's data type isn't implemented yet.</summary>
|
||||
private const uint StatusBadNotSupported = 0x803D0000u;
|
||||
/// <summary>OPC UA StatusCode used when the tag is declared read-only.</summary>
|
||||
private const uint StatusBadNotWritable = 0x803B0000u;
|
||||
/// <summary>OPC UA StatusCode used when write fails validation (e.g. out-of-range value).</summary>
|
||||
private const uint StatusBadInternalError = 0x80020000u;
|
||||
/// <summary>OPC UA StatusCode used for socket / timeout / protocol-layer faults.</summary>
|
||||
private const uint StatusBadCommunicationError = 0x80050000u;
|
||||
/// <summary>OPC UA StatusCode used when S7 returns <c>ErrorCode.WrongCPU</c> / PUT/GET disabled.</summary>
|
||||
private const uint StatusBadDeviceFailure = 0x80550000u;
|
||||
|
||||
private readonly Dictionary<string, S7TagDefinition> _tagsByName = new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Dictionary<string, S7ParsedAddress> _parsedByName = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
private readonly S7DriverOptions _options = options;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
|
||||
/// <summary>
|
||||
/// Per-connection gate. Internal so PRs 63-65 (read/write/subscribe) can serialize on
|
||||
/// the same semaphore without exposing it publicly. Single-connection-per-PLC is a
|
||||
/// hard requirement of S7netplus — see class remarks.
|
||||
/// </summary>
|
||||
internal SemaphoreSlim Gate => _gate;
|
||||
|
||||
/// <summary>
|
||||
/// Active S7.Net PLC connection. Null until <see cref="InitializeAsync"/> returns; null
|
||||
/// after <see cref="ShutdownAsync"/>. Read-only outside this class; PR 64's Read/Write
|
||||
/// will take the <see cref="_gate"/> before touching it.
|
||||
/// </summary>
|
||||
internal Plc? Plc { get; private set; }
|
||||
|
||||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||||
private bool _disposed;
|
||||
|
||||
public string DriverInstanceId => driverInstanceId;
|
||||
public string DriverType => "S7";
|
||||
|
||||
/// <summary>
/// Opens the PLC connection, parses every configured tag address, marks the driver
/// healthy and, when enabled, starts the connectivity probe loop.
/// </summary>
/// <param name="driverConfigJson">
/// Not read by this method; connection settings and tags come from the options passed
/// to the constructor.
/// </param>
/// <param name="cancellationToken">
/// Cancels the connection attempt. The attempt is additionally bounded by the
/// configured <c>Timeout</c>.
/// </param>
/// <remarks>
/// On any failure the connection opened by this call is closed, <see cref="Plc"/> is
/// reset to <c>null</c>, health is set to <c>Faulted</c> with the exception message, and
/// the exception is rethrown.
/// </remarks>
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    _health = new DriverHealth(DriverState.Initializing, null, null);

    // Tracked outside the try block so the catch can close a connection that failed
    // before it was published through the Plc property (e.g. OpenAsync timed out).
    Plc? pending = null;
    try
    {
        pending = new Plc(_options.CpuType, _options.Host, _options.Rack, _options.Slot);

        // S7netplus writes timeouts into the underlying TcpClient via Plc.WriteTimeout /
        // Plc.ReadTimeout (milliseconds). Set before OpenAsync so the handshake itself
        // honours the bound.
        var timeoutMs = (int)_options.Timeout.TotalMilliseconds;
        pending.WriteTimeout = timeoutMs;
        pending.ReadTimeout = timeoutMs;

        using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        cts.CancelAfter(_options.Timeout);
        await pending.OpenAsync(cts.Token).ConfigureAwait(false);

        Plc = pending;

        // Parse every tag's address once at init so config typos fail fast here instead
        // of surfacing as BadInternalError on every Read against the bad tag. The parser
        // also rejects bit-offset > 7, DB 0, unknown area letters, etc.
        _tagsByName.Clear();
        _parsedByName.Clear();
        foreach (var t in _options.Tags)
        {
            var parsed = S7AddressParser.Parse(t.Address); // throws FormatException
            _tagsByName[t.Name] = t;
            _parsedByName[t.Name] = parsed;
        }

        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);

        // Kick off the probe loop once the connection is up. Initial HostState stays
        // Unknown until the first probe tick succeeds — avoids broadcasting a premature
        // Running transition before any PDU round-trip has happened.
        if (_options.Probe.Enabled)
        {
            var probeCts = new CancellationTokenSource();
            _probeCts = probeCts;

            // Capture the token now: the lambda runs later on the thread pool, and by then
            // a concurrent shutdown may already have nulled the _probeCts field.
            var probeToken = probeCts.Token;
            _ = Task.Run(() => ProbeLoopAsync(probeToken), probeToken);
        }
    }
    catch (Exception ex)
    {
        // Close the connection created by this call even if it never reached the Plc
        // property, so a retry from the caller doesn't leak the TcpClient. Close() is
        // best-effort here; a failure while tearing down is deliberately ignored.
        try { pending?.Close(); } catch { /* best-effort cleanup */ }

        // A connection left over from an earlier initialization is closed as well.
        if (!ReferenceEquals(Plc, pending))
        {
            try { Plc?.Close(); } catch { /* best-effort cleanup */ }
        }

        Plc = null;
        _health = new DriverHealth(DriverState.Faulted, null, ex.Message);
        throw;
    }
}
|
||||
|
||||
/// <summary>
/// Restarts the driver by running <see cref="ShutdownAsync"/> followed by
/// <see cref="InitializeAsync"/> with the same arguments.
/// </summary>
/// <param name="driverConfigJson">Forwarded unchanged to <see cref="InitializeAsync"/>.</param>
/// <param name="cancellationToken">Forwarded to both the shutdown and the initialize step.</param>
public async Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    // Tear down first; initialization only starts once shutdown has completed.
    var teardown = ShutdownAsync(cancellationToken);
    await teardown.ConfigureAwait(false);

    var startup = InitializeAsync(driverConfigJson, cancellationToken);
    await startup.ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Stops the probe loop and all subscription poll loops, closes the PLC connection and
/// resets health to <c>Unknown</c> while keeping the last successful read timestamp.
/// </summary>
/// <param name="cancellationToken">Not consulted; the teardown is synchronous.</param>
/// <returns>An already-completed task.</returns>
public Task ShutdownAsync(CancellationToken cancellationToken)
{
    // 1. Probe loop.
    var probe = _probeCts;
    if (probe is not null)
    {
        try { probe.Cancel(); } catch { }
        probe.Dispose();
    }
    _probeCts = null;

    // 2. Every active subscription's poll loop.
    foreach (var subscription in _subscriptions.Values)
    {
        var pollCts = subscription.Cts;
        try { pollCts.Cancel(); } catch { }
        pollCts.Dispose();
    }
    _subscriptions.Clear();

    // 3. The connection itself.
    var connection = Plc;
    if (connection is not null)
    {
        try { connection.Close(); } catch { /* best-effort — tearing down anyway */ }
    }
    Plc = null;

    var lastGoodRead = _health.LastSuccessfulRead;
    _health = new DriverHealth(DriverState.Unknown, lastGoodRead, null);
    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>Returns the most recently recorded health snapshot.</summary>
public DriverHealth GetHealth()
{
    return _health;
}
|
||||
|
||||
/// <summary>
|
||||
/// Approximate memory footprint. The Plc instance + one 240-960 byte PDU buffer is
|
||||
/// under 4 KB; return 0 because the <see cref="IDriver"/> contract asks for a
|
||||
/// driver-attributable growth number and S7.Net doesn't expose one.
|
||||
/// </summary>
|
||||
public long GetMemoryFootprint()
{
    // Constant: no driver-attributable growth figure is tracked.
    const long NoTrackedGrowth = 0;
    return NoTrackedGrowth;
}
|
||||
|
||||
/// <summary>No-op: returns an already-completed task.</summary>
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken)
{
    return Task.CompletedTask;
}
|
||||
|
||||
// ---- IReadable ----
|
||||
|
||||
/// <summary>
/// Reads the requested tags one at a time while holding the connection gate.
/// </summary>
/// <param name="fullReferences">Tag names to read; results come back in the same order.</param>
/// <param name="cancellationToken">Cancels the wait for the gate and any in-flight read.</param>
/// <returns>
/// One snapshot per reference. Per-tag failures are reported through the snapshot's status
/// code (unknown tag, unsupported type, device failure, communication error) rather than
/// by throwing.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fullReferences"/> is null.</exception>
/// <exception cref="InvalidOperationException">The driver has no open connection.</exception>
/// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
public async Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    ArgumentNullException.ThrowIfNull(fullReferences);

    var plc = RequirePlc();
    var now = DateTime.UtcNow;
    var results = new DataValueSnapshot[fullReferences.Count];

    await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
    try
    {
        for (var i = 0; i < fullReferences.Count; i++)
        {
            var name = fullReferences[i];
            if (!_tagsByName.TryGetValue(name, out var tag))
            {
                results[i] = new DataValueSnapshot(null, StatusBadNodeIdUnknown, null, now);
                continue;
            }

            try
            {
                var value = await ReadOneAsync(plc, tag, cancellationToken).ConfigureAwait(false);
                results[i] = new DataValueSnapshot(value, 0u, now, now);
                _health = new DriverHealth(DriverState.Healthy, now, null);
            }
            catch (NotSupportedException)
            {
                // Data type not implemented yet: a per-tag condition, health is left untouched.
                results[i] = new DataValueSnapshot(null, StatusBadNotSupported, null, now);
            }
            catch (global::S7.Net.PlcException pex)
            {
                // S7.Net's PlcException carries an ErrorCode; PUT/GET-disabled on
                // S7-1200/1500 surfaces here. Map to BadDeviceFailure so operators see a
                // device-config problem (toggle PUT/GET in TIA Portal) rather than a
                // transient fault — per driver-specs.md §5.
                results[i] = new DataValueSnapshot(null, StatusBadDeviceFailure, null, now);
                _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, pex.Message);
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // The caller gave up; that is not a communication fault and must not
                // degrade health or be reported as a bad read.
                throw;
            }
            catch (Exception ex)
            {
                results[i] = new DataValueSnapshot(null, StatusBadCommunicationError, null, now);
                _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, ex.Message);
            }
        }
    }
    finally
    {
        _gate.Release();
    }

    return results;
}
|
||||
|
||||
/// <summary>
/// Reads one tag through S7.Net's string-address API and reinterprets the raw boxed value
/// as the tag's declared <see cref="S7DataType"/>.
/// </summary>
/// <exception cref="NotSupportedException">The declared data type has no read support yet.</exception>
/// <exception cref="System.IO.InvalidDataException">
/// S7.Net returned null, or the declared type, parsed address size and returned CLR type
/// do not form a supported combination.
/// </exception>
private async Task<object> ReadOneAsync(global::S7.Net.Plc plc, S7TagDefinition tag, CancellationToken ct)
{
    var parsed = _parsedByName[tag.Name];

    // Per the original author's note, S7.Net boxes by size suffix (DBX=bool, DBB=byte,
    // DBW=ushort, DBD=uint); the declared data type only decides how that raw value is
    // reinterpreted, so no second PLC round-trip is needed.
    object? boxed = await plc.ReadAsync(tag.Address, ct).ConfigureAwait(false);
    if (boxed is null)
    {
        throw new System.IO.InvalidDataException($"S7.Net returned null for '{tag.Address}'");
    }

    switch (tag.DataType)
    {
        case S7DataType.Bool when parsed.Size == S7Size.Bit && boxed is bool bit:
            return bit;
        case S7DataType.Byte when parsed.Size == S7Size.Byte && boxed is byte octet:
            return octet;
        case S7DataType.UInt16 when parsed.Size == S7Size.Word && boxed is ushort unsignedWord:
            return unsignedWord;
        case S7DataType.Int16 when parsed.Size == S7Size.Word && boxed is ushort signedWordBits:
            return unchecked((short)signedWordBits);
        case S7DataType.UInt32 when parsed.Size == S7Size.DWord && boxed is uint unsignedDword:
            return unsignedDword;
        case S7DataType.Int32 when parsed.Size == S7Size.DWord && boxed is uint signedDwordBits:
            return unchecked((int)signedDwordBits);
        case S7DataType.Float32 when parsed.Size == S7Size.DWord && boxed is uint floatBits:
            return BitConverter.UInt32BitsToSingle(floatBits);

        case S7DataType.Int64:
            throw new NotSupportedException("S7 Int64 reads land in a follow-up PR");
        case S7DataType.UInt64:
            throw new NotSupportedException("S7 UInt64 reads land in a follow-up PR");
        case S7DataType.Float64:
            throw new NotSupportedException("S7 Float64 (LReal) reads land in a follow-up PR");
        case S7DataType.String:
            throw new NotSupportedException("S7 STRING reads land in a follow-up PR");
        case S7DataType.DateTime:
            throw new NotSupportedException("S7 DateTime reads land in a follow-up PR");
    }

    // Supported type, but the address size or the returned CLR type did not line up.
    throw new System.IO.InvalidDataException(
        $"S7 Read type-mismatch: tag '{tag.Name}' declared {tag.DataType} but address '{tag.Address}' " +
        $"parsed as Size={parsed.Size}; S7.Net returned {boxed.GetType().Name}");
}
|
||||
|
||||
// ---- IWritable ----
|
||||
|
||||
/// <summary>
/// Applies the requested writes one at a time while holding the connection gate.
/// </summary>
/// <param name="writes">Write requests; results come back in the same order.</param>
/// <param name="cancellationToken">Cancels the wait for the gate and is passed to each write.</param>
/// <returns>
/// One result per request. Unknown tags, read-only tags and per-write failures are
/// reported through the result's status code rather than by throwing.
/// </returns>
/// <exception cref="InvalidOperationException">The driver has no open connection.</exception>
public async Task<IReadOnlyList<WriteResult>> WriteAsync(
    IReadOnlyList<WriteRequest> writes, CancellationToken cancellationToken)
{
    var connection = RequirePlc();
    var outcomes = new WriteResult[writes.Count];

    await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
    try
    {
        for (var index = 0; index < writes.Count; index++)
        {
            var request = writes[index];

            if (!_tagsByName.TryGetValue(request.FullReference, out var definition))
            {
                outcomes[index] = new WriteResult(StatusBadNodeIdUnknown);
            }
            else if (!definition.Writable)
            {
                outcomes[index] = new WriteResult(StatusBadNotWritable);
            }
            else
            {
                try
                {
                    await WriteOneAsync(connection, definition, request.Value, cancellationToken).ConfigureAwait(false);
                    outcomes[index] = new WriteResult(0u);
                }
                catch (NotSupportedException)
                {
                    outcomes[index] = new WriteResult(StatusBadNotSupported);
                }
                catch (global::S7.Net.PlcException)
                {
                    outcomes[index] = new WriteResult(StatusBadDeviceFailure);
                }
                catch (Exception)
                {
                    outcomes[index] = new WriteResult(StatusBadInternalError);
                }
            }
        }
    }
    finally
    {
        _gate.Release();
    }

    return outcomes;
}
|
||||
|
||||
/// <summary>
/// Converts <paramref name="value"/> to the unsigned wire representation that matches the
/// tag's declared <see cref="S7DataType"/> and writes it to the tag's address.
/// </summary>
/// <exception cref="ArgumentNullException">
/// <paramref name="value"/> is null for a supported data type. <c>Convert.ToXxx(null)</c>
/// would otherwise yield 0 / false and silently zero the PLC location.
/// </exception>
/// <exception cref="NotSupportedException">The declared data type has no write support yet.</exception>
/// <exception cref="InvalidOperationException">The declared data type is not a known enum member.</exception>
private async Task WriteOneAsync(global::S7.Net.Plc plc, S7TagDefinition tag, object? value, CancellationToken ct)
{
    // Invariant culture so string inputs such as "1.5" convert the same way regardless
    // of the host's regional settings.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Per the original author's note, S7.Net's Plc.WriteAsync(string address, object value)
    // expects the boxed value to match the address's size-suffix type: DBX=bool, DBB=byte,
    // DBW=ushort, DBD=uint. Signed and float values are therefore reinterpreted as their
    // unsigned bit pattern before handing off.
    var boxed = tag.DataType switch
    {
        S7DataType.Bool => (object)Convert.ToBoolean(Required(), invariant),
        S7DataType.Byte => (object)Convert.ToByte(Required(), invariant),
        S7DataType.UInt16 => (object)Convert.ToUInt16(Required(), invariant),
        S7DataType.Int16 => (object)unchecked((ushort)Convert.ToInt16(Required(), invariant)),
        S7DataType.UInt32 => (object)Convert.ToUInt32(Required(), invariant),
        S7DataType.Int32 => (object)unchecked((uint)Convert.ToInt32(Required(), invariant)),
        S7DataType.Float32 => (object)BitConverter.SingleToUInt32Bits(Convert.ToSingle(Required(), invariant)),

        S7DataType.Int64 => throw new NotSupportedException("S7 Int64 writes land in a follow-up PR"),
        S7DataType.UInt64 => throw new NotSupportedException("S7 UInt64 writes land in a follow-up PR"),
        S7DataType.Float64 => throw new NotSupportedException("S7 Float64 (LReal) writes land in a follow-up PR"),
        S7DataType.String => throw new NotSupportedException("S7 STRING writes land in a follow-up PR"),
        S7DataType.DateTime => throw new NotSupportedException("S7 DateTime writes land in a follow-up PR"),
        _ => throw new InvalidOperationException($"Unknown S7DataType {tag.DataType}"),
    };

    await plc.WriteAsync(tag.Address, boxed, ct).ConfigureAwait(false);

    // Evaluated only by the supported arms, so unsupported types still report
    // NotSupportedException even when the value is null.
    object Required() =>
        value ?? throw new ArgumentNullException(nameof(value), $"Cannot write a null value to S7 tag '{tag.Name}'");
}
|
||||
|
||||
/// <summary>Returns the open connection, or throws when the driver is not initialized.</summary>
/// <exception cref="InvalidOperationException">No connection is currently held.</exception>
private global::S7.Net.Plc RequirePlc()
{
    var connection = Plc;
    if (connection is null)
    {
        throw new InvalidOperationException("S7Driver not initialized");
    }

    return connection;
}
|
||||
|
||||
// ---- ITagDiscovery ----
|
||||
|
||||
/// <summary>
/// Publishes every configured tag as a variable under a single <c>S7</c> folder.
/// </summary>
/// <param name="builder">Address-space builder that receives the folder and variables.</param>
/// <param name="cancellationToken">Not consulted; discovery runs synchronously off the configured tag list.</param>
/// <returns>An already-completed task.</returns>
/// <exception cref="ArgumentNullException"><paramref name="builder"/> is null.</exception>
public Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
{
    ArgumentNullException.ThrowIfNull(builder);

    var root = builder.Folder("S7", "S7");
    foreach (var tag in _options.Tags)
    {
        // Writable tags are exposed as Operate, everything else as ViewOnly.
        var security = tag.Writable
            ? SecurityClassification.Operate
            : SecurityClassification.ViewOnly;

        var attributes = new DriverAttributeInfo(
            FullName: tag.Name,
            DriverDataType: MapDataType(tag.DataType),
            IsArray: false,
            ArrayDim: null,
            SecurityClass: security,
            IsHistorized: false,
            IsAlarm: false,
            WriteIdempotent: tag.WriteIdempotent);

        root.Variable(tag.Name, tag.Name, attributes);
    }

    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Maps an S7 data type to the driver-neutral <see cref="DriverDataType"/>. Every
/// integer-like type (including Byte and the 64-bit types) and any unrecognised value
/// maps to <c>Int32</c>.
/// </summary>
private static DriverDataType MapDataType(S7DataType t)
{
    switch (t)
    {
        case S7DataType.Bool:
            return DriverDataType.Boolean;
        case S7DataType.Float32:
            return DriverDataType.Float32;
        case S7DataType.Float64:
            return DriverDataType.Float64;
        case S7DataType.String:
            return DriverDataType.String;
        case S7DataType.DateTime:
            return DriverDataType.DateTime;
        default:
            // Byte (no 8-bit in DriverDataType yet), Int16/UInt16/Int32/UInt32, and
            // Int64/UInt64 (lossy for values beyond the Int32 range) all land here.
            return DriverDataType.Int32;
    }
}
|
||||
|
||||
// ---- ISubscribable (polling overlay) ----
|
||||
|
||||
/// <summary>
/// Registers a polling subscription for the given tags and starts its background poll loop.
/// </summary>
/// <param name="fullReferences">Tag names to poll; the list is copied.</param>
/// <param name="publishingInterval">Requested poll interval; values below 100 ms are raised to 100 ms.</param>
/// <param name="cancellationToken">Not consulted; the subscription is cancelled via <see cref="UnsubscribeAsync"/> or shutdown.</param>
/// <returns>The handle identifying the new subscription.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fullReferences"/> is null.</exception>
public Task<ISubscriptionHandle> SubscribeAsync(
    IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
{
    // Validate before allocating an id / CTS so a bad call leaves nothing behind.
    ArgumentNullException.ThrowIfNull(fullReferences);

    // Floor at 100 ms — S7 CPUs scan 2-10 ms but the comms mailbox is processed at most
    // once per scan; sub-100 ms polling just queues wire-side with worse latency.
    var minimumInterval = TimeSpan.FromMilliseconds(100);
    var interval = publishingInterval < minimumInterval ? minimumInterval : publishingInterval;

    var id = Interlocked.Increment(ref _nextSubscriptionId);
    var cts = new CancellationTokenSource();
    var handle = new S7SubscriptionHandle(id);
    var state = new SubscriptionState(handle, [.. fullReferences], interval, cts);
    _subscriptions[id] = state;

    // Capture the token up front: reading cts.Token inside the lambda throws
    // ObjectDisposedException if the subscription is removed before the task starts.
    var pollToken = cts.Token;
    _ = Task.Run(() => PollLoopAsync(state, pollToken), pollToken);

    return Task.FromResult<ISubscriptionHandle>(handle);
}
|
||||
|
||||
/// <summary>
/// Removes the subscription identified by <paramref name="handle"/> and cancels its poll
/// loop. Handles of another type, or ones that are no longer registered, are ignored.
/// </summary>
/// <param name="handle">Handle previously returned by <see cref="SubscribeAsync"/>.</param>
/// <param name="cancellationToken">Not consulted; removal is synchronous.</param>
/// <returns>An already-completed task.</returns>
public Task UnsubscribeAsync(ISubscriptionHandle handle, CancellationToken cancellationToken)
{
    if (handle is S7SubscriptionHandle h && _subscriptions.TryRemove(h.Id, out var state))
    {
        // Dispose even if a cancellation callback throws, so the CTS is never leaked.
        try
        {
            state.Cts.Cancel();
        }
        finally
        {
            state.Cts.Dispose();
        }
    }

    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Poll loop for one subscription: polls immediately with a forced notification, then
/// once per interval until <paramref name="ct"/> is cancelled. Non-cancellation errors
/// from a poll are swallowed so the loop keeps running.
/// </summary>
private async Task PollLoopAsync(SubscriptionState state, CancellationToken ct)
{
    // The very first pass always raises (initial-data push per OPC UA Part 4 convention).
    var forceRaise = true;

    while (true)
    {
        try
        {
            await PollOnceAsync(state, forceRaise, ct).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            return;
        }
        catch
        {
            // Polling error — the loop continues; the health surface reflects it.
        }

        forceRaise = false;

        if (ct.IsCancellationRequested)
        {
            return;
        }

        try
        {
            await Task.Delay(state.Interval, ct).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            return;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Reads all tags of a subscription once and raises <c>OnDataChange</c> for each tag whose
/// value or status code differs from the last reported snapshot, or for every tag when
/// <paramref name="forceRaise"/> is set.
/// </summary>
private async Task PollOnceAsync(SubscriptionState state, bool forceRaise, CancellationToken ct)
{
    var references = state.TagReferences;
    var snapshots = await ReadAsync(references, ct).ConfigureAwait(false);

    for (var index = 0; index < references.Count; index++)
    {
        var reference = references[index];
        var latest = snapshots[index];

        if (!forceRaise)
        {
            // A tag never reported before yields a null previous snapshot here.
            state.LastValues.TryGetValue(reference, out var previous);
            var unchanged = Equals(previous?.Value, latest.Value)
                && previous?.StatusCode == latest.StatusCode;
            if (unchanged)
            {
                continue;
            }
        }

        state.LastValues[reference] = latest;
        OnDataChange?.Invoke(this, new DataChangeEventArgs(state.Handle, reference, latest));
    }
}
|
||||
|
||||
/// <summary>
/// Per-subscription bookkeeping: the handle, the polled tag list, the poll interval, the
/// cancellation source of the poll loop and the last snapshot reported per tag.
/// </summary>
private sealed record SubscriptionState(
    S7SubscriptionHandle Handle,
    IReadOnlyList<string> TagReferences,
    TimeSpan Interval,
    CancellationTokenSource Cts)
{
    /// <summary>Last reported snapshot per tag reference; keys compare case-insensitively.</summary>
    public System.Collections.Concurrent.ConcurrentDictionary<string, DataValueSnapshot> LastValues { get; } =
        new System.Collections.Concurrent.ConcurrentDictionary<string, DataValueSnapshot>(
            StringComparer.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>Subscription handle carrying the numeric subscription id.</summary>
private sealed record S7SubscriptionHandle(long Id) : ISubscriptionHandle
{
    /// <summary>Diagnostic label of the form <c>s7-sub-{Id}</c>.</summary>
    public string DiagnosticId
    {
        get { return $"s7-sub-{Id}"; }
    }
}
|
||||
|
||||
// ---- IHostConnectivityProbe ----
|
||||
|
||||
/// <summary>
|
||||
/// Host identifier surfaced in <see cref="GetHostStatuses"/>. <c>host:port</c> format
|
||||
/// matches the Modbus driver's convention so the Admin UI dashboard renders both
|
||||
/// family's rows uniformly.
|
||||
/// </summary>
|
||||
public string HostName
{
    get { return $"{_options.Host}:{_options.Port}"; }
}
|
||||
|
||||
/// <summary>
/// Returns a single-element list describing the current host state and when it last
/// changed; the state is read under the probe lock.
/// </summary>
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
{
    HostConnectivityStatus current;
    lock (_probeLock)
    {
        current = new HostConnectivityStatus(HostName, _hostState, _hostStateChangedUtc);
    }

    return [current];
}
|
||||
|
||||
/// <summary>
/// Background connectivity probe: on every tick it requests the CPU status through the
/// shared connection (under the gate, bounded by the probe timeout), records the outcome
/// as Running or Stopped, then waits one probe interval. Ends when <paramref name="ct"/>
/// is cancelled.
/// </summary>
private async Task ProbeLoopAsync(CancellationToken ct)
{
    while (!ct.IsCancellationRequested)
    {
        var reachable = false;
        try
        {
            // Per-tick deadline layered on top of the loop's own cancellation.
            using var deadline = CancellationTokenSource.CreateLinkedTokenSource(ct);
            deadline.CancelAfter(_options.Probe.Timeout);

            var connection = Plc ?? throw new InvalidOperationException("Plc dropped during probe");

            await _gate.WaitAsync(deadline.Token).ConfigureAwait(false);
            try
            {
                // The returned status value is discarded; completing the call at all is
                // what counts as "host reachable".
                _ = await connection.ReadStatusAsync(deadline.Token).ConfigureAwait(false);
                reachable = true;
            }
            finally
            {
                _gate.Release();
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            return;
        }
        catch
        {
            // Transport error, probe timeout or any other failure — handled below
            // through the value of `reachable`.
        }

        TransitionTo(reachable ? HostState.Running : HostState.Stopped);

        try
        {
            await Task.Delay(_options.Probe.Interval, ct).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            return;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Records a host-state change under the probe lock and, only when the state actually
/// changed, raises <c>OnHostStatusChanged</c> after the lock has been released.
/// </summary>
private void TransitionTo(HostState newState)
{
    HostState previous;
    lock (_probeLock)
    {
        previous = _hostState;
        if (previous != newState)
        {
            _hostState = newState;
            _hostStateChangedUtc = DateTime.UtcNow;
        }
    }

    if (previous == newState)
    {
        return;
    }

    OnHostStatusChanged?.Invoke(this, new HostStatusChangedEventArgs(HostName, previous, newState));
}
|
||||
|
||||
/// <summary>Synchronous disposal; blocks until <see cref="DisposeAsync"/> has completed.</summary>
public void Dispose()
{
    var pending = DisposeAsync().AsTask();
    pending.GetAwaiter().GetResult();
}
|
||||
|
||||
/// <summary>
/// Shuts the driver down (errors ignored) and disposes the connection gate. Calls after
/// the first one do nothing.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (!_disposed)
    {
        _disposed = true;

        try
        {
            await ShutdownAsync(CancellationToken.None).ConfigureAwait(false);
        }
        catch
        {
            // Disposal is best-effort.
        }

        _gate.Dispose();
    }
}
|
||||
}
|
||||
120
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7DriverOptions.cs
Normal file
120
src/ZB.MOM.WW.OtOpcUa.Driver.S7/S7DriverOptions.cs
Normal file
@@ -0,0 +1,120 @@
|
||||
using S7NetCpuType = global::S7.Net.CpuType;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7;
|
||||
|
||||
/// <summary>
|
||||
/// Siemens S7 native (S7comm / ISO-on-TCP port 102) driver configuration. Bound from the
|
||||
/// driver's <c>DriverConfig</c> JSON at <c>DriverHost.RegisterAsync</c>. Unlike the Modbus
|
||||
/// driver the S7 driver uses the PLC's *native* protocol — port 102 ISO-on-TCP rather
|
||||
/// than Modbus's 502, and S7-specific area codes (DB, M, I, Q) rather than holding-
|
||||
/// register / coil tables.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The driver requires <b>PUT/GET communication enabled</b> in the TIA Portal
|
||||
/// hardware config for S7-1200/1500. The factory default disables PUT/GET access,
|
||||
/// so a driver configured against a freshly-flashed CPU will see a hard error
|
||||
/// (S7.Net surfaces it as <c>Plc.ReadAsync</c> returning <c>ErrorCode.Accessing</c>).
|
||||
/// The driver maps that to <c>BadDeviceFailure</c> and flags it as a
|
||||
/// configuration alert rather than a transient fault — blind Polly retry is wasted
|
||||
/// effort when the PLC will keep refusing every request.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// See <c>docs/v2/driver-specs.md</c> §5 for the full specification.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class S7DriverOptions
|
||||
{
|
||||
/// <summary>PLC IP address or hostname.</summary>
|
||||
public string Host { get; init; } = "127.0.0.1";
|
||||
|
||||
/// <summary>TCP port. ISO-on-TCP is 102 on every S7 model; override only for unusual NAT setups.</summary>
|
||||
public int Port { get; init; } = 102;
|
||||
|
||||
/// <summary>
|
||||
/// CPU family. Determines the ISO-TSAP slot byte that S7.Net uses during connection
|
||||
/// setup — pick the family that matches the target PLC exactly.
|
||||
/// </summary>
|
||||
public S7NetCpuType CpuType { get; init; } = S7NetCpuType.S71500;
|
||||
|
||||
/// <summary>
|
||||
/// Hardware rack number. Almost always 0; relevant only for distributed S7-400 racks
|
||||
/// with multiple CPUs.
|
||||
/// </summary>
|
||||
public short Rack { get; init; } = 0;
|
||||
|
||||
/// <summary>
|
||||
/// CPU slot. Conventions per family: S7-300 = slot 2, S7-400 = slot 2 or 3,
|
||||
/// S7-1200 / S7-1500 = slot 0 (onboard PN). S7.Net uses this to build the remote
|
||||
/// TSAP. Wrong slot → connection refused during handshake.
|
||||
/// </summary>
|
||||
public short Slot { get; init; } = 0;
|
||||
|
||||
/// <summary>Connect + per-operation timeout.</summary>
|
||||
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>Pre-declared tag map. S7 has a symbol-table protocol but S7.Net does not expose it, so the driver operates off a static tag list configured per-site. Address grammar documented in S7AddressParser (PR 63).</summary>
|
||||
public IReadOnlyList<S7TagDefinition> Tags { get; init; } = [];
|
||||
|
||||
/// <summary>
|
||||
/// Background connectivity-probe settings. When enabled, the driver runs a tick loop
|
||||
/// that issues a lightweight CPU-status request (<c>Plc.ReadStatusAsync</c>) every
|
||||
/// <see cref="S7ProbeOptions.Interval"/> and raises <c>OnHostStatusChanged</c> on
|
||||
/// Running ↔ Stopped transitions.
|
||||
/// </summary>
|
||||
public S7ProbeOptions Probe { get; init; } = new();
|
||||
}
|
||||
|
||||
public sealed class S7ProbeOptions
|
||||
{
|
||||
public bool Enabled { get; init; } = true;
|
||||
public TimeSpan Interval { get; init; } = TimeSpan.FromSeconds(5);
|
||||
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <summary>
|
||||
/// Address to probe for liveness. DB1.DBW0 is the convention if the PLC project
|
||||
/// reserves a small fingerprint DB for health checks (per <c>docs/v2/s7.md</c>);
|
||||
/// if not, pick any valid Merker word like <c>MW0</c>.
|
||||
/// </summary>
|
||||
public string ProbeAddress { get; init; } = "MW0";
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// One S7 variable as exposed by the driver. Addresses use S7.Net syntax — see
|
||||
/// <c>S7AddressParser</c> (PR 63) for the grammar.
|
||||
/// </summary>
|
||||
/// <param name="Name">Tag name; OPC UA browse name + driver full reference.</param>
|
||||
/// <param name="Address">S7 address string, e.g. <c>DB1.DBW0</c>, <c>M0.0</c>, <c>I0.0</c>, <c>QD4</c>. Grammar documented in <c>S7AddressParser</c> (PR 63).</param>
|
||||
/// <param name="DataType">Logical (semantic) data type — selects how the raw value at <paramref name="Address"/> is reinterpreted; the wire width comes from the address's size suffix.</param>
|
||||
/// <param name="Writable">When true the driver accepts writes for this tag.</param>
|
||||
/// <param name="StringLength">For <c>DataType = String</c>: S7-string max length. Default 254 (S7 max).</param>
|
||||
/// <param name="WriteIdempotent">
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #44, #45, #143 — flag a tag as safe to replay on
|
||||
/// write timeout / failure. Default <c>false</c>; writes do not auto-retry. Safe candidates
|
||||
/// on S7: DB word/dword set-points holding analog values, configuration DBs where the same
|
||||
/// value can be written again without side-effects. Unsafe: M (merker) bits or Q (output)
|
||||
/// coils that drive edge-triggered routines in the PLC program.
|
||||
/// </param>
|
||||
public sealed record S7TagDefinition(
|
||||
string Name,
|
||||
string Address,
|
||||
S7DataType DataType,
|
||||
bool Writable = true,
|
||||
int StringLength = 254,
|
||||
bool WriteIdempotent = false);
|
||||
|
||||
/// <summary>Logical data types a tag can declare. Ordinals are spelled out explicitly.</summary>
public enum S7DataType
{
    Bool = 0,
    Byte = 1,
    Int16 = 2,
    UInt16 = 3,
    Int32 = 4,
    UInt32 = 5,
    Int64 = 6,
    UInt64 = 7,
    Float32 = 8,
    Float64 = 9,
    String = 10,
    DateTime = 11,
}
|
||||
@@ -0,0 +1,27 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<LangVersion>latest</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<NoWarn>$(NoWarn);CS1591</NoWarn>
|
||||
<RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.S7</RootNamespace>
|
||||
<AssemblyName>ZB.MOM.WW.OtOpcUa.Driver.S7</AssemblyName>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Core.Abstractions\ZB.MOM.WW.OtOpcUa.Core.Abstractions.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="S7netplus" Version="0.20.0"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Driver.S7.Tests"/>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,181 @@
|
||||
using System.Net;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Observability;
|
||||
|
||||
/// <summary>
|
||||
/// Standalone <see cref="HttpListener"/> host for <c>/healthz</c> and <c>/readyz</c>
|
||||
/// separate from the OPC UA binding. Per <c>docs/v2/implementation/phase-6-1-resilience-
|
||||
/// and-observability.md</c> §Stream C.1.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Binds to <c>http://localhost:4841</c> by default — loopback avoids the Windows URL-ACL
|
||||
/// elevation requirement that binding to <c>http://+:4841</c> (wildcard) would impose.
|
||||
/// When a deployment needs remote probing, a reverse proxy or explicit netsh urlacl grant
|
||||
/// is the expected path; documented in <c>docs/v2/Server-Deployment.md</c> in a follow-up.
|
||||
/// </remarks>
|
||||
public sealed class HealthEndpointsHost : IAsyncDisposable
{
    private readonly string _prefix;
    private readonly DriverHost _driverHost;
    private readonly Func<bool> _configDbHealthy;
    private readonly Func<bool> _usingStaleConfig;
    private readonly ILogger<HealthEndpointsHost> _logger;
    private readonly HttpListener _listener = new();
    private readonly DateTime _startedUtc = DateTime.UtcNow;
    private CancellationTokenSource? _cts;
    private Task? _acceptLoop;
    private bool _disposed;

    /// <summary>Creates the host; nothing listens until <see cref="Start"/> is called.</summary>
    /// <param name="driverHost">Source of the registered drivers reported by <c>/readyz</c>.</param>
    /// <param name="logger">Logger for startup and handler/accept-loop failures.</param>
    /// <param name="configDbHealthy">Config-DB reachability check; defaults to always <c>true</c>.</param>
    /// <param name="usingStaleConfig">Stale-config indicator; defaults to always <c>false</c>.</param>
    /// <param name="prefix">HttpListener prefix; a trailing slash is appended when missing.</param>
    /// <exception cref="ArgumentNullException"><paramref name="driverHost"/>, <paramref name="logger"/> or <paramref name="prefix"/> is null.</exception>
    public HealthEndpointsHost(
        DriverHost driverHost,
        ILogger<HealthEndpointsHost> logger,
        Func<bool>? configDbHealthy = null,
        Func<bool>? usingStaleConfig = null,
        string prefix = "http://localhost:4841/")
    {
        ArgumentNullException.ThrowIfNull(driverHost);
        ArgumentNullException.ThrowIfNull(logger);
        ArgumentNullException.ThrowIfNull(prefix);

        _driverHost = driverHost;
        _logger = logger;
        _configDbHealthy = configDbHealthy ?? (() => true);
        _usingStaleConfig = usingStaleConfig ?? (() => false);
        _prefix = prefix.EndsWith('/') ? prefix : prefix + "/";
        _listener.Prefixes.Add(_prefix);
    }

    /// <summary>
    /// Starts the listener and the background accept loop. Calling it again while already
    /// started is a no-op, so no second loop or cancellation source is created.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The host has been disposed.</exception>
    public void Start()
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        if (_acceptLoop is not null)
        {
            return;
        }

        _listener.Start();
        var cts = new CancellationTokenSource();
        _cts = cts;

        // Capture the token so the loop never dereferences the _cts field after disposal.
        var token = cts.Token;
        _acceptLoop = Task.Run(() => AcceptLoopAsync(token));
        _logger.LogInformation("Health endpoints listening on {Prefix}", _prefix);
    }

    /// <summary>Accepts requests until cancelled or the listener stops, handing each one to a worker.</summary>
    private async Task AcceptLoopAsync(CancellationToken ct)
    {
        while (!ct.IsCancellationRequested)
        {
            HttpListenerContext ctx;
            try
            {
                ctx = await _listener.GetContextAsync().ConfigureAwait(false);
            }
            catch (ObjectDisposedException)
            {
                break;
            }
            catch (Exception) when (ct.IsCancellationRequested)
            {
                // Expected while shutting down: the pending accept is aborted by Stop().
                break;
            }
            catch (Exception ex)
            {
                // Anything else means the listener is unusable. Surface it instead of
                // letting the loop task fault unobserved.
                _logger.LogError(ex, "Health endpoint accept loop terminated unexpectedly");
                break;
            }

            // No cancellation token here: once a context is accepted it must always reach
            // HandleAsync so its response gets closed, even during shutdown.
            _ = Task.Run(() => HandleAsync(ctx), CancellationToken.None);
        }
    }

    /// <summary>Routes one request by path and always closes the response.</summary>
    private async Task HandleAsync(HttpListenerContext ctx)
    {
        try
        {
            var path = ctx.Request.Url?.AbsolutePath ?? "/";
            switch (path)
            {
                case "/healthz":
                    await WriteHealthzAsync(ctx).ConfigureAwait(false);
                    break;
                case "/readyz":
                    await WriteReadyzAsync(ctx).ConfigureAwait(false);
                    break;
                default:
                    ctx.Response.StatusCode = 404;
                    break;
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Health endpoint handler failure");
            // Setting the status can itself throw once headers have been sent.
            try { ctx.Response.StatusCode = 500; } catch { /* ignore */ }
        }
        finally
        {
            try { ctx.Response.Close(); } catch { /* ignore */ }
        }
    }

    /// <summary>Writes the liveness response: 200 when the config DB is reachable or stale config is in use, else 503.</summary>
    private async Task WriteHealthzAsync(HttpListenerContext ctx)
    {
        var configHealthy = _configDbHealthy();
        var staleConfig = _usingStaleConfig();

        // /healthz is 200 when process alive + (config DB reachable OR cache-warm).
        // Stale-config still serves 200 so the process isn't flagged dead when the DB
        // blips; the body surfaces the stale flag for operators.
        var healthy = configHealthy || staleConfig;
        ctx.Response.StatusCode = healthy ? 200 : 503;

        var body = JsonSerializer.Serialize(new
        {
            status = healthy ? "healthy" : "unhealthy",
            uptimeSeconds = (int)(DateTime.UtcNow - _startedUtc).TotalSeconds,
            configDbReachable = configHealthy,
            usingStaleConfig = staleConfig,
        });
        await WriteBodyAsync(ctx, body).ConfigureAwait(false);
    }

    /// <summary>Writes the readiness response: the aggregated driver verdict plus per-driver state.</summary>
    private async Task WriteReadyzAsync(HttpListenerContext ctx)
    {
        var snapshots = BuildSnapshots();
        var verdict = DriverHealthReport.Aggregate(snapshots);
        ctx.Response.StatusCode = DriverHealthReport.HttpStatus(verdict);

        var body = JsonSerializer.Serialize(new
        {
            verdict = verdict.ToString(),
            uptimeSeconds = (int)(DateTime.UtcNow - _startedUtc).TotalSeconds,
            drivers = snapshots.Select(d => new
            {
                id = d.DriverInstanceId,
                state = d.State.ToString(),
                detail = d.DetailMessage,
            }).ToArray(),
            degradedDrivers = snapshots
                .Where(d => d.State == DriverState.Degraded || d.State == DriverState.Reconnecting)
                .Select(d => d.DriverInstanceId)
                .ToArray(),
        });
        await WriteBodyAsync(ctx, body).ConfigureAwait(false);
    }

    /// <summary>Collects a health snapshot for every registered driver that can still be resolved.</summary>
    private IReadOnlyList<DriverHealthSnapshot> BuildSnapshots()
    {
        var list = new List<DriverHealthSnapshot>();
        foreach (var id in _driverHost.RegisteredDriverIds)
        {
            var driver = _driverHost.GetDriver(id);
            if (driver is null)
            {
                continue;
            }

            var health = driver.GetHealth();
            list.Add(new DriverHealthSnapshot(driver.DriverInstanceId, health.State, health.LastError));
        }

        return list;
    }

    /// <summary>Writes <paramref name="body"/> as a UTF-8 JSON response with an explicit content length.</summary>
    private static async Task WriteBodyAsync(HttpListenerContext ctx, string body)
    {
        var bytes = Encoding.UTF8.GetBytes(body);
        ctx.Response.ContentType = "application/json; charset=utf-8";
        ctx.Response.ContentLength64 = bytes.LongLength;
        await ctx.Response.OutputStream.WriteAsync(bytes).ConfigureAwait(false);
    }

    /// <summary>
    /// Cancels the accept loop, stops the listener, waits for the loop to finish and
    /// releases the listener and cancellation source. Calls after the first do nothing.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;
        _cts?.Cancel();
        try { _listener.Stop(); } catch { /* ignore */ }
        if (_acceptLoop is not null)
        {
            try { await _acceptLoop.ConfigureAwait(false); } catch { /* ignore */ }
        }

        _listener.Close();
        _cts?.Dispose();
    }
}
|
||||
@@ -3,6 +3,7 @@ using Microsoft.Extensions.Logging;
|
||||
using Opc.Ua;
|
||||
using Opc.Ua.Server;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
using DriverWriteRequest = ZB.MOM.WW.OtOpcUa.Core.Abstractions.WriteRequest;
|
||||
// Core.Abstractions defines a type-named HistoryReadResult (driver-side samples + continuation
|
||||
@@ -33,8 +34,14 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
private readonly IDriver _driver;
|
||||
private readonly IReadable? _readable;
|
||||
private readonly IWritable? _writable;
|
||||
private readonly CapabilityInvoker _invoker;
|
||||
private readonly ILogger<DriverNodeManager> _logger;
|
||||
|
||||
// Per-variable idempotency flag populated during Variable() registration from
|
||||
// DriverAttributeInfo.WriteIdempotent. Drives ExecuteWriteAsync's retry gating in
|
||||
// OnWriteValue; absent entries default to false (decisions #44, #45, #143).
|
||||
private readonly Dictionary<string, bool> _writeIdempotentByFullRef = new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
/// <summary>The driver whose address space this node manager exposes.</summary>
|
||||
public IDriver Driver => _driver;
|
||||
|
||||
@@ -53,12 +60,13 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
private FolderState _currentFolder = null!;
|
||||
|
||||
public DriverNodeManager(IServerInternal server, ApplicationConfiguration configuration,
|
||||
IDriver driver, ILogger<DriverNodeManager> logger)
|
||||
IDriver driver, CapabilityInvoker invoker, ILogger<DriverNodeManager> logger)
|
||||
: base(server, configuration, namespaceUris: $"urn:OtOpcUa:{driver.DriverInstanceId}")
|
||||
{
|
||||
_driver = driver;
|
||||
_readable = driver as IReadable;
|
||||
_writable = driver as IWritable;
|
||||
_invoker = invoker;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
@@ -148,6 +156,7 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
AddPredefinedNode(SystemContext, v);
|
||||
_variablesByFullRef[attributeInfo.FullName] = v;
|
||||
_securityByFullRef[attributeInfo.FullName] = attributeInfo.SecurityClass;
|
||||
_writeIdempotentByFullRef[attributeInfo.FullName] = attributeInfo.WriteIdempotent;
|
||||
|
||||
v.OnReadValue = OnReadValue;
|
||||
v.OnWriteValue = OnWriteValue;
|
||||
@@ -188,7 +197,11 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
try
|
||||
{
|
||||
var fullRef = node.NodeId.Identifier as string ?? "";
|
||||
var result = _readable.ReadAsync([fullRef], CancellationToken.None).GetAwaiter().GetResult();
|
||||
var result = _invoker.ExecuteAsync(
|
||||
DriverCapability.Read,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => (IReadOnlyList<DataValueSnapshot>)await _readable.ReadAsync([fullRef], ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
if (result.Count == 0)
|
||||
{
|
||||
statusCode = StatusCodes.BadNoData;
|
||||
@@ -381,9 +394,15 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
|
||||
try
|
||||
{
|
||||
var results = _writable.WriteAsync(
|
||||
[new DriverWriteRequest(fullRef!, value)],
|
||||
CancellationToken.None).GetAwaiter().GetResult();
|
||||
var isIdempotent = _writeIdempotentByFullRef.GetValueOrDefault(fullRef!, false);
|
||||
var capturedValue = value;
|
||||
var results = _invoker.ExecuteWriteAsync(
|
||||
_driver.DriverInstanceId,
|
||||
isIdempotent,
|
||||
async ct => (IReadOnlyList<WriteResult>)await _writable.WriteAsync(
|
||||
[new DriverWriteRequest(fullRef!, capturedValue)],
|
||||
ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
if (results.Count > 0 && results[0].StatusCode != 0)
|
||||
{
|
||||
statusCode = results[0].StatusCode;
|
||||
@@ -465,12 +484,16 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = History.ReadRawAsync(
|
||||
fullRef,
|
||||
details.StartTime,
|
||||
details.EndTime,
|
||||
details.NumValuesPerNode,
|
||||
CancellationToken.None).GetAwaiter().GetResult();
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadRawAsync(
|
||||
fullRef,
|
||||
details.StartTime,
|
||||
details.EndTime,
|
||||
details.NumValuesPerNode,
|
||||
ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryData(driverResult.Samples), driverResult.ContinuationPoint);
|
||||
@@ -525,13 +548,17 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = History.ReadProcessedAsync(
|
||||
fullRef,
|
||||
details.StartTime,
|
||||
details.EndTime,
|
||||
interval,
|
||||
aggregate.Value,
|
||||
CancellationToken.None).GetAwaiter().GetResult();
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadProcessedAsync(
|
||||
fullRef,
|
||||
details.StartTime,
|
||||
details.EndTime,
|
||||
interval,
|
||||
aggregate.Value,
|
||||
ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryData(driverResult.Samples), driverResult.ContinuationPoint);
|
||||
@@ -578,8 +605,11 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = History.ReadAtTimeAsync(
|
||||
fullRef, requestedTimes, CancellationToken.None).GetAwaiter().GetResult();
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadAtTimeAsync(fullRef, requestedTimes, ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryData(driverResult.Samples), driverResult.ContinuationPoint);
|
||||
@@ -632,12 +662,16 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
|
||||
|
||||
try
|
||||
{
|
||||
var driverResult = History.ReadEventsAsync(
|
||||
sourceName: fullRef,
|
||||
startUtc: details.StartTime,
|
||||
endUtc: details.EndTime,
|
||||
maxEvents: maxEvents,
|
||||
cancellationToken: CancellationToken.None).GetAwaiter().GetResult();
|
||||
var driverResult = _invoker.ExecuteAsync(
|
||||
DriverCapability.HistoryRead,
|
||||
_driver.DriverInstanceId,
|
||||
async ct => await History.ReadEventsAsync(
|
||||
sourceName: fullRef,
|
||||
startUtc: details.StartTime,
|
||||
endUtc: details.EndTime,
|
||||
maxEvents: maxEvents,
|
||||
cancellationToken: ct).ConfigureAwait(false),
|
||||
CancellationToken.None).AsTask().GetAwaiter().GetResult();
|
||||
|
||||
WriteResult(results, errors, i, StatusCodes.Good,
|
||||
BuildHistoryEvent(driverResult.Events), driverResult.ContinuationPoint);
|
||||
|
||||
@@ -3,6 +3,8 @@ using Opc.Ua;
|
||||
using Opc.Ua.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.OpcUa;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Observability;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
|
||||
@@ -20,18 +22,22 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
private readonly OpcUaServerOptions _options;
|
||||
private readonly DriverHost _driverHost;
|
||||
private readonly IUserAuthenticator _authenticator;
|
||||
private readonly DriverResiliencePipelineBuilder _pipelineBuilder;
|
||||
private readonly ILoggerFactory _loggerFactory;
|
||||
private readonly ILogger<OpcUaApplicationHost> _logger;
|
||||
private ApplicationInstance? _application;
|
||||
private OtOpcUaServer? _server;
|
||||
private HealthEndpointsHost? _healthHost;
|
||||
private bool _disposed;
|
||||
|
||||
public OpcUaApplicationHost(OpcUaServerOptions options, DriverHost driverHost,
|
||||
IUserAuthenticator authenticator, ILoggerFactory loggerFactory, ILogger<OpcUaApplicationHost> logger)
|
||||
IUserAuthenticator authenticator, ILoggerFactory loggerFactory, ILogger<OpcUaApplicationHost> logger,
|
||||
DriverResiliencePipelineBuilder? pipelineBuilder = null)
|
||||
{
|
||||
_options = options;
|
||||
_driverHost = driverHost;
|
||||
_authenticator = authenticator;
|
||||
_pipelineBuilder = pipelineBuilder ?? new DriverResiliencePipelineBuilder();
|
||||
_loggerFactory = loggerFactory;
|
||||
_logger = logger;
|
||||
}
|
||||
@@ -58,12 +64,23 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
throw new InvalidOperationException(
|
||||
$"OPC UA application certificate could not be validated or created in {_options.PkiStoreRoot}");
|
||||
|
||||
_server = new OtOpcUaServer(_driverHost, _authenticator, _loggerFactory);
|
||||
_server = new OtOpcUaServer(_driverHost, _authenticator, _pipelineBuilder, _loggerFactory);
|
||||
await _application.Start(_server).ConfigureAwait(false);
|
||||
|
||||
_logger.LogInformation("OPC UA server started — endpoint={Endpoint} driverCount={Count}",
|
||||
_options.EndpointUrl, _server.DriverNodeManagers.Count);
|
||||
|
||||
// Phase 6.1 Stream C: health endpoints on :4841 (loopback by default — see
|
||||
// HealthEndpointsHost remarks for the Windows URL-ACL tradeoff).
|
||||
if (_options.HealthEndpointsEnabled)
|
||||
{
|
||||
_healthHost = new HealthEndpointsHost(
|
||||
_driverHost,
|
||||
_loggerFactory.CreateLogger<HealthEndpointsHost>(),
|
||||
prefix: _options.HealthEndpointsPrefix);
|
||||
_healthHost.Start();
|
||||
}
|
||||
|
||||
// Drive each driver's discovery through its node manager. The node manager IS the
|
||||
// IAddressSpaceBuilder; GenericDriverNodeManager captures alarm-condition sinks into
|
||||
// its internal map and wires OnAlarmEvent → sink routing.
|
||||
@@ -217,6 +234,12 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
{
|
||||
_logger.LogWarning(ex, "OPC UA server stop threw during dispose");
|
||||
}
|
||||
|
||||
if (_healthHost is not null)
|
||||
{
|
||||
try { await _healthHost.DisposeAsync().ConfigureAwait(false); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "Health endpoints host dispose threw"); }
|
||||
}
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,6 +58,20 @@ public sealed class OpcUaServerOptions
|
||||
/// </summary>
|
||||
public bool AutoAcceptUntrustedClientCertificates { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to start the Phase 6.1 Stream C <c>/healthz</c> + <c>/readyz</c> HTTP listener.
|
||||
/// Defaults to <c>true</c>; set false in embedded deployments that don't need HTTP
|
||||
/// (e.g. tests that only exercise the OPC UA surface).
|
||||
/// </summary>
|
||||
public bool HealthEndpointsEnabled { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// URL prefix the health endpoints bind to. Default <c>http://localhost:4841/</c> — loopback
|
||||
/// avoids Windows URL-ACL elevation. Production deployments that need remote probing should
|
||||
/// either reverse-proxy or use <c>http://+:4841/</c> with netsh urlacl granted.
|
||||
/// </summary>
|
||||
public string HealthEndpointsPrefix { get; init; } = "http://localhost:4841/";
|
||||
|
||||
/// <summary>
|
||||
/// Security profile advertised on the endpoint. Default <see cref="OpcUaSecurityProfile.None"/>
|
||||
/// preserves the PR 17 endpoint shape; set to <see cref="OpcUaSecurityProfile.Basic256Sha256SignAndEncrypt"/>
|
||||
|
||||
@@ -5,6 +5,7 @@ using Opc.Ua.Server;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.OpcUa;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
|
||||
@@ -19,13 +20,19 @@ public sealed class OtOpcUaServer : StandardServer
|
||||
{
|
||||
private readonly DriverHost _driverHost;
|
||||
private readonly IUserAuthenticator _authenticator;
|
||||
private readonly DriverResiliencePipelineBuilder _pipelineBuilder;
|
||||
private readonly ILoggerFactory _loggerFactory;
|
||||
private readonly List<DriverNodeManager> _driverNodeManagers = new();
|
||||
|
||||
public OtOpcUaServer(DriverHost driverHost, IUserAuthenticator authenticator, ILoggerFactory loggerFactory)
|
||||
public OtOpcUaServer(
|
||||
DriverHost driverHost,
|
||||
IUserAuthenticator authenticator,
|
||||
DriverResiliencePipelineBuilder pipelineBuilder,
|
||||
ILoggerFactory loggerFactory)
|
||||
{
|
||||
_driverHost = driverHost;
|
||||
_authenticator = authenticator;
|
||||
_pipelineBuilder = pipelineBuilder;
|
||||
_loggerFactory = loggerFactory;
|
||||
}
|
||||
|
||||
@@ -46,7 +53,12 @@ public sealed class OtOpcUaServer : StandardServer
|
||||
if (driver is null) continue;
|
||||
|
||||
var logger = _loggerFactory.CreateLogger<DriverNodeManager>();
|
||||
var manager = new DriverNodeManager(server, configuration, driver, logger);
|
||||
// Per-driver resilience options: default Tier A pending Stream B.1 which wires
|
||||
// per-type tiers into DriverTypeRegistry. Read ResilienceConfig JSON from the
|
||||
// DriverInstance row in a follow-up PR; for now every driver gets Tier A defaults.
|
||||
var options = new DriverResilienceOptions { Tier = DriverTier.A };
|
||||
var invoker = new CapabilityInvoker(_pipelineBuilder, driver.DriverInstanceId, () => options, driver.DriverType);
|
||||
var manager = new DriverNodeManager(server, configuration, driver, invoker, logger);
|
||||
_driverNodeManagers.Add(manager);
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Serilog;
|
||||
using Serilog.Formatting.Compact;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
@@ -13,11 +14,25 @@ using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
var builder = Host.CreateApplicationBuilder(args);
|
||||
|
||||
Log.Logger = new LoggerConfiguration()
|
||||
// Per Phase 6.1 Stream C.3: SIEMs (Splunk, Datadog) ingest the JSON file without a
|
||||
// regex parser. Plain-text rolling file stays on by default for human readability;
|
||||
// JSON file is opt-in via appsetting `Serilog:WriteJson = true`.
|
||||
var writeJson = builder.Configuration.GetValue<bool>("Serilog:WriteJson");
|
||||
var loggerBuilder = new LoggerConfiguration()
|
||||
.ReadFrom.Configuration(builder.Configuration)
|
||||
.Enrich.FromLogContext()
|
||||
.WriteTo.Console()
|
||||
.WriteTo.File("logs/otopcua-.log", rollingInterval: RollingInterval.Day)
|
||||
.CreateLogger();
|
||||
.WriteTo.File("logs/otopcua-.log", rollingInterval: RollingInterval.Day);
|
||||
|
||||
if (writeJson)
|
||||
{
|
||||
loggerBuilder = loggerBuilder.WriteTo.File(
|
||||
new CompactJsonFormatter(),
|
||||
"logs/otopcua-.json.log",
|
||||
rollingInterval: RollingInterval.Day);
|
||||
}
|
||||
|
||||
Log.Logger = loggerBuilder.CreateLogger();
|
||||
|
||||
builder.Services.AddSerilog();
|
||||
builder.Services.AddWindowsService(o => o.ServiceName = "OtOpcUa");
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
<PackageReference Include="Serilog.Settings.Configuration" Version="9.0.0"/>
|
||||
<PackageReference Include="Serilog.Sinks.Console" Version="6.0.0"/>
|
||||
<PackageReference Include="Serilog.Sinks.File" Version="7.0.0"/>
|
||||
<PackageReference Include="Serilog.Formatting.Compact" Version="3.0.0"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Server" Version="1.5.374.126"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Configuration" Version="1.5.374.126"/>
|
||||
<PackageReference Include="Novell.Directory.Ldap.NETStandard" Version="3.6.0"/>
|
||||
|
||||
@@ -7,11 +7,13 @@ public sealed class DriverTypeRegistryTests
|
||||
{
|
||||
private static DriverTypeMetadata SampleMetadata(
|
||||
string typeName = "Modbus",
|
||||
NamespaceKindCompatibility allowed = NamespaceKindCompatibility.Equipment) =>
|
||||
NamespaceKindCompatibility allowed = NamespaceKindCompatibility.Equipment,
|
||||
DriverTier tier = DriverTier.B) =>
|
||||
new(typeName, allowed,
|
||||
DriverConfigJsonSchema: "{\"type\": \"object\"}",
|
||||
DeviceConfigJsonSchema: "{\"type\": \"object\"}",
|
||||
TagConfigJsonSchema: "{\"type\": \"object\"}");
|
||||
TagConfigJsonSchema: "{\"type\": \"object\"}",
|
||||
Tier: tier);
|
||||
|
||||
[Fact]
|
||||
public void Register_ThenGet_RoundTrips()
|
||||
@@ -24,6 +26,20 @@ public sealed class DriverTypeRegistryTests
|
||||
registry.Get("Modbus").ShouldBe(metadata);
|
||||
}
|
||||
|
||||
/// <summary>
/// Registers metadata carrying the given tier and checks that a lookup by type name
/// returns that same tier.
/// </summary>
[Theory]
[InlineData(DriverTier.A)]
[InlineData(DriverTier.B)]
[InlineData(DriverTier.C)]
public void Register_Requires_NonNullTier(DriverTier tier)
{
    var typeName = $"Driver-{tier}";
    var sut = new DriverTypeRegistry();
    var registered = SampleMetadata(typeName: typeName, tier: tier);

    sut.Register(registered);

    var roundTripped = sut.Get(registered.TypeName);
    roundTripped.Tier.ShouldBe(tier);
}
|
||||
|
||||
[Fact]
|
||||
public void Get_IsCaseInsensitive()
|
||||
{
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
using Serilog;
|
||||
using Serilog.Core;
|
||||
using Serilog.Events;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Observability;
|
||||
|
||||
/// <summary>
/// Checks that log events written while a <see cref="CapabilityInvoker"/> call site is running
/// carry the driver/capability properties, and that events written after the call do not.
/// </summary>
[Trait("Category", "Integration")]
public sealed class CapabilityInvokerEnrichmentTests
{
    /// <summary>A single event logged inside the call site exposes all four structured properties.</summary>
    [Fact]
    public async Task InvokerExecute_LogsInsideCallSite_CarryStructuredProperties()
    {
        var events = new InMemorySink();
        var log = CreateLogger(events);
        var sut = new CapabilityInvoker(
            new DriverResiliencePipelineBuilder(),
            driverInstanceId: "drv-live",
            optionsAccessor: () => new DriverResilienceOptions { Tier = DriverTier.A },
            driverType: "Modbus");

        await sut.ExecuteAsync(
            DriverCapability.Read,
            "plc-1",
            _ =>
            {
                log.Information("inside call site");
                return ValueTask.FromResult(42);
            },
            CancellationToken.None);

        var props = events.Events.ShouldHaveSingleItem().Properties;
        props["DriverInstanceId"].ToString().ShouldBe("\"drv-live\"");
        props["DriverType"].ToString().ShouldBe("\"Modbus\"");
        props["CapabilityName"].ToString().ShouldBe("\"Read\"");
        props.ShouldContainKey("CorrelationId");
    }

    /// <summary>An event logged after the invoker call has returned carries no driver instance id.</summary>
    [Fact]
    public async Task InvokerExecute_DoesNotLeak_ContextOutsideCallSite()
    {
        var events = new InMemorySink();
        var log = CreateLogger(events);
        var sut = new CapabilityInvoker(
            new DriverResiliencePipelineBuilder(),
            driverInstanceId: "drv-a",
            optionsAccessor: () => new DriverResilienceOptions { Tier = DriverTier.A });

        await sut.ExecuteAsync(DriverCapability.Read, "host", _ => ValueTask.FromResult(1), CancellationToken.None);
        log.Information("outside");

        var evt = events.Events.ShouldHaveSingleItem();
        evt.Properties.ContainsKey("DriverInstanceId").ShouldBeFalse();
    }

    /// <summary>Builds a logger enriched from the log context that writes only to <paramref name="sink"/>.</summary>
    private static Logger CreateLogger(ILogEventSink sink) =>
        new LoggerConfiguration()
            .Enrich.FromLogContext()
            .WriteTo.Sink(sink)
            .CreateLogger();

    /// <summary>Sink that keeps every emitted event in a list for later assertions.</summary>
    private sealed class InMemorySink : ILogEventSink
    {
        public List<LogEvent> Events { get; } = [];

        public void Emit(LogEvent logEvent)
        {
            Events.Add(logEvent);
        }
    }
}
|
||||
@@ -0,0 +1,70 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Observability;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="DriverHealthReport"/>: how per-driver states roll up into a
/// <see cref="ReadinessVerdict"/>, and which HTTP status each verdict maps to.
/// </summary>
[Trait("Category", "Unit")]
public sealed class DriverHealthReportTests
{
    /// <summary>With no drivers at all the aggregate verdict is Healthy.</summary>
    [Fact]
    public void EmptyFleet_IsHealthy() =>
        DriverHealthReport.Aggregate([]).ShouldBe(ReadinessVerdict.Healthy);

    /// <summary>Every driver Healthy yields Healthy.</summary>
    [Fact]
    public void AllHealthy_Fleet_IsHealthy() =>
        DriverHealthReport.Aggregate([
            new DriverHealthSnapshot("a", DriverState.Healthy),
            new DriverHealthSnapshot("b", DriverState.Healthy),
        ]).ShouldBe(ReadinessVerdict.Healthy);

    /// <summary>One Faulted driver wins over Healthy, Degraded and Initializing peers.</summary>
    [Fact]
    public void AnyFaulted_TrumpsEverything() =>
        DriverHealthReport.Aggregate([
            new DriverHealthSnapshot("a", DriverState.Healthy),
            new DriverHealthSnapshot("b", DriverState.Degraded),
            new DriverHealthSnapshot("c", DriverState.Faulted),
            new DriverHealthSnapshot("d", DriverState.Initializing),
        ]).ShouldBe(ReadinessVerdict.Faulted);

    /// <summary>An Unknown or Initializing driver beside a Healthy one yields NotReady.</summary>
    [Theory]
    [InlineData(DriverState.Unknown)]
    [InlineData(DriverState.Initializing)]
    public void Any_NotReady_WithoutFaulted_IsNotReady(DriverState initializingState) =>
        DriverHealthReport.Aggregate([
            new DriverHealthSnapshot("a", DriverState.Healthy),
            new DriverHealthSnapshot("b", initializingState),
        ]).ShouldBe(ReadinessVerdict.NotReady);

    /// <summary>A Degraded driver beside a Healthy one yields Degraded.</summary>
    [Fact]
    public void Any_Degraded_WithoutFaultedOrNotReady_IsDegraded() =>
        DriverHealthReport.Aggregate([
            new DriverHealthSnapshot("a", DriverState.Healthy),
            new DriverHealthSnapshot("b", DriverState.Degraded),
        ]).ShouldBe(ReadinessVerdict.Degraded);

    /// <summary>Healthy and Degraded map to 200; NotReady and Faulted map to 503.</summary>
    [Theory]
    [InlineData(ReadinessVerdict.Healthy, 200)]
    [InlineData(ReadinessVerdict.Degraded, 200)]
    [InlineData(ReadinessVerdict.NotReady, 503)]
    [InlineData(ReadinessVerdict.Faulted, 503)]
    public void HttpStatus_MatchesStateMatrix(ReadinessVerdict verdict, int expected)
    {
        var actual = DriverHealthReport.HttpStatus(verdict);

        actual.ShouldBe(expected);
    }
}
|
||||
@@ -0,0 +1,78 @@
|
||||
using Serilog;
|
||||
using Serilog.Core;
|
||||
using Serilog.Events;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Observability;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="LogContextEnricher"/>: the properties a pushed scope attaches to log
/// events, their removal on dispose, correlation-id format, and argument validation.
/// </summary>
[Trait("Category", "Unit")]
public sealed class LogContextEnricherTests
{
    /// <summary>An event logged inside a pushed scope carries all four properties with the pushed values.</summary>
    [Fact]
    public void Scope_Attaches_AllFour_Properties()
    {
        var events = new InMemorySink();
        var log = CreateLogger(events);

        using (var scope = LogContextEnricher.Push("drv-1", "Modbus", DriverCapability.Read, "abc123"))
        {
            log.Information("test message");
        }

        var props = events.Events.ShouldHaveSingleItem().Properties;
        props["DriverInstanceId"].ToString().ShouldBe("\"drv-1\"");
        props["DriverType"].ToString().ShouldBe("\"Modbus\"");
        props["CapabilityName"].ToString().ShouldBe("\"Read\"");
        props["CorrelationId"].ToString().ShouldBe("\"abc123\"");
    }

    /// <summary>After the scope is disposed, later events no longer carry the driver instance id.</summary>
    [Fact]
    public void Scope_Dispose_Pops_Properties()
    {
        var events = new InMemorySink();
        var log = CreateLogger(events);

        using (var scope = LogContextEnricher.Push("drv-1", "Modbus", DriverCapability.Read, "abc123"))
        {
            log.Information("inside");
        }

        log.Information("outside");

        events.Events.Count.ShouldBe(2);
        var insideEvent = events.Events[0];
        var outsideEvent = events.Events[1];
        insideEvent.Properties.ContainsKey("DriverInstanceId").ShouldBeTrue();
        outsideEvent.Properties.ContainsKey("DriverInstanceId").ShouldBeFalse();
    }

    /// <summary>A generated correlation id is exactly twelve lowercase hexadecimal characters.</summary>
    [Fact]
    public void NewCorrelationId_Returns_12_Hex_Chars()
    {
        var correlationId = LogContextEnricher.NewCorrelationId();

        correlationId.Length.ShouldBe(12);
        correlationId.ShouldMatch("^[0-9a-f]{12}$");
    }

    /// <summary>A null, empty or blank driver instance id makes Push throw <see cref="ArgumentException"/>.</summary>
    [Theory]
    [InlineData(null)]
    [InlineData("")]
    [InlineData(" ")]
    public void Push_Throws_OnMissingDriverInstanceId(string? id)
    {
        Should.Throw<ArgumentException>(() =>
            LogContextEnricher.Push(id!, "Modbus", DriverCapability.Read, "c"));
    }

    /// <summary>Builds a logger enriched from the log context that writes only to <paramref name="sink"/>.</summary>
    private static Logger CreateLogger(ILogEventSink sink) =>
        new LoggerConfiguration()
            .Enrich.FromLogContext()
            .WriteTo.Sink(sink)
            .CreateLogger();

    /// <summary>Sink that keeps every emitted event in a list for later assertions.</summary>
    private sealed class InMemorySink : ILogEventSink
    {
        public List<LogEvent> Events { get; } = [];

        public void Emit(LogEvent logEvent)
        {
            Events.Add(logEvent);
        }
    }
}
|
||||
@@ -0,0 +1,151 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="CapabilityInvoker"/>: value pass-through, retry behaviour for
/// reads and for idempotent / non-idempotent writes, and per-host pipeline caching.
/// </summary>
[Trait("Category", "Unit")]
public sealed class CapabilityInvokerTests
{
    /// <summary>Creates an invoker for driver id "drv-test" that always resolves to <paramref name="options"/>.</summary>
    private static CapabilityInvoker MakeInvoker(
        DriverResiliencePipelineBuilder builder,
        DriverResilienceOptions options) =>
        new(builder, "drv-test", () => options);

    /// <summary>Tier A options whose Write policy is overridden to allow three retries.</summary>
    private static DriverResilienceOptions TierAWithWriteRetries() => new()
    {
        Tier = DriverTier.A,
        CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
        {
            [DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
        },
    };

    /// <summary>The call site's return value is handed back to the caller unchanged.</summary>
    [Fact]
    public async Task Read_ReturnsValue_FromCallSite()
    {
        var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), new DriverResilienceOptions { Tier = DriverTier.A });

        var result = await invoker.ExecuteAsync(
            DriverCapability.Read,
            "host-1",
            _ => ValueTask.FromResult(42),
            CancellationToken.None);

        result.ShouldBe(42);
    }

    /// <summary>A read that throws once is re-run and succeeds on the second attempt.</summary>
    [Fact]
    public async Task Read_Retries_OnTransientFailure()
    {
        var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), new DriverResilienceOptions { Tier = DriverTier.A });
        var attempts = 0;

        var result = await invoker.ExecuteAsync(
            DriverCapability.Read,
            "host-1",
            async _ =>
            {
                attempts++;
                if (attempts < 2) throw new InvalidOperationException("transient");
                await Task.Yield();
                return "ok";
            },
            CancellationToken.None);

        result.ShouldBe("ok");
        attempts.ShouldBe(2);
    }

    /// <summary>A non-idempotent write runs exactly once even when the Write policy allows retries.</summary>
    [Fact]
    public async Task Write_NonIdempotent_DoesNotRetry_EvenWhenPolicyHasRetries()
    {
        var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), TierAWithWriteRetries());
        var attempts = 0;

        // The call site always throws, so the result type cannot be inferred from a return
        // statement; it is given explicitly instead of adding an unreachable `return`.
        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await invoker.ExecuteWriteAsync<int>(
                "host-1",
                isIdempotent: false,
                async _ =>
                {
                    attempts++;
                    await Task.Yield();
                    throw new InvalidOperationException("boom");
                },
                CancellationToken.None));

        attempts.ShouldBe(1, "non-idempotent write must never replay");
    }

    /// <summary>An idempotent write that throws once is retried when the Write policy allows retries.</summary>
    [Fact]
    public async Task Write_Idempotent_Retries_WhenPolicyHasRetries()
    {
        var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), TierAWithWriteRetries());
        var attempts = 0;

        var result = await invoker.ExecuteWriteAsync(
            "host-1",
            isIdempotent: true,
            async _ =>
            {
                attempts++;
                if (attempts < 2) throw new InvalidOperationException("transient");
                await Task.Yield();
                return "ok";
            },
            CancellationToken.None);

        result.ShouldBe("ok");
        attempts.ShouldBe(2);
    }

    /// <summary>With the Tier A default Write policy, even an idempotent write is attempted only once.</summary>
    [Fact]
    public async Task Write_Default_DoesNotRetry_WhenPolicyHasZeroRetries()
    {
        // Tier A Write default is RetryCount=0. Even isIdempotent=true shouldn't retry
        // because the policy says not to.
        var invoker = MakeInvoker(new DriverResiliencePipelineBuilder(), new DriverResilienceOptions { Tier = DriverTier.A });
        var attempts = 0;

        // Explicit result type: the call site only throws, so there is nothing to infer it from.
        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await invoker.ExecuteWriteAsync<int>(
                "host-1",
                isIdempotent: true,
                async _ =>
                {
                    attempts++;
                    await Task.Yield();
                    throw new InvalidOperationException("boom");
                },
                CancellationToken.None));

        attempts.ShouldBe(1, "tier-A default for Write is RetryCount=0");
    }

    /// <summary>Calls against two different host names leave two cached pipelines in the builder.</summary>
    [Fact]
    public async Task Execute_HonorsDifferentHosts_Independently()
    {
        var builder = new DriverResiliencePipelineBuilder();
        var invoker = MakeInvoker(builder, new DriverResilienceOptions { Tier = DriverTier.A });

        await invoker.ExecuteAsync(DriverCapability.Read, "host-a", _ => ValueTask.FromResult(1), CancellationToken.None);
        await invoker.ExecuteAsync(DriverCapability.Read, "host-b", _ => ValueTask.FromResult(2), CancellationToken.None);

        builder.CachedPipelineCount.ShouldBe(2);
    }
}
|
||||
@@ -0,0 +1,102 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class DriverResilienceOptionsTests
|
||||
{
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void TierDefaults_Cover_EveryCapability(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
|
||||
foreach (var capability in Enum.GetValues<DriverCapability>())
|
||||
defaults.ShouldContainKey(capability);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void Write_NeverRetries_ByDefault(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[DriverCapability.Write].RetryCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
[InlineData(DriverTier.C)]
|
||||
public void AlarmAcknowledge_NeverRetries_ByDefault(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[DriverCapability.AlarmAcknowledge].RetryCount.ShouldBe(0);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A, DriverCapability.Read)]
|
||||
[InlineData(DriverTier.A, DriverCapability.HistoryRead)]
|
||||
[InlineData(DriverTier.B, DriverCapability.Discover)]
|
||||
[InlineData(DriverTier.B, DriverCapability.Probe)]
|
||||
[InlineData(DriverTier.C, DriverCapability.AlarmSubscribe)]
|
||||
public void IdempotentCapabilities_Retry_ByDefault(DriverTier tier, DriverCapability capability)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
defaults[capability].RetryCount.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TierC_DisablesCircuitBreaker_DeferringToSupervisor()
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(DriverTier.C);
|
||||
|
||||
foreach (var (_, policy) in defaults)
|
||||
policy.BreakerFailureThreshold.ShouldBe(0, "Tier C breaker is handled by the Proxy supervisor (decision #68)");
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(DriverTier.A)]
|
||||
[InlineData(DriverTier.B)]
|
||||
public void TierAAndB_EnableCircuitBreaker(DriverTier tier)
|
||||
{
|
||||
var defaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
|
||||
foreach (var (_, policy) in defaults)
|
||||
policy.BreakerFailureThreshold.ShouldBeGreaterThan(0);
|
||||
}
|
||||
|
||||
/// <summary>
/// With no per-capability override configured, <c>Resolve</c> must return the same policy as the
/// tier defaults for that capability.
/// </summary>
[Fact]
public void Resolve_Uses_TierDefaults_When_NoOverride()
{
    var sut = new DriverResilienceOptions { Tier = DriverTier.A };

    var actual = sut.Resolve(DriverCapability.Read);

    var expected = DriverResilienceOptions.GetTierDefaults(DriverTier.A)[DriverCapability.Read];
    actual.ShouldBe(expected);
}
|
||||
|
||||
/// <summary>
/// A configured override is returned for its own capability (Read), while a capability without
/// an override (Write) still resolves to the Tier A default.
/// </summary>
[Fact]
public void Resolve_Uses_Override_When_Configured()
{
    var readOverride = new CapabilityPolicy(TimeoutSeconds: 42, RetryCount: 7, BreakerFailureThreshold: 9);
    var overrides = new Dictionary<DriverCapability, CapabilityPolicy>
    {
        [DriverCapability.Read] = readOverride,
    };
    var sut = new DriverResilienceOptions
    {
        Tier = DriverTier.A,
        CapabilityPolicies = overrides,
    };

    // Overridden capability → the custom policy.
    sut.Resolve(DriverCapability.Read).ShouldBe(readOverride);

    // Non-overridden capability → falls back to the tier default.
    var tierAWriteDefault = DriverResilienceOptions.GetTierDefaults(DriverTier.A)[DriverCapability.Write];
    sut.Resolve(DriverCapability.Write).ShouldBe(tierAWriteDefault);
}
|
||||
}
|
||||
@@ -0,0 +1,222 @@
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Timeout;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="DriverResiliencePipelineBuilder"/>. They cover retry behaviour for
/// Read / Write / AlarmAcknowledge under Tier A options, caching and isolation of pipelines per
/// (driver, host, capability), circuit-breaker opening, timeout, invalidation and cancellation.
/// </summary>
[Trait("Category", "Unit")]
public sealed class DriverResiliencePipelineBuilderTests
{
    private const string DriverId = "drv-test";
    private const string SingleHost = "host-1";

    private static readonly DriverResilienceOptions TierAOptions = new() { Tier = DriverTier.A };

    /// <summary>A Read callback that throws twice and then succeeds is invoked three times in total.</summary>
    [Fact]
    public async Task Read_Retries_Transient_Failures()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var readPipeline = sut.GetOrCreate(DriverId, SingleHost, DriverCapability.Read, TierAOptions);
        var callCount = 0;

        await readPipeline.ExecuteAsync(async _ =>
        {
            callCount++;
            if (callCount < 3)
            {
                throw new InvalidOperationException("transient");
            }

            await Task.Yield();
        });

        callCount.ShouldBe(3);
    }

    /// <summary>A failing Write callback is invoked exactly once and its exception surfaces unchanged.</summary>
    [Fact]
    public async Task Write_DoesNotRetry_OnFailure()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var writePipeline = sut.GetOrCreate(DriverId, SingleHost, DriverCapability.Write, TierAOptions);
        var callCount = 0;

        var thrown = await Should.ThrowAsync<InvalidOperationException>(async () =>
            await writePipeline.ExecuteAsync(async _ =>
            {
                callCount++;
                await Task.Yield();
                throw new InvalidOperationException("boom");
            }));

        callCount.ShouldBe(1);
        thrown.Message.ShouldBe("boom");
    }

    /// <summary>A failing AlarmAcknowledge callback is invoked exactly once.</summary>
    [Fact]
    public async Task AlarmAcknowledge_DoesNotRetry_OnFailure()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var ackPipeline = sut.GetOrCreate(DriverId, SingleHost, DriverCapability.AlarmAcknowledge, TierAOptions);
        var callCount = 0;

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await ackPipeline.ExecuteAsync(async _ =>
            {
                callCount++;
                await Task.Yield();
                throw new InvalidOperationException("boom");
            }));

        callCount.ShouldBe(1);
    }

    /// <summary>Two different hosts on the same driver and capability get two distinct cached pipelines.</summary>
    [Fact]
    public void Pipeline_IsIsolated_PerHost()
    {
        var sut = new DriverResiliencePipelineBuilder();

        var pipelineForA = sut.GetOrCreate(DriverId, "host-a", DriverCapability.Read, TierAOptions);
        var pipelineForB = sut.GetOrCreate(DriverId, "host-b", DriverCapability.Read, TierAOptions);

        pipelineForA.ShouldNotBeSameAs(pipelineForB);
        sut.CachedPipelineCount.ShouldBe(2);
    }

    /// <summary>Requesting the same (driver, host, capability) twice returns the same cached instance.</summary>
    [Fact]
    public void Pipeline_IsReused_ForSameTriple()
    {
        var sut = new DriverResiliencePipelineBuilder();

        var firstLookup = sut.GetOrCreate(DriverId, "host-a", DriverCapability.Read, TierAOptions);
        var secondLookup = sut.GetOrCreate(DriverId, "host-a", DriverCapability.Read, TierAOptions);

        firstLookup.ShouldBeSameAs(secondLookup);
        sut.CachedPipelineCount.ShouldBe(1);
    }

    /// <summary>Different capabilities on the same driver and host get distinct pipelines.</summary>
    [Fact]
    public void Pipeline_IsIsolated_PerCapability()
    {
        var sut = new DriverResiliencePipelineBuilder();

        var readPipeline = sut.GetOrCreate(DriverId, "host-a", DriverCapability.Read, TierAOptions);
        var writePipeline = sut.GetOrCreate(DriverId, "host-a", DriverCapability.Write, TierAOptions);

        readPipeline.ShouldNotBeSameAs(writePipeline);
    }

    /// <summary>
    /// Repeated failures against one host must not stop a call to a sibling host on the same
    /// driver from executing on its first attempt.
    /// </summary>
    [Fact]
    public async Task DeadHost_DoesNotOpenBreaker_ForSiblingHost()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var deadPipeline = sut.GetOrCreate(DriverId, "dead-plc", DriverCapability.Read, TierAOptions);
        var livePipeline = sut.GetOrCreate(DriverId, "live-plc", DriverCapability.Read, TierAOptions);

        // Go past the configured breaker threshold on the dead host.
        var failingCalls = TierAOptions.Resolve(DriverCapability.Read).BreakerFailureThreshold + 5;
        for (var call = 0; call < failingCalls; call++)
        {
            await Should.ThrowAsync<Exception>(async () =>
                await deadPipeline.ExecuteAsync(AlwaysThrows("dead plc")));
        }

        var liveCallCount = 0;
        await livePipeline.ExecuteAsync(async _ =>
        {
            liveCallCount++;
            await Task.Yield();
        });

        liveCallCount.ShouldBe(1, "healthy sibling host must not be affected by dead peer");
    }

    /// <summary>
    /// After as many failed Write calls as the Tier A breaker threshold, the next call is rejected
    /// with <see cref="BrokenCircuitException"/>.
    /// </summary>
    [Fact]
    public async Task CircuitBreaker_Opens_AfterFailureThreshold_OnTierA()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var writePipeline = sut.GetOrCreate(DriverId, SingleHost, DriverCapability.Write, TierAOptions);

        var failuresToTrip = TierAOptions.Resolve(DriverCapability.Write).BreakerFailureThreshold;
        for (var failure = 0; failure < failuresToTrip; failure++)
        {
            await Should.ThrowAsync<InvalidOperationException>(async () =>
                await writePipeline.ExecuteAsync(AlwaysThrows("boom")));
        }

        await Should.ThrowAsync<BrokenCircuitException>(async () =>
            await writePipeline.ExecuteAsync(async _ =>
            {
                await Task.Yield();
            }));
    }

    /// <summary>
    /// With a 1-second Read timeout override, a 5-second cancellable delay ends in
    /// <see cref="TimeoutRejectedException"/>.
    /// </summary>
    [Fact]
    public async Task Timeout_Cancels_SlowOperation()
    {
        var shortTimeoutOverrides = new Dictionary<DriverCapability, CapabilityPolicy>
        {
            [DriverCapability.Read] = new(TimeoutSeconds: 1, RetryCount: 0, BreakerFailureThreshold: 5),
        };
        var tierAWithShortTimeout = new DriverResilienceOptions
        {
            Tier = DriverTier.A,
            CapabilityPolicies = shortTimeoutOverrides,
        };
        var sut = new DriverResiliencePipelineBuilder();
        var readPipeline = sut.GetOrCreate(DriverId, SingleHost, DriverCapability.Read, tierAWithShortTimeout);

        await Should.ThrowAsync<TimeoutRejectedException>(async () =>
            await readPipeline.ExecuteAsync(async ct =>
            {
                await Task.Delay(TimeSpan.FromSeconds(5), ct);
            }));
    }

    /// <summary>Invalidating one driver id removes only that driver's cached pipelines.</summary>
    [Fact]
    public void Invalidate_Removes_OnlyMatchingInstance()
    {
        const string keptDriver = "drv-keep";
        const string droppedDriver = "drv-drop";
        var sut = new DriverResiliencePipelineBuilder();

        sut.GetOrCreate(keptDriver, "h", DriverCapability.Read, TierAOptions);
        sut.GetOrCreate(keptDriver, "h", DriverCapability.Write, TierAOptions);
        sut.GetOrCreate(droppedDriver, "h", DriverCapability.Read, TierAOptions);

        var removedCount = sut.Invalidate(droppedDriver);

        removedCount.ShouldBe(1);
        sut.CachedPipelineCount.ShouldBe(2);
    }

    /// <summary>
    /// With an already-cancelled token the call ends in <see cref="OperationCanceledException"/>
    /// and the callback runs at most once.
    /// </summary>
    [Fact]
    public async Task Cancellation_IsNot_Retried()
    {
        var sut = new DriverResiliencePipelineBuilder();
        var readPipeline = sut.GetOrCreate(DriverId, SingleHost, DriverCapability.Read, TierAOptions);
        var callCount = 0;
        using var cancelledSource = new CancellationTokenSource();
        cancelledSource.Cancel();

        await Should.ThrowAsync<OperationCanceledException>(async () =>
            await readPipeline.ExecuteAsync(async ct =>
            {
                callCount++;
                ct.ThrowIfCancellationRequested();
                await Task.Yield();
            }, cancelledSource.Token));

        callCount.ShouldBeLessThanOrEqualTo(1);
    }

    /// <summary>Builds a callback that yields once and then throws <see cref="InvalidOperationException"/>.</summary>
    private static Func<CancellationToken, ValueTask> AlwaysThrows(string message) =>
        async _ =>
        {
            await Task.Yield();
            throw new InvalidOperationException(message);
        };
}
|
||||
@@ -0,0 +1,160 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Resilience;
|
||||
|
||||
/// <summary>
/// Integration tests for the Phase 6.1 Stream A.5 contract — wrapping a flaky
/// <see cref="IReadable"/> / <see cref="IWritable"/> through the <see cref="CapabilityInvoker"/>.
/// Exercises the three scenarios the plan enumerates: transient read succeeds after N
/// retries; non-idempotent write fails after one attempt; idempotent write retries through.
/// A fourth test additionally checks that failures on one host do not affect calls to
/// another host of the same driver.
/// </summary>
[Trait("Category", "Integration")]
public sealed class FlakeyDriverIntegrationTests
{
    /// <summary>
    /// The driver fails the first five reads; the invoker must surface the sixth (successful)
    /// read, so the driver records exactly six attempts.
    /// </summary>
    [Fact]
    public async Task Read_SurfacesSuccess_AfterTransientFailures()
    {
        var flaky = new FlakeyDriver(failReadsBeforeIndex: 5);
        var options = new DriverResilienceOptions
        {
            Tier = DriverTier.A,
            CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
            {
                // The driver fails the first 5 reads, so 5 retries are needed; RetryCount=10
                // leaves headroom. TimeoutSeconds=30 gives slack for those exponential-backoff
                // retries under parallel-test-execution CPU pressure; at the default Delay=100ms
                // exponential they can otherwise exceed a 2-second budget intermittently.
                [DriverCapability.Read] = new(TimeoutSeconds: 30, RetryCount: 10, BreakerFailureThreshold: 50),
            },
        };
        var invoker = new CapabilityInvoker(new DriverResiliencePipelineBuilder(), "drv-test", () => options);

        var result = await invoker.ExecuteAsync(
            DriverCapability.Read,
            "host-1",
            async ct => await flaky.ReadAsync(["tag-a"], ct),
            CancellationToken.None);

        flaky.ReadAttempts.ShouldBe(6);
        result[0].StatusCode.ShouldBe(0u);
    }

    /// <summary>
    /// Even with RetryCount=5 configured for Write, a write flagged non-idempotent must be
    /// attempted exactly once and its failure surfaced to the caller.
    /// </summary>
    [Fact]
    public async Task Write_NonIdempotent_FailsOnFirstFailure_NoReplay()
    {
        var flaky = new FlakeyDriver(failWritesBeforeIndex: 3);
        var optionsWithAggressiveRetry = new DriverResilienceOptions
        {
            Tier = DriverTier.A,
            CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
            {
                [DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 5, BreakerFailureThreshold: 50),
            },
        };
        var invoker = new CapabilityInvoker(new DriverResiliencePipelineBuilder(), "drv-test", () => optionsWithAggressiveRetry);

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await invoker.ExecuteWriteAsync(
                "host-1",
                isIdempotent: false,
                async ct => await flaky.WriteAsync([new WriteRequest("pulse-coil", true)], ct),
                CancellationToken.None));

        flaky.WriteAttempts.ShouldBe(1, "non-idempotent write must never replay (decision #44)");
    }

    /// <summary>
    /// A write flagged idempotent is retried: the driver fails the first two writes and the
    /// third succeeds, so three attempts are recorded.
    /// </summary>
    [Fact]
    public async Task Write_Idempotent_RetriesUntilSuccess()
    {
        var flaky = new FlakeyDriver(failWritesBeforeIndex: 2);
        var optionsWithRetry = new DriverResilienceOptions
        {
            Tier = DriverTier.A,
            CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
            {
                [DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 5, BreakerFailureThreshold: 50),
            },
        };
        var invoker = new CapabilityInvoker(new DriverResiliencePipelineBuilder(), "drv-test", () => optionsWithRetry);

        var results = await invoker.ExecuteWriteAsync(
            "host-1",
            isIdempotent: true,
            async ct => await flaky.WriteAsync([new WriteRequest("set-point", 42.0f)], ct),
            CancellationToken.None);

        flaky.WriteAttempts.ShouldBe(3);
        results[0].StatusCode.ShouldBe(0u);
    }

    /// <summary>
    /// Many failing reads against "host-dead" must not prevent a read against "host-live" on
    /// the same driver from running on its first attempt.
    /// </summary>
    [Fact]
    public async Task MultipleHosts_OnOneDriver_HaveIndependentFailureCounts()
    {
        // No FlakeyDriver is needed here: both callbacks are inline lambdas.
        var options = new DriverResilienceOptions { Tier = DriverTier.A };
        var builder = new DriverResiliencePipelineBuilder();
        var invoker = new CapabilityInvoker(builder, "drv-test", () => options);

        // host-dead: force many failures to exhaust retries + trip breaker
        var threshold = options.Resolve(DriverCapability.Read).BreakerFailureThreshold;
        for (var i = 0; i < threshold + 5; i++)
        {
            await Should.ThrowAsync<Exception>(async () =>
                await invoker.ExecuteAsync(DriverCapability.Read, "host-dead",
                    _ => throw new InvalidOperationException("dead"),
                    CancellationToken.None));
        }

        // host-live: succeeds on first call — unaffected by the dead-host breaker
        var liveAttempts = 0;
        await invoker.ExecuteAsync(DriverCapability.Read, "host-live",
            _ => { liveAttempts++; return ValueTask.FromResult("ok"); },
            CancellationToken.None);

        liveAttempts.ShouldBe(1);
    }

    /// <summary>
    /// Test double that throws <see cref="InvalidOperationException"/> for the first N reads
    /// and/or writes (attempt number &lt;= N) and succeeds with status code 0 afterwards.
    /// Every call is counted, whether it fails or succeeds.
    /// </summary>
    private sealed class FlakeyDriver : IReadable, IWritable
    {
        private readonly int _failReadsBeforeIndex;
        private readonly int _failWritesBeforeIndex;

        /// <summary>Number of <see cref="ReadAsync"/> calls made so far, including failed ones.</summary>
        public int ReadAttempts { get; private set; }

        /// <summary>Number of <see cref="WriteAsync"/> calls made so far, including failed ones.</summary>
        public int WriteAttempts { get; private set; }

        /// <param name="failReadsBeforeIndex">How many leading read attempts throw.</param>
        /// <param name="failWritesBeforeIndex">How many leading write attempts throw.</param>
        public FlakeyDriver(int failReadsBeforeIndex = 0, int failWritesBeforeIndex = 0)
        {
            _failReadsBeforeIndex = failReadsBeforeIndex;
            _failWritesBeforeIndex = failWritesBeforeIndex;
        }

        /// <summary>
        /// Counts the attempt, throws synchronously while still inside the failure window, and
        /// otherwise returns one snapshot (value 0, status 0) per requested reference.
        /// </summary>
        public Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
            IReadOnlyList<string> fullReferences,
            CancellationToken cancellationToken)
        {
            var attempt = ++ReadAttempts;
            if (attempt <= _failReadsBeforeIndex)
                throw new InvalidOperationException($"transient read failure #{attempt}");

            var now = DateTime.UtcNow;
            IReadOnlyList<DataValueSnapshot> result = fullReferences
                .Select(_ => new DataValueSnapshot(Value: 0, StatusCode: 0u, SourceTimestampUtc: now, ServerTimestampUtc: now))
                .ToList();
            return Task.FromResult(result);
        }

        /// <summary>
        /// Counts the attempt, throws synchronously while still inside the failure window, and
        /// otherwise returns one result with status 0 per write request.
        /// </summary>
        public Task<IReadOnlyList<WriteResult>> WriteAsync(
            IReadOnlyList<WriteRequest> writes,
            CancellationToken cancellationToken)
        {
            var attempt = ++WriteAttempts;
            if (attempt <= _failWritesBeforeIndex)
                throw new InvalidOperationException($"transient write failure #{attempt}");

            IReadOnlyList<WriteResult> result = writes.Select(_ => new WriteResult(0u)).ToList();
            return Task.FromResult(result);
        }
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="MemoryRecycle"/>: which combinations of driver tier, supervisor
/// presence and <see cref="MemoryTrackingAction"/> lead to a supervisor recycle request.
/// </summary>
[Trait("Category", "Unit")]
public sealed class MemoryRecycleTests
{
    /// <summary>Tier C with a supervisor: a hard breach triggers exactly one recycle request.</summary>
    [Fact]
    public async Task TierC_HardBreach_RequestsSupervisorRecycle()
    {
        var fake = new FakeSupervisor();
        var sut = new MemoryRecycle(DriverTier.C, fake, NullLogger<MemoryRecycle>.Instance);

        var recycleRequested = await sut.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None);

        recycleRequested.ShouldBeTrue();
        fake.RecycleCount.ShouldBe(1);
        fake.LastReason.ShouldContain("hard-breach");
    }

    /// <summary>Tier A and B: a hard breach never reaches the supervisor.</summary>
    [Theory]
    [InlineData(DriverTier.A)]
    [InlineData(DriverTier.B)]
    public async Task InProcessTier_HardBreach_NeverRequestsRecycle(DriverTier tier)
    {
        var fake = new FakeSupervisor();
        var sut = new MemoryRecycle(tier, fake, NullLogger<MemoryRecycle>.Instance);

        var recycleRequested = await sut.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None);

        recycleRequested.ShouldBeFalse("Tier A/B hard-breach logs a promotion recommendation only (decisions #74, #145)");
        fake.RecycleCount.ShouldBe(0);
    }

    /// <summary>Tier C without a supervisor: a hard breach reports that no recycle was requested.</summary>
    [Fact]
    public async Task TierC_WithoutSupervisor_HardBreach_NoOp()
    {
        var sut = new MemoryRecycle(DriverTier.C, supervisor: null, NullLogger<MemoryRecycle>.Instance);

        var recycleRequested = await sut.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None);

        recycleRequested.ShouldBeFalse("no supervisor → no recycle path; action logged only");
    }

    /// <summary>A soft breach never requests a recycle, whatever the tier.</summary>
    [Theory]
    [InlineData(DriverTier.A)]
    [InlineData(DriverTier.B)]
    [InlineData(DriverTier.C)]
    public async Task SoftBreach_NeverRequestsRecycle(DriverTier tier)
    {
        var fake = new FakeSupervisor();
        var sut = new MemoryRecycle(tier, fake, NullLogger<MemoryRecycle>.Instance);

        var recycleRequested = await sut.HandleAsync(MemoryTrackingAction.SoftBreach, 1_000_000_000, CancellationToken.None);

        recycleRequested.ShouldBeFalse("soft-breach is surface-only at every tier");
        fake.RecycleCount.ShouldBe(0);
    }

    /// <summary>Non-breach actions (None, Warming) never request a recycle, even on Tier C.</summary>
    [Theory]
    [InlineData(MemoryTrackingAction.None)]
    [InlineData(MemoryTrackingAction.Warming)]
    public async Task NonBreachActions_NoOp(MemoryTrackingAction action)
    {
        var fake = new FakeSupervisor();
        var sut = new MemoryRecycle(DriverTier.C, fake, NullLogger<MemoryRecycle>.Instance);

        var recycleRequested = await sut.HandleAsync(action, 100_000_000, CancellationToken.None);

        recycleRequested.ShouldBeFalse();
        fake.RecycleCount.ShouldBe(0);
    }

    /// <summary>
    /// Recording supervisor stub: counts recycle requests and remembers the most recent reason.
    /// </summary>
    private sealed class FakeSupervisor : IDriverSupervisor
    {
        public string DriverInstanceId => "fake-tier-c";

        /// <summary>How many times <see cref="RecycleAsync"/> has been called.</summary>
        public int RecycleCount { get; private set; }

        /// <summary>Reason passed to the latest <see cref="RecycleAsync"/> call, or null if never called.</summary>
        public string? LastReason { get; private set; }

        public Task RecycleAsync(string reason, CancellationToken cancellationToken)
        {
            LastReason = reason;
            RecycleCount += 1;
            return Task.CompletedTask;
        }
    }
}
|
||||
@@ -0,0 +1,119 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="MemoryTracking"/>: the warm-up window, baseline capture, per-tier
/// constants, soft/hard threshold derivation and the action returned for each sample.
/// </summary>
[Trait("Category", "Unit")]
public sealed class MemoryTrackingTests
{
    /// <summary>Bytes per mebibyte, used to express baselines and thresholds in MB.</summary>
    private const long MiB = 1024 * 1024;

    private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);

    /// <summary>Samples inside the 5-minute window all report Warming and no baseline is set yet.</summary>
    [Fact]
    public void WarmingUp_Returns_Warming_UntilWindowElapses()
    {
        var sut = new MemoryTracking(DriverTier.A, TimeSpan.FromMinutes(5));

        var atStart = sut.Sample(100_000_000, T0);
        var afterOneMinute = sut.Sample(105_000_000, T0.AddMinutes(1));
        var justBeforeEnd = sut.Sample(102_000_000, T0.AddMinutes(4.9));

        atStart.ShouldBe(MemoryTrackingAction.Warming);
        afterOneMinute.ShouldBe(MemoryTrackingAction.Warming);
        justBeforeEnd.ShouldBe(MemoryTrackingAction.Warming);
        sut.Phase.ShouldBe(TrackingPhase.WarmingUp);
        sut.BaselineBytes.ShouldBe(0);
    }

    /// <summary>
    /// The sample taken at the end of the window switches the tracker to Steady and fixes the
    /// baseline at the median of the collected samples.
    /// </summary>
    [Fact]
    public void WindowElapsed_CapturesBaselineAsMedian_AndTransitionsToSteady()
    {
        var sut = new MemoryTracking(DriverTier.A, TimeSpan.FromMinutes(5));
        sut.Sample(100_000_000, T0);
        sut.Sample(200_000_000, T0.AddMinutes(1));
        sut.Sample(150_000_000, T0.AddMinutes(2));

        var firstSteadyAction = sut.Sample(150_000_000, T0.AddMinutes(5));

        sut.Phase.ShouldBe(TrackingPhase.Steady);
        sut.BaselineBytes.ShouldBe(150_000_000L, "median of 4 samples [100, 200, 150, 150] = (150+150)/2 = 150");
        firstSteadyAction.ShouldBe(MemoryTrackingAction.None, "150 MB is the baseline itself, well under soft threshold");
    }

    /// <summary>Each tier's (multiplier, floor) pair matches the expected values; floor is in bytes.</summary>
    [Theory]
    [InlineData(DriverTier.A, 3, 50)]
    [InlineData(DriverTier.B, 3, 100)]
    [InlineData(DriverTier.C, 2, 500)]
    public void GetTierConstants_MatchesDecision146(DriverTier tier, int expectedMultiplier, long expectedFloorMB)
    {
        var (actualMultiplier, actualFloorBytes) = MemoryTracking.GetTierConstants(tier);

        actualMultiplier.ShouldBe(expectedMultiplier);
        actualFloorBytes.ShouldBe(expectedFloorMB * MiB);
    }

    [Fact]
    public void SoftThreshold_UsesMax_OfMultiplierAndFloor_SmallBaseline()
    {
        // Tier A: mult=3, floor=50 MB. Baseline 10 MB → 3×10=30 MB < 10+50=60 MB → floor wins.
        var sut = WarmupWithBaseline(DriverTier.A, 10 * MiB);

        sut.SoftThresholdBytes.ShouldBe(60 * MiB);
    }

    [Fact]
    public void SoftThreshold_UsesMax_OfMultiplierAndFloor_LargeBaseline()
    {
        // Tier A: mult=3, floor=50 MB. Baseline 200 MB → 3×200=600 MB > 200+50=250 MB → multiplier wins.
        var sut = WarmupWithBaseline(DriverTier.A, 200 * MiB);

        sut.SoftThresholdBytes.ShouldBe(600 * MiB);
    }

    /// <summary>The hard threshold is exactly double the soft threshold.</summary>
    [Fact]
    public void HardThreshold_IsTwiceSoft()
    {
        var sut = WarmupWithBaseline(DriverTier.B, 200 * MiB);

        var doubledSoft = sut.SoftThresholdBytes * 2;

        sut.HardThresholdBytes.ShouldBe(doubledSoft);
    }

    /// <summary>A steady-state sample of 200 MB against a 100 MB Tier A baseline yields None.</summary>
    [Fact]
    public void Sample_Below_Soft_Returns_None()
    {
        var sut = WarmupWithBaseline(DriverTier.A, 100 * MiB);

        var action = sut.Sample(200 * MiB, T0.AddMinutes(10));

        action.ShouldBe(MemoryTrackingAction.None);
    }

    [Fact]
    public void Sample_AtSoft_Returns_SoftBreach()
    {
        // Tier A, baseline 200 MB → soft = 600 MB. Sample exactly at soft.
        var sut = WarmupWithBaseline(DriverTier.A, 200 * MiB);

        var action = sut.Sample(sut.SoftThresholdBytes, T0.AddMinutes(10));

        action.ShouldBe(MemoryTrackingAction.SoftBreach);
    }

    /// <summary>A sample exactly at the hard threshold already counts as a hard breach.</summary>
    [Fact]
    public void Sample_AtHard_Returns_HardBreach()
    {
        var sut = WarmupWithBaseline(DriverTier.A, 200 * MiB);

        var action = sut.Sample(sut.HardThresholdBytes, T0.AddMinutes(10));

        action.ShouldBe(MemoryTrackingAction.HardBreach);
    }

    /// <summary>A sample beyond the hard threshold is a hard breach.</summary>
    [Fact]
    public void Sample_AboveHard_Returns_HardBreach()
    {
        var sut = WarmupWithBaseline(DriverTier.A, 200 * MiB);

        var action = sut.Sample(sut.HardThresholdBytes + 100_000_000, T0.AddMinutes(10));

        action.ShouldBe(MemoryTrackingAction.HardBreach);
    }

    /// <summary>
    /// Creates a tracker with a 5-minute window and feeds it two identical samples (at T0 and
    /// T0+5 min), then asserts the resulting baseline equals <paramref name="baseline"/>.
    /// </summary>
    private static MemoryTracking WarmupWithBaseline(DriverTier tier, long baseline)
    {
        var warmedUp = new MemoryTracking(tier, TimeSpan.FromMinutes(5));
        warmedUp.Sample(baseline, T0);
        warmedUp.Sample(baseline, T0.AddMinutes(5));

        warmedUp.BaselineBytes.ShouldBe(baseline);
        return warmedUp;
    }
}
|
||||
@@ -0,0 +1,101 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="ScheduledRecycleScheduler"/>: constructor validation, tick-driven
/// recycles and how the next-recycle time advances, and ad-hoc recycle requests.
/// </summary>
[Trait("Category", "Unit")]
public sealed class ScheduledRecycleSchedulerTests
{
    private static readonly DateTime T0 = new(2026, 4, 19, 0, 0, 0, DateTimeKind.Utc);
    private static readonly TimeSpan Weekly = TimeSpan.FromDays(7);

    /// <summary>Constructing the scheduler for Tier A or B is rejected with <see cref="ArgumentException"/>.</summary>
    [Theory]
    [InlineData(DriverTier.A)]
    [InlineData(DriverTier.B)]
    public void TierAOrB_Ctor_Throws(DriverTier tier)
    {
        var fake = new FakeSupervisor();

        Should.Throw<ArgumentException>(() => new ScheduledRecycleScheduler(
            tier, Weekly, T0, fake, NullLogger<ScheduledRecycleScheduler>.Instance));
    }

    /// <summary>A zero or negative interval is rejected with <see cref="ArgumentException"/>.</summary>
    [Fact]
    public void ZeroOrNegativeInterval_Throws()
    {
        var fake = new FakeSupervisor();

        Should.Throw<ArgumentException>(() => new ScheduledRecycleScheduler(
            DriverTier.C, TimeSpan.Zero, T0, fake, NullLogger<ScheduledRecycleScheduler>.Instance));
        Should.Throw<ArgumentException>(() => new ScheduledRecycleScheduler(
            DriverTier.C, TimeSpan.FromSeconds(-1), T0, fake, NullLogger<ScheduledRecycleScheduler>.Instance));
    }

    /// <summary>A tick six days into a weekly schedule does nothing.</summary>
    [Fact]
    public async Task Tick_BeforeNextRecycle_NoOp()
    {
        var fake = new FakeSupervisor();
        var sut = NewTierCScheduler(fake, Weekly);

        var didFire = await sut.TickAsync(T0 + TimeSpan.FromDays(6), CancellationToken.None);

        didFire.ShouldBeFalse();
        fake.RecycleCount.ShouldBe(0);
    }

    /// <summary>
    /// A tick one minute past the first weekly due time fires one recycle and moves the next
    /// due time to two intervals after T0.
    /// </summary>
    [Fact]
    public async Task Tick_AtOrAfterNextRecycle_FiresOnce_AndAdvances()
    {
        var fake = new FakeSupervisor();
        var sut = NewTierCScheduler(fake, Weekly);
        var justPastDue = T0 + Weekly + TimeSpan.FromMinutes(1);

        var didFire = await sut.TickAsync(justPastDue, CancellationToken.None);

        didFire.ShouldBeTrue();
        fake.RecycleCount.ShouldBe(1);
        sut.NextRecycleUtc.ShouldBe(T0 + Weekly + Weekly);
    }

    /// <summary>An ad-hoc recycle passes its reason through and leaves the scheduled time untouched.</summary>
    [Fact]
    public async Task RequestRecycleNow_Fires_Immediately_WithoutAdvancingSchedule()
    {
        var fake = new FakeSupervisor();
        var sut = NewTierCScheduler(fake, Weekly);
        var scheduledBefore = sut.NextRecycleUtc;

        await sut.RequestRecycleNowAsync("memory hard-breach", CancellationToken.None);

        fake.RecycleCount.ShouldBe(1);
        fake.LastReason.ShouldBe("memory hard-breach");
        sut.NextRecycleUtc.ShouldBe(scheduledBefore, "ad-hoc recycle doesn't shift the cron schedule");
    }

    /// <summary>Three daily ticks, each an hour past due, fire three recycles and leave day 4 as next.</summary>
    [Fact]
    public async Task MultipleFires_AcrossTicks_AdvanceOneIntervalEach()
    {
        var fake = new FakeSupervisor();
        var sut = NewTierCScheduler(fake, TimeSpan.FromDays(1));
        var oneHour = TimeSpan.FromHours(1);

        for (var day = 1; day <= 3; day++)
        {
            await sut.TickAsync(T0 + TimeSpan.FromDays(day) + oneHour, CancellationToken.None);
        }

        fake.RecycleCount.ShouldBe(3);
        sut.NextRecycleUtc.ShouldBe(T0 + TimeSpan.FromDays(4));
    }

    /// <summary>Builds a Tier C scheduler anchored at <see cref="T0"/> with a null logger.</summary>
    private static ScheduledRecycleScheduler NewTierCScheduler(IDriverSupervisor supervisor, TimeSpan interval) =>
        new(DriverTier.C, interval, T0, supervisor, NullLogger<ScheduledRecycleScheduler>.Instance);

    /// <summary>
    /// Recording supervisor stub: counts recycle requests and remembers the most recent reason.
    /// </summary>
    private sealed class FakeSupervisor : IDriverSupervisor
    {
        public string DriverInstanceId => "tier-c-fake";

        /// <summary>How many times <see cref="RecycleAsync"/> has been called.</summary>
        public int RecycleCount { get; private set; }

        /// <summary>Reason passed to the latest <see cref="RecycleAsync"/> call, or null if never called.</summary>
        public string? LastReason { get; private set; }

        public Task RecycleAsync(string reason, CancellationToken cancellationToken)
        {
            LastReason = reason;
            RecycleCount += 1;
            return Task.CompletedTask;
        }
    }
}
|
||||
@@ -0,0 +1,112 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability;
|
||||
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class WedgeDetectorTests
|
||||
{
|
||||
private static readonly DateTime Now = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc);
|
||||
private static readonly TimeSpan Threshold = TimeSpan.FromSeconds(120);
|
||||
|
||||
[Fact]
|
||||
public void SubSixtySecondThreshold_ClampsToSixty()
|
||||
{
|
||||
var detector = new WedgeDetector(TimeSpan.FromSeconds(10));
|
||||
detector.Threshold.ShouldBe(TimeSpan.FromSeconds(60));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Unhealthy_Driver_AlwaysNotApplicable()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 5, ActiveMonitoredItems: 10, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-10));
|
||||
|
||||
detector.Classify(DriverState.Faulted, demand, Now).ShouldBe(WedgeVerdict.NotApplicable);
|
||||
detector.Classify(DriverState.Degraded, demand, Now).ShouldBe(WedgeVerdict.NotApplicable);
|
||||
detector.Classify(DriverState.Initializing, demand, Now).ShouldBe(WedgeVerdict.NotApplicable);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Idle_Subscription_Only_StaysIdle()
|
||||
{
|
||||
// Idle driver: bulkhead 0, monitored items 0, no history reads queued.
|
||||
// Even if LastProgressUtc is ancient, the verdict is Idle, not Faulted.
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(0, 0, 0, Now.AddHours(-12));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Idle);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void PendingWork_WithRecentProgress_StaysHealthy()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 2, ActiveMonitoredItems: 0, QueuedHistoryReads: 0, LastProgressUtc: Now.AddSeconds(-30));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void PendingWork_WithStaleProgress_IsFaulted()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 2, ActiveMonitoredItems: 0, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-5));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Faulted);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MonitoredItems_Active_ButNoRecentPublish_IsFaulted()
|
||||
{
|
||||
// Subscription-only driver with live MonitoredItems but no publish progress within threshold
|
||||
// is a real wedge — this is the case the previous "no successful Read" formulation used
|
||||
// to miss (no reads ever happen).
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 5, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-10));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Faulted);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MonitoredItems_Active_WithFreshPublish_StaysHealthy()
|
||||
{
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 5, QueuedHistoryReads: 0, LastProgressUtc: Now.AddSeconds(-10));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HistoryBackfill_SlowButMakingProgress_StaysHealthy()
|
||||
{
|
||||
// Slow historian backfill — QueuedHistoryReads > 0 but progress advances within threshold.
|
||||
var detector = new WedgeDetector(Threshold);
|
||||
var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 0, QueuedHistoryReads: 50, LastProgressUtc: Now.AddSeconds(-60));
|
||||
|
||||
detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy);
|
||||
}
|
||||
|
||||
[Fact]
public void WriteOnlyBurst_StaysIdle_WhenBulkheadEmpty()
{
    // A write-only driver that has just finished a burst: the bulkhead is drained and there
    // are no subscriptions and no history reads. That is Idle. The previous formulation
    // would have faulted here because no reads were succeeding, even though the driver
    // is perfectly healthy.
    var wedgeDetector = new WedgeDetector(Threshold);
    var nothingOutstanding = new DemandSignal(
        BulkheadDepth: 0,
        ActiveMonitoredItems: 0,
        QueuedHistoryReads: 0,
        LastProgressUtc: Now.AddMinutes(-30));

    var verdict = wedgeDetector.Classify(DriverState.Healthy, nothingOutstanding, Now);

    verdict.ShouldBe(WedgeVerdict.Idle);
}
|
||||
|
||||
[Fact]
public void DemandSignal_HasPendingWork_TrueForAnyNonZeroCounter()
{
    // Setting any single counter to a non-zero value must be enough to report pending work.
    var firstCounterSet = new DemandSignal(1, 0, 0, Now);
    firstCounterSet.HasPendingWork.ShouldBeTrue();

    var secondCounterSet = new DemandSignal(0, 1, 0, Now);
    secondCounterSet.HasPendingWork.ShouldBeTrue();

    var thirdCounterSet = new DemandSignal(0, 0, 1, Now);
    thirdCounterSet.HasPendingWork.ShouldBeTrue();

    // With all counters at zero there is nothing pending.
    var allCountersZero = new DemandSignal(0, 0, 0, Now);
    allCountersZero.HasPendingWork.ShouldBeFalse();
}
|
||||
}
|
||||
@@ -0,0 +1,179 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus.IntegrationTests.Mitsubishi;
|
||||
|
||||
/// <summary>
/// Verifies the MELSEC-family Modbus quirks against the <c>mitsubishi.json</c> pymodbus
/// profile: CDAB word order default, binary-not-BCD D-register encoding, hex X-input
/// parsing (Q/L/iQ-R), D0 fingerprint, M-relay coil mapping with bank base.
/// </summary>
/// <remarks>
/// Groups all quirks in one test class instead of per-behavior classes (unlike the DL205
/// set) because MELSEC's per-model differentiation is handled by the
/// <see cref="MelsecFamily"/> enum on the helper + <c>MODBUS_SIM_PROFILE</c> env var on
/// the fixture, rather than per-PR test classes.
/// </remarks>
[Collection(ModbusSimulatorCollection.Name)]
[Trait("Category", "Integration")]
[Trait("Device", "Mitsubishi")]
public sealed class MitsubishiQuirkTests(ModbusSimulatorFixture sim)
{
    /// <summary>Reads D0 as UInt16 and expects the 0x1234 fingerprint value.</summary>
    [Fact]
    public async Task Mitsubishi_D0_fingerprint_reads_0x1234()
    {
        if (!ShouldRun()) return;

        await using var driver = await NewDriverAsync(
            new ModbusTagDefinition("D0_Fingerprint",
                ModbusRegion.HoldingRegisters,
                Address: MelsecAddress.DRegisterToHolding("D0"),
                DataType: ModbusDataType.UInt16, Writable: false));

        var r = await driver.ReadAsync(["D0_Fingerprint"], TestContext.Current.CancellationToken);

        r[0].StatusCode.ShouldBe(0u);
        r[0].Value.ShouldBe((ushort)0x1234);
    }

    /// <summary>
    /// Reads the same two registers twice, once with WordSwap and once with BigEndian, and
    /// checks that only the WordSwap decode yields 1.5f.
    /// </summary>
    [Fact]
    public async Task Mitsubishi_Float32_CDAB_decodes_1_5f_from_D100()
    {
        if (!ShouldRun()) return;

        // MELSEC Q/L/iQ-R/iQ-F all store 32-bit values with CDAB word order (low word at
        // lower D-register address). HR[100..101] = [0, 0x3FC0] decodes as 1.5f under
        // WordSwap but as a denormal under BigEndian.
        var addr = MelsecAddress.DRegisterToHolding("D100");

        await using var driver = await NewDriverAsync(
            new ModbusTagDefinition("D100_Float_CDAB",
                ModbusRegion.HoldingRegisters, Address: addr,
                DataType: ModbusDataType.Float32, Writable: false,
                ByteOrder: ModbusByteOrder.WordSwap),
            new ModbusTagDefinition("D100_Float_ABCD_control",
                ModbusRegion.HoldingRegisters, Address: addr,
                DataType: ModbusDataType.Float32, Writable: false,
                ByteOrder: ModbusByteOrder.BigEndian));

        var r = await driver.ReadAsync(
            ["D100_Float_CDAB", "D100_Float_ABCD_control"],
            TestContext.Current.CancellationToken);

        r[0].StatusCode.ShouldBe(0u);
        r[0].Value.ShouldBe(1.5f, "MELSEC stores Float32 CDAB; WordSwap decode returns 1.5f");

        // The control read must succeed too. Without this check a failed read would let
        // the ShouldNotBe below pass without proving anything about byte order.
        r[1].StatusCode.ShouldBe(0u);
        r[1].Value.ShouldNotBe(1.5f, "same wire with BigEndian must decode to a different value");
    }

    /// <summary>Reads D10 as Int16 and expects the binary value 1234.</summary>
    [Fact]
    public async Task Mitsubishi_D10_is_binary_not_BCD()
    {
        if (!ShouldRun()) return;

        // Counter-to-DL205: MELSEC D-registers are binary by default. D10 = 1234 decimal =
        // 0x04D2. Reading as Int16 returns 1234; reading as Bcd16 would throw (nibble 0xD is
        // non-BCD) — the integration test proves the Int16 decode wins.
        await using var driver = await NewDriverAsync(
            new ModbusTagDefinition("D10_Binary",
                ModbusRegion.HoldingRegisters,
                Address: MelsecAddress.DRegisterToHolding("D10"),
                DataType: ModbusDataType.Int16, Writable: false));

        var r = await driver.ReadAsync(["D10_Binary"], TestContext.Current.CancellationToken);

        r[0].StatusCode.ShouldBe(0u);
        r[0].Value.ShouldBe((short)1234, "MELSEC stores numeric D-register values in binary; 0x04D2 = 1234");
    }

    /// <summary>Reads D10 with a Bcd16 tag definition and expects a non-zero status code.</summary>
    [Fact]
    public async Task Mitsubishi_D10_as_BCD_throws_because_nibble_is_non_decimal()
    {
        if (!ShouldRun()) return;

        // If a site configured D10 with Bcd16 data type but the ladder writes binary, the
        // BCD decoder MUST reject the garbage rather than silently returning wrong decimal.
        // 0x04D2 contains nibble 0xD which fails BCD validation.
        await using var driver = await NewDriverAsync(
            new ModbusTagDefinition("D10_WrongBcd",
                ModbusRegion.HoldingRegisters,
                Address: MelsecAddress.DRegisterToHolding("D10"),
                DataType: ModbusDataType.Bcd16, Writable: false));

        var r = await driver.ReadAsync(["D10_WrongBcd"], TestContext.Current.CancellationToken);

        // ReadAsync catches the InvalidDataException from DecodeBcd and surfaces it as
        // BadCommunicationError (PR 52 mapping). Non-zero status = caller sees a real
        // problem and can check their tag config instead of getting silently-wrong numbers.
        r[0].StatusCode.ShouldNotBe(0u, "BCD decode of binary 0x04D2 must fail loudly because nibble D is non-BCD");
    }

    /// <summary>Maps X210 (hex) to discrete input 528 and expects the simulator to report it ON.</summary>
    [Fact]
    public async Task Mitsubishi_QLiQR_X210_hex_maps_to_DI_528_reads_ON()
    {
        if (!ShouldRun()) return;

        // MELSEC-Q / L / iQ-R: X addresses are hex. X210 = 0x210 = 528 decimal.
        // mitsubishi.json seeds cell 33 (DI 528..543) with value 9 = bit 0 + bit 3 set.
        // X210 → DI 528 → cell 33 bit 0 = 1 (ON).
        var addr = MelsecAddress.XInputToDiscrete("X210", MelsecFamily.Q_L_iQR);
        addr.ShouldBe((ushort)528);

        await using var driver = await NewDriverAsync(
            new ModbusTagDefinition("X210_hex",
                ModbusRegion.DiscreteInputs, Address: addr,
                DataType: ModbusDataType.Bool, Writable: false));

        var r = await driver.ReadAsync(["X210_hex"], TestContext.Current.CancellationToken);

        r[0].StatusCode.ShouldBe(0u);
        r[0].Value.ShouldBe(true);
    }

    /// <summary>Shows that "X20" resolves to 32 on Q/L/iQ-R but to 16 on F/iQ-F.</summary>
    [Fact]
    public void Mitsubishi_family_trap_X20_differs_on_Q_vs_FX()
    {
        // Not a live-sim test — a unit-level proof that the MELSEC family selector gates the
        // address correctly. Included in the integration suite so anyone running the MELSEC
        // tests sees the trap called out explicitly.
        MelsecAddress.XInputToDiscrete("X20", MelsecFamily.Q_L_iQR).ShouldBe((ushort)32);
        MelsecAddress.XInputToDiscrete("X20", MelsecFamily.F_iQF).ShouldBe((ushort)16);
    }

    /// <summary>Maps M512 to coil 512 and expects the simulator to report it ON.</summary>
    [Fact]
    public async Task Mitsubishi_M512_maps_to_coil_512_reads_ON()
    {
        if (!ShouldRun()) return;

        // mitsubishi.json seeds cell 32 (coil 512..527) with value 5 = bit 0 + bit 2 set.
        // M512 → coil 512 → cell 32 bit 0 = 1 (ON).
        var addr = MelsecAddress.MRelayToCoil("M512");
        addr.ShouldBe((ushort)512);

        await using var driver = await NewDriverAsync(
            new ModbusTagDefinition("M512",
                ModbusRegion.Coils, Address: addr,
                DataType: ModbusDataType.Bool, Writable: false));

        var r = await driver.ReadAsync(["M512"], TestContext.Current.CancellationToken);

        r[0].StatusCode.ShouldBe(0u);
        r[0].Value.ShouldBe(true);
    }

    // --- helpers ---

    /// <summary>
    /// Gate for the live-simulator tests: skips when the fixture reports a skip reason or
    /// when <c>MODBUS_SIM_PROFILE</c> is not <c>mitsubishi</c> (case-insensitive).
    /// </summary>
    /// <returns><c>true</c> when the test should proceed; otherwise the test is skipped.</returns>
    private bool ShouldRun()
    {
        if (sim.SkipReason is not null) { Assert.Skip(sim.SkipReason); return false; }

        if (!string.Equals(Environment.GetEnvironmentVariable("MODBUS_SIM_PROFILE"), "mitsubishi",
                StringComparison.OrdinalIgnoreCase))
        {
            Assert.Skip("MODBUS_SIM_PROFILE != mitsubishi — skipping.");
            return false;
        }

        return true;
    }

    /// <summary>
    /// Builds a driver pointed at the simulator fixture with the given tags (probe
    /// disabled, 2 s timeout, unit id 1) and initializes it.
    /// </summary>
    /// <param name="tags">Tag definitions to register on the driver.</param>
    /// <returns>The initialized driver; the caller disposes it.</returns>
    private async Task<ModbusDriver> NewDriverAsync(params ModbusTagDefinition[] tags)
    {
        var drv = new ModbusDriver(
            new ModbusDriverOptions
            {
                Host = sim.Host,
                Port = sim.Port,
                UnitId = 1,
                Timeout = TimeSpan.FromSeconds(2),
                Tags = tags,
                Probe = new ModbusProbeOptions { Enabled = false },
            },
            driverInstanceId: "melsec-quirk");

        await drv.InitializeAsync("{}", TestContext.Current.CancellationToken);
        return drv;
    }
}
|
||||
@@ -0,0 +1,116 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Modbus.Tests;
|
||||
|
||||
[Trait("Category", "Unit")]
public sealed class MelsecAddressTests
{
    // --- X / Y hex vs octal family trap ---

    /// <summary>On Q/L/iQ-R the digits after the X prefix are read as hexadecimal.</summary>
    [Theory]
    [InlineData("X0", (ushort)0)]
    [InlineData("X9", (ushort)9)]
    [InlineData("XA", (ushort)10)] // hex
    [InlineData("XF", (ushort)15)]
    [InlineData("X10", (ushort)16)] // hex 0x10 = decimal 16
    [InlineData("X20", (ushort)32)] // hex 0x20 = decimal 32 — the classic MELSEC-Q trap
    [InlineData("X1FF", (ushort)511)]
    [InlineData("x10", (ushort)16)] // lowercase prefix
    public void XInputToDiscrete_QLiQR_parses_hex(string x, ushort expected)
    {
        var discrete = MelsecAddress.XInputToDiscrete(x, MelsecFamily.Q_L_iQR);
        discrete.ShouldBe(expected);
    }

    /// <summary>On F/iQ-F the digits after the X prefix are read as octal.</summary>
    [Theory]
    [InlineData("X0", (ushort)0)]
    [InlineData("X7", (ushort)7)]
    [InlineData("X10", (ushort)8)] // octal 10 = decimal 8
    [InlineData("X20", (ushort)16)] // octal 20 = decimal 16 — SAME string, DIFFERENT value on FX
    [InlineData("X777", (ushort)511)]
    public void XInputToDiscrete_FiQF_parses_octal(string x, ushort expected)
    {
        var discrete = MelsecAddress.XInputToDiscrete(x, MelsecFamily.F_iQF);
        discrete.ShouldBe(expected);
    }

    /// <summary>Y outputs follow the same hex rule as X inputs on Q/L/iQ-R.</summary>
    [Theory]
    [InlineData("Y0", (ushort)0)]
    [InlineData("Y1F", (ushort)31)]
    public void YOutputToCoil_QLiQR_parses_hex(string y, ushort expected)
    {
        var coil = MelsecAddress.YOutputToCoil(y, MelsecFamily.Q_L_iQR);
        coil.ShouldBe(expected);
    }

    /// <summary>Y outputs follow the same octal rule as X inputs on F/iQ-F.</summary>
    [Theory]
    [InlineData("Y0", (ushort)0)]
    [InlineData("Y17", (ushort)15)]
    public void YOutputToCoil_FiQF_parses_octal(string y, ushort expected)
    {
        var coil = MelsecAddress.YOutputToCoil(y, MelsecFamily.F_iQF);
        coil.ShouldBe(expected);
    }

    [Fact]
    public void Same_address_string_decodes_differently_between_families()
    {
        // The headline quirk: in GX Works, "X20" means one thing on Q-series and another on
        // FX-series. The driver's family selector is the only defence.
        var onQSeries = MelsecAddress.XInputToDiscrete("X20", MelsecFamily.Q_L_iQR);
        onQSeries.ShouldBe((ushort)32);

        var onFxSeries = MelsecAddress.XInputToDiscrete("X20", MelsecFamily.F_iQF);
        onFxSeries.ShouldBe((ushort)16);
    }

    [Theory]
    [InlineData("X8")] // 8 is non-octal
    [InlineData("X12G")] // G is not an octal digit (nor a hex one)
    public void XInputToDiscrete_FiQF_rejects_non_octal(string bad)
    {
        Should.Throw<ArgumentException>(() => MelsecAddress.XInputToDiscrete(bad, MelsecFamily.F_iQF));
    }

    [Theory]
    [InlineData("X12G")]
    public void XInputToDiscrete_QLiQR_rejects_non_hex(string bad)
    {
        Should.Throw<ArgumentException>(() => MelsecAddress.XInputToDiscrete(bad, MelsecFamily.Q_L_iQR));
    }

    [Fact]
    public void XInputToDiscrete_honors_bank_base_from_assignment_block()
    {
        // Real-world QJ71MT91 assignment blocks commonly place X at DI 8192+ when other
        // ranges occupy the low Modbus addresses. The helper must add the base cleanly.
        const ushort expected = 8192 + 16;

        var discrete = MelsecAddress.XInputToDiscrete("X10", MelsecFamily.Q_L_iQR, xBankBase: 8192);

        discrete.ShouldBe(expected);
    }

    // --- M-relay (decimal, both families) ---

    [Theory]
    [InlineData("M0", (ushort)0)]
    [InlineData("M10", (ushort)10)] // M addresses are DECIMAL, not hex or octal
    [InlineData("M511", (ushort)511)]
    [InlineData("m99", (ushort)99)] // lowercase
    public void MRelayToCoil_parses_decimal(string m, ushort expected)
    {
        var coil = MelsecAddress.MRelayToCoil(m);
        coil.ShouldBe(expected);
    }

    [Fact]
    public void MRelayToCoil_honors_bank_base()
    {
        var coil = MelsecAddress.MRelayToCoil("M0", mBankBase: 512);
        coil.ShouldBe((ushort)512);
    }

    [Fact]
    public void MRelayToCoil_rejects_non_numeric()
    {
        Should.Throw<ArgumentException>(() => MelsecAddress.MRelayToCoil("M1F"));
    }

    // --- D-register (decimal, both families) ---

    [Theory]
    [InlineData("D0", (ushort)0)]
    [InlineData("D100", (ushort)100)]
    [InlineData("d1023", (ushort)1023)]
    public void DRegisterToHolding_parses_decimal(string d, ushort expected)
    {
        var holding = MelsecAddress.DRegisterToHolding(d);
        holding.ShouldBe(expected);
    }

    [Fact]
    public void DRegisterToHolding_honors_bank_base()
    {
        var holding = MelsecAddress.DRegisterToHolding("D10", dBankBase: 4096);
        holding.ShouldBe((ushort)4106);
    }

    [Fact]
    public void DRegisterToHolding_rejects_empty()
    {
        Should.Throw<ArgumentException>(() => MelsecAddress.DRegisterToHolding("D"));
    }

    // --- overflow ---

    [Fact]
    public void XInputToDiscrete_overflow_throws()
    {
        // 0xFFFF + base 1 = 0x10000, which no longer fits in a ushort.
        Should.Throw<OverflowException>(
            () => MelsecAddress.XInputToDiscrete("XFFFF", MelsecFamily.Q_L_iQR, xBankBase: 1));
    }
}
|
||||
@@ -220,6 +220,23 @@ public sealed class ModbusDriverTests
|
||||
builder.Variables.ShouldContain(v => v.BrowseName == "Run" && v.Info.DriverDataType == DriverDataType.Boolean);
|
||||
}
|
||||
|
||||
[Fact]
public async Task Discover_propagates_WriteIdempotent_from_tag_to_attribute_info()
{
    // One tag opts in to idempotent writes; the other leaves the flag unspecified.
    var (driver, _) = NewDriver(
        new ModbusTagDefinition("SetPoint", ModbusRegion.HoldingRegisters, 0, ModbusDataType.Float32, WriteIdempotent: true),
        new ModbusTagDefinition("PulseCoil", ModbusRegion.Coils, 0, ModbusDataType.Bool));
    await driver.InitializeAsync("{}", CancellationToken.None);

    var recorder = new RecordingBuilder();
    await driver.DiscoverAsync(recorder, CancellationToken.None);

    // Look both variables up before asserting on either of them.
    var optedIn = recorder.Variables.Single(v => v.BrowseName == "SetPoint");
    var leftAtDefault = recorder.Variables.Single(v => v.BrowseName == "PulseCoil");

    optedIn.Info.WriteIdempotent.ShouldBeTrue();
    leftAtDefault.Info.WriteIdempotent.ShouldBeFalse("default is opt-in per decision #44");
}
|
||||
|
||||
// --- helpers ---
|
||||
|
||||
private sealed class RecordingBuilder : IAddressSpaceBuilder
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
[Trait("Category", "Unit")]
public sealed class OpcUaClientAlarmTests
{
    /// <summary>
    /// Checks the severity bucket boundaries: 1–200 Low, 201–500 Medium, 501–800 High,
    /// 801–1000 Critical.
    /// </summary>
    [Theory]
    [InlineData((ushort)1, AlarmSeverity.Low)]
    [InlineData((ushort)200, AlarmSeverity.Low)]
    [InlineData((ushort)201, AlarmSeverity.Medium)]
    [InlineData((ushort)500, AlarmSeverity.Medium)]
    [InlineData((ushort)501, AlarmSeverity.High)]
    [InlineData((ushort)800, AlarmSeverity.High)]
    [InlineData((ushort)801, AlarmSeverity.Critical)]
    [InlineData((ushort)1000, AlarmSeverity.Critical)]
    public void MapSeverity_buckets_per_OPC_UA_Part_9_guidance(ushort opcSev, AlarmSeverity expected)
        => OpcUaClientDriver.MapSeverity(opcSev).ShouldBe(expected);

    [Fact]
    public void MapSeverity_zero_maps_to_Low()
    {
        // 0 lies outside OPC UA's 1-1000 range, but it is handled gracefully as Low.
        var mapped = OpcUaClientDriver.MapSeverity(0);

        mapped.ShouldBe(AlarmSeverity.Low);
    }

    [Fact]
    public async Task SubscribeAlarmsAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-alarm-uninit");

        Func<Task> subscribe = async () =>
            await driver.SubscribeAlarmsAsync([], TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(subscribe);
    }

    [Fact]
    public async Task UnsubscribeAlarmsAsync_with_unknown_handle_is_noop()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-alarm-unknown");
        var unknownHandle = new FakeAlarmHandle();

        // Mirrors the subscribe handle path: session-drop races shouldn't crash the caller.
        await driver.UnsubscribeAlarmsAsync(unknownHandle, TestContext.Current.CancellationToken);
    }

    [Fact]
    public async Task AcknowledgeAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-ack-uninit");

        Func<Task> acknowledge = async () =>
            await driver.AcknowledgeAsync(
                [new AlarmAcknowledgeRequest("ns=2;s=Src", "ns=2;s=Cond", "operator ack")],
                TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(acknowledge);
    }

    [Fact]
    public async Task AcknowledgeAsync_with_empty_batch_is_noop_even_without_init()
    {
        // An empty batch short-circuits before the session is touched, so it is safe
        // pre-init. Batch-ack callers therefore never need to guard the list size themselves.
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-ack-empty");

        await driver.AcknowledgeAsync([], TestContext.Current.CancellationToken);
    }

    /// <summary>Handle the driver never issued, used to exercise the unknown-handle path.</summary>
    private sealed class FakeAlarmHandle : IAlarmSubscriptionHandle
    {
        public string DiagnosticId
        {
            get { return "fake-alarm"; }
        }
    }
}
|
||||
@@ -0,0 +1,62 @@
|
||||
using Opc.Ua;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
[Trait("Category", "Unit")]
public sealed class OpcUaClientAttributeMappingTests
{
    /// <summary>Each standard built-in type id maps onto its same-named driver data type.</summary>
    [Theory]
    [InlineData((uint)DataTypes.Boolean, DriverDataType.Boolean)]
    [InlineData((uint)DataTypes.Int16, DriverDataType.Int16)]
    [InlineData((uint)DataTypes.UInt16, DriverDataType.UInt16)]
    [InlineData((uint)DataTypes.Int32, DriverDataType.Int32)]
    [InlineData((uint)DataTypes.UInt32, DriverDataType.UInt32)]
    [InlineData((uint)DataTypes.Int64, DriverDataType.Int64)]
    [InlineData((uint)DataTypes.UInt64, DriverDataType.UInt64)]
    [InlineData((uint)DataTypes.Float, DriverDataType.Float32)]
    [InlineData((uint)DataTypes.Double, DriverDataType.Float64)]
    [InlineData((uint)DataTypes.String, DriverDataType.String)]
    [InlineData((uint)DataTypes.DateTime, DriverDataType.DateTime)]
    public void MapUpstreamDataType_recognizes_standard_builtin_types(uint typeId, DriverDataType expected)
        => OpcUaClientDriver.MapUpstreamDataType(new NodeId(typeId)).ShouldBe(expected);

    [Fact]
    public void MapUpstreamDataType_maps_SByte_and_Byte_to_Int16_since_DriverDataType_lacks_8bit()
    {
        // DriverDataType has no 8-bit member, so both are conservatively widened to Int16.
        // Documented here so a future Core.Abstractions PR that adds Int8/Byte can find
        // this call site.
        var signedByteType = new NodeId((uint)DataTypes.SByte);
        var unsignedByteType = new NodeId((uint)DataTypes.Byte);

        OpcUaClientDriver.MapUpstreamDataType(signedByteType).ShouldBe(DriverDataType.Int16);
        OpcUaClientDriver.MapUpstreamDataType(unsignedByteType).ShouldBe(DriverDataType.Int16);
    }

    [Fact]
    public void MapUpstreamDataType_falls_back_to_String_for_unknown_custom_types()
    {
        // Custom vendor extension object: a NodeId in namespace 2 that isn't a standard type.
        var vendorType = new NodeId("CustomStruct", 2);

        var mapped = OpcUaClientDriver.MapUpstreamDataType(vendorType);

        mapped.ShouldBe(DriverDataType.String);
    }

    [Fact]
    public void MapUpstreamDataType_handles_UtcTime_as_DateTime()
    {
        var utcTimeType = new NodeId((uint)DataTypes.UtcTime);

        var mapped = OpcUaClientDriver.MapUpstreamDataType(utcTimeType);

        mapped.ShouldBe(DriverDataType.DateTime);
    }

    /// <summary>Only access levels with the CurrentWrite bit (0x02) set classify as Operate.</summary>
    [Theory]
    [InlineData((byte)0, SecurityClassification.ViewOnly)] // no access flags set
    [InlineData((byte)1, SecurityClassification.ViewOnly)] // CurrentRead only
    [InlineData((byte)2, SecurityClassification.Operate)] // CurrentWrite only
    [InlineData((byte)3, SecurityClassification.Operate)] // CurrentRead + CurrentWrite
    [InlineData((byte)0x0F, SecurityClassification.Operate)] // read+write+historyRead+historyWrite
    [InlineData((byte)0x04, SecurityClassification.ViewOnly)] // HistoryRead only — no Write bit
    public void MapAccessLevelToSecurityClass_respects_CurrentWrite_bit(byte accessLevel, SecurityClassification expected)
        => OpcUaClientDriver.MapAccessLevelToSecurityClass(accessLevel).ShouldBe(expected);
}
|
||||
@@ -0,0 +1,59 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Security.Cryptography.X509Certificates;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
[Trait("Category", "Unit")]
public sealed class OpcUaClientCertAuthTests
{
    /// <summary>
    /// Certificate auth with no <c>UserCertificatePath</c> set must fail with a message
    /// that names the missing option.
    /// </summary>
    [Fact]
    public void BuildCertificateIdentity_rejects_missing_path()
    {
        var opts = new OpcUaClientDriverOptions { AuthType = OpcUaAuthType.Certificate };

        Should.Throw<InvalidOperationException>(() => OpcUaClientDriver.BuildCertificateIdentity(opts))
            .Message.ShouldContain("UserCertificatePath");
    }

    /// <summary>A path to a file that does not exist must surface as <see cref="FileNotFoundException"/>.</summary>
    [Fact]
    public void BuildCertificateIdentity_rejects_nonexistent_file()
    {
        var opts = new OpcUaClientDriverOptions
        {
            AuthType = OpcUaAuthType.Certificate,
            // A fresh GUID in the name means the path cannot collide with a real file.
            UserCertificatePath = Path.Combine(Path.GetTempPath(), $"does-not-exist-{Guid.NewGuid():N}.pfx"),
        };

        Should.Throw<FileNotFoundException>(() => OpcUaClientDriver.BuildCertificateIdentity(opts));
    }

    /// <summary>
    /// A password-protected PFX containing a private key loads into a certificate-type
    /// user identity.
    /// </summary>
    [Fact]
    public void BuildCertificateIdentity_loads_a_valid_PFX_with_private_key()
    {
        // Generate a self-signed cert on the fly so the test doesn't ship a static PFX.
        // The driver doesn't care about the issuer — just needs a cert with a private key.
        using var rsa = RSA.Create(2048);
        var req = new CertificateRequest("CN=OpcUaClientCertAuthTests", rsa,
            HashAlgorithmName.SHA256, RSASignaturePadding.Pkcs1);

        // X509Certificate2 is IDisposable; dispose it once the PFX bytes have been exported.
        using var cert = req.CreateSelfSigned(DateTimeOffset.UtcNow.AddMinutes(-5), DateTimeOffset.UtcNow.AddHours(1));

        var tmpPath = Path.Combine(Path.GetTempPath(), $"opcua-cert-test-{Guid.NewGuid():N}.pfx");
        File.WriteAllBytes(tmpPath, cert.Export(X509ContentType.Pfx, "testpw"));

        try
        {
            var opts = new OpcUaClientDriverOptions
            {
                AuthType = OpcUaAuthType.Certificate,
                UserCertificatePath = tmpPath,
                UserCertificatePassword = "testpw",
            };

            var identity = OpcUaClientDriver.BuildCertificateIdentity(opts);

            identity.ShouldNotBeNull();
            identity.TokenType.ShouldBe(Opc.Ua.UserTokenType.Certificate);
        }
        finally
        {
            // Cleanup is deliberately best-effort: a leftover temp file must not fail the test.
            try { File.Delete(tmpPath); } catch { /* best-effort */ }
        }
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Scaffold tests for <see cref="OpcUaClientDriver"/>'s <see cref="ITagDiscovery"/>
/// surface that don't require a live remote server. Live-browse coverage lands in a
/// follow-up PR once the in-process OPC UA server fixture is scaffolded.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientDiscoveryTests
{
    /// <summary>Discovery on a driver that was never initialized must throw.</summary>
    [Fact]
    public async Task DiscoverAsync_without_initialize_throws_InvalidOperationException()
    {
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-disco");
        var builder = new NullAddressSpaceBuilder();

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await drv.DiscoverAsync(builder, TestContext.Current.CancellationToken));
    }

    /// <summary>A null builder must be rejected with <see cref="ArgumentNullException"/>.</summary>
    [Fact]
    public async Task DiscoverAsync_rejects_null_builder()
    {
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-disco");

        // Should.ThrowAsync returns a Task. It has to be awaited, otherwise the assertion
        // is never observed and the driver is disposed while the call may still be running.
        await Should.ThrowAsync<ArgumentNullException>(async () =>
            await drv.DiscoverAsync(null!, TestContext.Current.CancellationToken));
    }

    /// <summary>Pins the default discovery limits and the default browse root.</summary>
    [Fact]
    public void Discovery_caps_are_sensible_defaults()
    {
        var opts = new OpcUaClientDriverOptions();

        opts.MaxDiscoveredNodes.ShouldBe(10_000, "bounds memory on runaway servers without clipping normal models");
        opts.MaxBrowseDepth.ShouldBe(10, "deep enough for realistic info models; shallow enough for cycle safety");
        opts.BrowseRoot.ShouldBeNull("null = default to ObjectsFolder i=85");
    }

    /// <summary>Builder that accepts every call and records nothing.</summary>
    private sealed class NullAddressSpaceBuilder : IAddressSpaceBuilder
    {
        public IAddressSpaceBuilder Folder(string browseName, string displayName) => this;

        public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
            => new StubHandle();

        public void AddProperty(string browseName, DriverDataType dataType, object? value) { }

        public void AttachAlarmCondition(IVariableHandle sourceVariable, string alarmName, DriverAttributeInfo alarmInfo) { }

        /// <summary>Variable handle with a fixed reference; alarm marking is not supported.</summary>
        private sealed class StubHandle : IVariableHandle
        {
            public string FullReference => "stub";

            public IAlarmConditionSink MarkAsAlarmCondition(AlarmConditionInfo info) => throw new NotSupportedException();
        }
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Scaffold-level tests for <see cref="OpcUaClientDriver"/> that don't require a live
/// remote OPC UA server. PR 67+ adds IReadable/IWritable/ITagDiscovery/ISubscribable
/// tests against a local in-process OPC UA server fixture.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientDriverScaffoldTests
{
    /// <summary>Pins the default endpoint, security and auth options.</summary>
    [Fact]
    public void Default_options_target_standard_opcua_port_and_anonymous_auth()
    {
        var opts = new OpcUaClientDriverOptions();

        opts.EndpointUrl.ShouldBe("opc.tcp://localhost:4840", "4840 is the IANA-assigned OPC UA port");
        opts.SecurityMode.ShouldBe(OpcUaSecurityMode.None);
        opts.SecurityPolicy.ShouldBe(OpcUaSecurityPolicy.None);
        opts.AuthType.ShouldBe(OpcUaAuthType.Anonymous);
        opts.AutoAcceptCertificates.ShouldBeFalse("production default must reject untrusted server certs");
    }

    /// <summary>Pins the default session timeout, keep-alive interval and reconnect period.</summary>
    [Fact]
    public void Default_timeouts_match_driver_specs_section_8()
    {
        var opts = new OpcUaClientDriverOptions();

        opts.SessionTimeout.ShouldBe(TimeSpan.FromSeconds(120));
        opts.KeepAliveInterval.ShouldBe(TimeSpan.FromSeconds(5));
        opts.ReconnectPeriod.ShouldBe(TimeSpan.FromSeconds(5));
    }

    /// <summary>Before any connect attempt the driver exposes its type, id and an Unknown health state.</summary>
    [Fact]
    public void Driver_reports_type_and_id_before_connect()
    {
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-test");

        drv.DriverType.ShouldBe("OpcUaClient");
        drv.DriverInstanceId.ShouldBe("opcua-test");
        drv.GetHealth().State.ShouldBe(DriverState.Unknown);
    }

    /// <summary>
    /// A failed initialize must both throw and leave the driver Faulted with an error recorded.
    /// </summary>
    [Fact]
    public async Task Initialize_against_unreachable_endpoint_transitions_to_Faulted_and_throws()
    {
        // Pick opc.tcp:// so endpoint selection hits the transport-layer connection rather
        // than a DNS lookup. The target is a closed loopback port, not an unroutable
        // address; see the note on EndpointUrl below for why.
        var opts = new OpcUaClientDriverOptions
        {
            // Port 1 on loopback is effectively guaranteed to be closed — the OS responds
            // with TCP RST immediately instead of hanging on connect, which keeps the
            // unreachable-host tests snappy. Don't use an RFC 5737 reserved IP; those get
            // routed to a black-hole + time out only after the SDK's internal retry/backoff
            // fully elapses (~60s even with Options.Timeout=500ms).
            EndpointUrl = "opc.tcp://127.0.0.1:1",
            Timeout = TimeSpan.FromMilliseconds(500),
            AutoAcceptCertificates = true, // dev-mode to bypass cert validation in the test
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-unreach");

        await Should.ThrowAsync<Exception>(async () =>
            await drv.InitializeAsync("{}", TestContext.Current.CancellationToken));

        var health = drv.GetHealth();
        health.State.ShouldBe(DriverState.Faulted);
        health.LastError.ShouldNotBeNull();
    }

    /// <summary>
    /// After a failed initialize, reinitializing against the same dead endpoint throws again.
    /// </summary>
    [Fact]
    public async Task Reinitialize_against_unreachable_endpoint_re_throws()
    {
        var opts = new OpcUaClientDriverOptions
        {
            // Port 1 on loopback is effectively guaranteed to be closed — the OS responds
            // with TCP RST immediately instead of hanging on connect, which keeps the
            // unreachable-host tests snappy. Don't use an RFC 5737 reserved IP; those get
            // routed to a black-hole + time out only after the SDK's internal retry/backoff
            // fully elapses (~60s even with Options.Timeout=500ms).
            EndpointUrl = "opc.tcp://127.0.0.1:1",
            Timeout = TimeSpan.FromMilliseconds(500),
            AutoAcceptCertificates = true,
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-reinit");

        await Should.ThrowAsync<Exception>(async () =>
            await drv.InitializeAsync("{}", TestContext.Current.CancellationToken));
        await Should.ThrowAsync<Exception>(async () =>
            await drv.ReinitializeAsync("{}", TestContext.Current.CancellationToken));
    }
}
|
||||
@@ -0,0 +1,81 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
[Trait("Category", "Unit")]
public sealed class OpcUaClientFailoverTests
{
    /// <summary>
    /// When <c>EndpointUrls</c> is populated it wins over <c>EndpointUrl</c>, and its order
    /// is preserved.
    /// </summary>
    [Fact]
    public void ResolveEndpointCandidates_prefers_EndpointUrls_when_provided()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = "opc.tcp://fallback:4840",
            EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
        };

        var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);

        list.Count.ShouldBe(2);
        list[0].ShouldBe("opc.tcp://primary:4840");
        list[1].ShouldBe("opc.tcp://backup:4841");
    }

    /// <summary>With <c>EndpointUrls</c> left unset, the single <c>EndpointUrl</c> is the only candidate.</summary>
    [Fact]
    public void ResolveEndpointCandidates_falls_back_to_single_EndpointUrl_when_list_empty()
    {
        var opts = new OpcUaClientDriverOptions { EndpointUrl = "opc.tcp://only:4840" };

        var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);

        list.Count.ShouldBe(1);
        list[0].ShouldBe("opc.tcp://only:4840");
    }

    /// <summary>An explicitly empty <c>EndpointUrls</c> behaves like an unset one.</summary>
    [Fact]
    public void ResolveEndpointCandidates_empty_list_treated_as_fallback_to_EndpointUrl()
    {
        // Explicit empty list should still fall back to the single-URL shortcut rather than
        // producing a zero-candidate sweep that would immediately throw with no URLs tried.
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = "opc.tcp://single:4840",
            EndpointUrls = [],
        };

        var list = OpcUaClientDriver.ResolveEndpointCandidates(opts);

        list.Count.ShouldBe(1);
        // The count alone does not prove the fallback; check that the candidate is the EndpointUrl.
        list[0].ShouldBe("opc.tcp://single:4840");
    }

    /// <summary>Before any connect, <c>HostName</c> reports the first configured candidate.</summary>
    [Fact]
    public void HostName_uses_first_candidate_before_connect()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrls = ["opc.tcp://primary:4840", "opc.tcp://backup:4841"],
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-host");

        drv.HostName.ShouldBe("opc.tcp://primary:4840",
            "pre-connect the dashboard should show the first candidate URL so operators can link back");
    }

    /// <summary>
    /// When every candidate is unreachable, initialize throws an <see cref="AggregateException"/>
    /// whose message names each URL, and the driver ends up Faulted.
    /// </summary>
    [Fact]
    public async Task Initialize_against_all_unreachable_endpoints_throws_AggregateException_listing_each()
    {
        // Port 1 + port 2 + port 3 on loopback are all guaranteed closed (TCP RST immediate).
        // Failover sweep should attempt all three and throw AggregateException naming each URL
        // so operators see exactly which candidates were tried.
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrls = ["opc.tcp://127.0.0.1:1", "opc.tcp://127.0.0.1:2", "opc.tcp://127.0.0.1:3"],
            PerEndpointConnectTimeout = TimeSpan.FromMilliseconds(500),
            Timeout = TimeSpan.FromMilliseconds(500),
            AutoAcceptCertificates = true,
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-failover");

        var ex = await Should.ThrowAsync<AggregateException>(async () =>
            await drv.InitializeAsync("{}", TestContext.Current.CancellationToken));

        ex.Message.ShouldContain("127.0.0.1:1");
        ex.Message.ShouldContain("127.0.0.1:2");
        ex.Message.ShouldContain("127.0.0.1:3");
        drv.GetHealth().State.ShouldBe(DriverState.Faulted);
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using Opc.Ua;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Unit tests for the history-read surface of <see cref="OpcUaClientDriver"/>: aggregate
/// mapping plus the not-initialized and not-supported error paths. No live server is used.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientHistoryTests
{
    /// <summary>Every listed aggregate maps to a non-null NodeId in namespace 0.</summary>
    [Theory]
    [InlineData(HistoryAggregateType.Average)]
    [InlineData(HistoryAggregateType.Minimum)]
    [InlineData(HistoryAggregateType.Maximum)]
    [InlineData(HistoryAggregateType.Total)]
    [InlineData(HistoryAggregateType.Count)]
    public void MapAggregateToNodeId_returns_standard_Part13_aggregate_for_every_enum(HistoryAggregateType agg)
    {
        var nodeId = OpcUaClientDriver.MapAggregateToNodeId(agg);

        NodeId.IsNull(nodeId).ShouldBeFalse();
        // Every mapping should resolve to an AggregateFunction_* NodeId (namespace 0, numeric id).
        nodeId.NamespaceIndex.ShouldBe((ushort)0);
    }

    /// <summary>An undefined enum value is rejected instead of being mapped silently.</summary>
    [Fact]
    public void MapAggregateToNodeId_rejects_invalid_enum_value()
    {
        // Defense-in-depth: a future HistoryAggregateType addition mustn't silently fall through.
        Should.Throw<ArgumentOutOfRangeException>(() =>
            OpcUaClientDriver.MapAggregateToNodeId((HistoryAggregateType)99));
    }

    /// <summary>A raw history read on a never-initialized driver throws.</summary>
    [Fact]
    public async Task ReadRawAsync_without_initialize_throws_InvalidOperationException()
    {
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-hist-uninit");

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await drv.ReadRawAsync("ns=2;s=Counter",
                DateTime.UtcNow.AddMinutes(-5), DateTime.UtcNow, 1000,
                TestContext.Current.CancellationToken));
    }

    /// <summary>
    /// Placeholder for the malformed-NodeId path of a raw history read. The intended
    /// behaviour mirrors ReadAsync / WriteAsync: a malformed NodeId short-circuits to an
    /// empty result rather than crashing a batch history call.
    /// </summary>
    /// <remarks>
    /// Reaching that branch requires an initialized session, which is not available without
    /// the in-process server fixture. The test is skipped rather than left as an empty body
    /// so the test report does not claim coverage that does not exist.
    /// </remarks>
    [Fact(Skip = "Requires an initialized session; enable once the in-process OPC UA server fixture lands.")]
    public Task ReadRawAsync_with_malformed_NodeId_returns_empty_result_not_throw()
        => Task.CompletedTask;

    /// <summary>A processed (aggregate) history read on a never-initialized driver throws.</summary>
    [Fact]
    public async Task ReadProcessedAsync_without_initialize_throws_InvalidOperationException()
    {
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-hist-uninit");

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await drv.ReadProcessedAsync("ns=2;s=Counter",
                DateTime.UtcNow.AddMinutes(-5), DateTime.UtcNow,
                TimeSpan.FromSeconds(10), HistoryAggregateType.Average,
                TestContext.Current.CancellationToken));
    }

    /// <summary>An at-time history read on a never-initialized driver throws.</summary>
    [Fact]
    public async Task ReadAtTimeAsync_without_initialize_throws_InvalidOperationException()
    {
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-hist-uninit");

        await Should.ThrowAsync<InvalidOperationException>(async () =>
            await drv.ReadAtTimeAsync("ns=2;s=Counter",
                [DateTime.UtcNow.AddMinutes(-5), DateTime.UtcNow],
                TestContext.Current.CancellationToken));
    }

    /// <summary>Event history, called through <see cref="IHistoryProvider"/>, is not supported.</summary>
    [Fact]
    public async Task ReadEventsAsync_throws_NotSupportedException_as_documented()
    {
        // The IHistoryProvider default implementation throws; the OPC UA Client driver
        // deliberately inherits that default (see PR 76 commit body) because the OPC UA
        // client call path needs an EventFilter SelectClauses spec the interface doesn't carry.
        using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-events-default");

        await Should.ThrowAsync<NotSupportedException>(async () =>
            await ((IHistoryProvider)drv).ReadEventsAsync(
                sourceName: null,
                startUtc: DateTime.UtcNow.AddMinutes(-5),
                endUtc: DateTime.UtcNow,
                maxEvents: 100,
                cancellationToken: TestContext.Current.CancellationToken));
    }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Exercises the read and write entry points of <see cref="OpcUaClientDriver"/> on paths
/// that need no remote OPC UA server. Over-the-wire round-trips are deferred to a later PR
/// that introduces a local in-process server fixture.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientReadWriteTests
{
    /// <summary>Reading through a driver that was never initialized must throw.</summary>
    [Fact]
    public async Task ReadAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-uninit");

        Func<Task> read = async () =>
            await driver.ReadAsync(["ns=2;s=Demo"], TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(read);
    }

    /// <summary>Writing through a driver that was never initialized must throw.</summary>
    [Fact]
    public async Task WriteAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-uninit");

        Func<Task> write = async () =>
            await driver.WriteAsync(
                [new WriteRequest("ns=2;s=Demo", 42)],
                TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(write);
    }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Scaffold-level checks around the <c>SessionReconnectHandler</c> wiring. Coverage of a
/// real disconnect / reconnect / resume cycle against an upstream server arrives with the
/// in-process fixture, since it needs too much machinery for a unit-test-only lane.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientReconnectTests
{
    /// <summary>The reconnect period defaults to five seconds.</summary>
    [Fact]
    public void Default_ReconnectPeriod_matches_driver_specs_5_seconds()
    {
        var defaults = new OpcUaClientDriverOptions();

        defaults.ReconnectPeriod.ShouldBe(TimeSpan.FromSeconds(5));
    }

    /// <summary>A reconnect period set through the initializer is read back unchanged.</summary>
    [Fact]
    public void Options_ReconnectPeriod_is_configurable_for_aggressive_or_relaxed_retry()
    {
        var halfSecond = new OpcUaClientDriverOptions { ReconnectPeriod = TimeSpan.FromMilliseconds(500) };

        halfSecond.ReconnectPeriod.ShouldBe(TimeSpan.FromMilliseconds(500));
    }

    /// <summary>A freshly constructed driver reports the Unknown health state.</summary>
    [Fact]
    public void Driver_starts_with_no_reconnect_handler_active_pre_init()
    {
        // The reconnect handler is created lazily, only once a bad keep-alive fires. Before
        // init there is no session to reconnect, so the handler field must still be null;
        // that is checked only indirectly here, with the lifecycle-shape suite expected to
        // catch any accidental construction.
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-reconnect");

        var health = driver.GetHealth();

        health.State.ShouldBe(Core.Abstractions.DriverState.Unknown);
    }
}
|
||||
@@ -0,0 +1,54 @@
|
||||
using Opc.Ua;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Verifies the mapping from <see cref="OpcUaSecurityPolicy"/> values to security-policy
/// URI strings done by <see cref="OpcUaClientDriver.MapSecurityPolicy"/>.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientSecurityPolicyTests
{
    /// <summary>Each listed policy maps to a non-empty URI containing the enum name (except None).</summary>
    [Theory]
    [InlineData(OpcUaSecurityPolicy.None)]
    [InlineData(OpcUaSecurityPolicy.Basic128Rsa15)]
    [InlineData(OpcUaSecurityPolicy.Basic256)]
    [InlineData(OpcUaSecurityPolicy.Basic256Sha256)]
    [InlineData(OpcUaSecurityPolicy.Aes128_Sha256_RsaOaep)]
    [InlineData(OpcUaSecurityPolicy.Aes256_Sha256_RsaPss)]
    public void MapSecurityPolicy_returns_known_non_empty_uri_for_every_enum_value(OpcUaSecurityPolicy policy)
    {
        var mapped = OpcUaClientDriver.MapSecurityPolicy(policy);

        mapped.ShouldNotBeNullOrEmpty();

        // The None policy is exempt from the name check below.
        if (policy == OpcUaSecurityPolicy.None)
        {
            return;
        }

        // For the remaining policies the URI should carry the enum name, so that someone
        // reading driver logs can tie the URI back to the configured value.
        mapped.ShouldContain(policy.ToString());
    }

    /// <summary>The None policy maps to the SDK's None URI.</summary>
    [Fact]
    public void MapSecurityPolicy_None_matches_SDK_None_URI()
    {
        var mapped = OpcUaClientDriver.MapSecurityPolicy(OpcUaSecurityPolicy.None);

        mapped.ShouldBe(SecurityPolicies.None);
    }

    /// <summary>Basic256Sha256 maps to the SDK constant of the same name.</summary>
    [Fact]
    public void MapSecurityPolicy_Basic256Sha256_matches_SDK_URI()
    {
        var mapped = OpcUaClientDriver.MapSecurityPolicy(OpcUaSecurityPolicy.Basic256Sha256);

        mapped.ShouldBe(SecurityPolicies.Basic256Sha256);
    }

    /// <summary>Aes256_Sha256_RsaPss maps to the SDK constant of the same name.</summary>
    [Fact]
    public void MapSecurityPolicy_Aes256_Sha256_RsaPss_matches_SDK_URI()
    {
        var mapped = OpcUaClientDriver.MapSecurityPolicy(OpcUaSecurityPolicy.Aes256_Sha256_RsaPss);

        mapped.ShouldBe(SecurityPolicies.Aes256_Sha256_RsaPss);
    }

    /// <summary>Mapping must not throw for any value defined on the enum.</summary>
    [Fact]
    public void Every_enum_value_has_a_mapping()
    {
        foreach (OpcUaSecurityPolicy candidate in Enum.GetValues<OpcUaSecurityPolicy>())
        {
            Should.NotThrow(() => OpcUaClientDriver.MapSecurityPolicy(candidate));
        }
    }
}
|
||||
@@ -0,0 +1,50 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
/// Scaffold checks for the <c>ISubscribable</c> and <c>IHostConnectivityProbe</c> surfaces
/// that work without a remote server. Tests needing a live session (subscribe/unsubscribe
/// round-trips, keep-alive transitions) follow in a later PR, once the in-process OPC UA
/// server fixture exists.
/// </summary>
[Trait("Category", "Unit")]
public sealed class OpcUaClientSubscribeAndProbeTests
{
    /// <summary>Subscribing on a driver that was never initialized must throw.</summary>
    [Fact]
    public async Task SubscribeAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-sub-uninit");

        Func<Task> subscribe = async () =>
            await driver.SubscribeAsync(
                ["ns=2;s=Demo"],
                TimeSpan.FromMilliseconds(100),
                TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(subscribe);
    }

    /// <summary>Unsubscribing a handle the driver never issued completes without throwing.</summary>
    [Fact]
    public async Task UnsubscribeAsync_with_unknown_handle_is_noop()
    {
        using var driver = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-sub-unknown");
        var stranger = new FakeHandle();

        // Unknown handles must be tolerated: a caller can race with server-side cleanup
        // that follows a session drop.
        await driver.UnsubscribeAsync(stranger, TestContext.Current.CancellationToken);
    }

    /// <summary>Before init there is exactly one host row, named after the endpoint URL, in state Unknown.</summary>
    [Fact]
    public void GetHostStatuses_returns_endpoint_url_row_pre_init()
    {
        var options = new OpcUaClientDriverOptions { EndpointUrl = "opc.tcp://plc.example:4840" };
        using var driver = new OpcUaClientDriver(options, "opcua-hosts");

        var statuses = driver.GetHostStatuses();

        statuses.Count.ShouldBe(1);
        statuses[0].HostName.ShouldBe("opc.tcp://plc.example:4840",
            "host identity mirrors the endpoint URL so the Admin /hosts dashboard can link back to the remote server");
        statuses[0].State.ShouldBe(HostState.Unknown);
    }

    /// <summary>Minimal subscription handle that the driver under test never handed out.</summary>
    private sealed class FakeHandle : ISubscriptionHandle
    {
        public string DiagnosticId { get; } = "fake";
    }
}
|
||||
@@ -0,0 +1,31 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<IsPackable>false</IsPackable>
|
||||
<IsTestProject>true</IsTestProject>
|
||||
<RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests</RootNamespace>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="xunit.v3" Version="1.1.0"/>
|
||||
<PackageReference Include="Shouldly" Version="4.3.0"/>
|
||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0"/>
|
||||
<PackageReference Include="xunit.runner.visualstudio" Version="3.0.2">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||
</PackageReference>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient\ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<NuGetAuditSuppress Include="https://github.com/advisories/GHSA-37gx-xxp4-5rgx"/>
|
||||
<NuGetAuditSuppress Include="https://github.com/advisories/GHSA-w3x6-4m5h-cxqf"/>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
119
tests/ZB.MOM.WW.OtOpcUa.Driver.S7.Tests/S7AddressParserTests.cs
Normal file
119
tests/ZB.MOM.WW.OtOpcUa.Driver.S7.Tests/S7AddressParserTests.cs
Normal file
@@ -0,0 +1,119 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7.Tests;
|
||||
|
||||
/// <summary>
/// Table-driven tests for <see cref="S7AddressParser"/>: data-block, M/I/Q, timer and
/// counter addresses, rejection of malformed input, and the <c>TryParse</c> wrapper.
/// </summary>
[Trait("Category", "Unit")]
public sealed class S7AddressParserTests
{
    // --- Data blocks ---

    /// <summary>Data-block addresses yield the expected DB number, size, byte offset and bit offset.</summary>
    [Theory]
    [InlineData("DB1.DBX0.0", 1, S7Size.Bit, 0, 0)]
    [InlineData("DB1.DBX0.7", 1, S7Size.Bit, 0, 7)]
    [InlineData("DB1.DBB0", 1, S7Size.Byte, 0, 0)]
    [InlineData("DB1.DBW0", 1, S7Size.Word, 0, 0)]
    [InlineData("DB1.DBD4", 1, S7Size.DWord, 4, 0)]
    [InlineData("DB10.DBW100", 10, S7Size.Word, 100, 0)]
    [InlineData("DB1.DBX15.3", 1, S7Size.Bit, 15, 3)]
    public void Parse_data_block_addresses(string input, int db, S7Size size, int byteOff, int bitOff)
    {
        var parsed = S7AddressParser.Parse(input);

        parsed.Area.ShouldBe(S7Area.DataBlock);
        parsed.DbNumber.ShouldBe(db);
        parsed.Size.ShouldBe(size);
        parsed.ByteOffset.ShouldBe(byteOff);
        parsed.BitOffset.ShouldBe(bitOff);
    }

    /// <summary>Lower-case input and surrounding whitespace are accepted.</summary>
    [Theory]
    [InlineData("db1.dbw0", 1, S7Size.Word, 0)]
    [InlineData(" DB1.DBW0 ", 1, S7Size.Word, 0)] // trim whitespace
    public void Parse_is_case_insensitive_and_trims(string input, int db, S7Size size, int off)
    {
        var parsed = S7AddressParser.Parse(input);

        parsed.Area.ShouldBe(S7Area.DataBlock);
        parsed.DbNumber.ShouldBe(db);
        parsed.Size.ShouldBe(size);
        parsed.ByteOffset.ShouldBe(off);
    }

    // --- M / I / Q ---

    /// <summary>Memory, input and output addresses parse with a DB number of zero.</summary>
    [Theory]
    [InlineData("MB0", S7Area.Memory, S7Size.Byte, 0, 0)]
    [InlineData("MW10", S7Area.Memory, S7Size.Word, 10, 0)]
    [InlineData("MD4", S7Area.Memory, S7Size.DWord, 4, 0)]
    [InlineData("M0.0", S7Area.Memory, S7Size.Bit, 0, 0)]
    [InlineData("M255.7", S7Area.Memory, S7Size.Bit, 255, 7)]
    [InlineData("IB0", S7Area.Input, S7Size.Byte, 0, 0)]
    [InlineData("IW0", S7Area.Input, S7Size.Word, 0, 0)]
    [InlineData("I0.0", S7Area.Input, S7Size.Bit, 0, 0)]
    [InlineData("QB0", S7Area.Output, S7Size.Byte, 0, 0)]
    [InlineData("QW0", S7Area.Output, S7Size.Word, 0, 0)]
    [InlineData("Q0.0", S7Area.Output, S7Size.Bit, 0, 0)]
    [InlineData("QD4", S7Area.Output, S7Size.DWord, 4, 0)]
    public void Parse_MIQ_addresses(string input, S7Area area, S7Size size, int byteOff, int bitOff)
    {
        var parsed = S7AddressParser.Parse(input);

        parsed.Area.ShouldBe(area);
        parsed.DbNumber.ShouldBe(0);
        parsed.Size.ShouldBe(size);
        parsed.ByteOffset.ShouldBe(byteOff);
        parsed.BitOffset.ShouldBe(bitOff);
    }

    // --- Timers / counters ---

    /// <summary>Timer and counter numbers surface in <c>ByteOffset</c>, with Word size.</summary>
    [Theory]
    [InlineData("T0", S7Area.Timer, 0)]
    [InlineData("T15", S7Area.Timer, 15)]
    [InlineData("C0", S7Area.Counter, 0)]
    [InlineData("C10", S7Area.Counter, 10)]
    public void Parse_timer_and_counter(string input, S7Area area, int number)
    {
        var parsed = S7AddressParser.Parse(input);

        parsed.Area.ShouldBe(area);
        parsed.ByteOffset.ShouldBe(number);
        parsed.Size.ShouldBe(S7Size.Word, "timers + counters are 16-bit opaque");
    }

    // --- Reject garbage ---

    /// <summary>Each malformed address makes <c>Parse</c> throw <see cref="FormatException"/>.</summary>
    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    [InlineData("Z0")] // unknown area
    [InlineData("DB")] // no number or tail
    [InlineData("DB1")] // no tail
    [InlineData("DB1.")] // empty tail
    [InlineData("DB1.DBX0")] // bit size without .bit
    [InlineData("DB1.DBX0.8")] // bit 8 out of range
    [InlineData("DB1.DBW0.0")] // word with bit suffix
    [InlineData("DB0.DBW0")] // db 0 invalid
    [InlineData("DBA.DBW0")] // non-numeric db
    [InlineData("DB1.DBQ0")] // invalid size letter
    [InlineData("M")] // no offset
    [InlineData("M0")] // bit access needs .bit
    [InlineData("M0.8")] // bit 8
    [InlineData("MB-1")] // negative offset
    [InlineData("MW")] // no offset digits
    [InlineData("TA")] // non-numeric timer
    public void Parse_rejects_invalid(string bad)
    {
        Should.Throw<FormatException>(() => S7AddressParser.Parse(bad));
    }

    /// <summary><c>TryParse</c> reports failure with a default result instead of throwing.</summary>
    [Fact]
    public void TryParse_returns_false_for_garbage_without_throwing()
    {
        var succeeded = S7AddressParser.TryParse("not-an-address", out var parsed);

        succeeded.ShouldBeFalse();
        parsed.ShouldBe(default);
    }

    /// <summary><c>TryParse</c> reports success and fills the result for a valid address.</summary>
    [Fact]
    public void TryParse_returns_true_for_valid_address()
    {
        var succeeded = S7AddressParser.TryParse("DB1.DBW0", out var parsed);

        succeeded.ShouldBeTrue();
        parsed.DbNumber.ShouldBe(1);
        parsed.Size.ShouldBe(S7Size.Word);
    }
}
|
||||
@@ -0,0 +1,138 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7.Tests;
|
||||
|
||||
/// <summary>
/// Shape-level tests for the <see cref="ITagDiscovery"/>, <see cref="ISubscribable"/> and
/// <see cref="IHostConnectivityProbe"/> surfaces of <see cref="S7Driver"/> that run without
/// a live PLC. Polling round-trips over the wire and probe state transitions are deferred
/// to a later PR that brings a mock S7 server.
/// </summary>
[Trait("Category", "Unit")]
public sealed class S7DiscoveryAndSubscribeTests
{
    /// <summary>
    /// <see cref="IAddressSpaceBuilder"/> double that records the folder names and the
    /// variables (browse name plus attribute info) handed to it.
    /// </summary>
    private sealed class RecordingAddressSpaceBuilder : IAddressSpaceBuilder
    {
        public readonly List<string> Folders = [];
        public readonly List<(string Name, DriverAttributeInfo Attr)> Variables = [];

        /// <summary>Records the browse name and returns this same builder.</summary>
        public IAddressSpaceBuilder Folder(string browseName, string displayName)
        {
            Folders.Add(browseName);
            return this;
        }

        /// <summary>Records the variable and returns a stub handle.</summary>
        public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
        {
            Variables.Add((browseName, attributeInfo));
            return new StubHandle();
        }

        /// <summary>Intentionally ignored by this recorder.</summary>
        public void AddProperty(string browseName, DriverDataType dataType, object? value)
        {
        }

        /// <summary>Intentionally ignored by this recorder.</summary>
        public void AttachAlarmCondition(IVariableHandle sourceVariable, string alarmName, DriverAttributeInfo alarmInfo)
        {
        }

        /// <summary>Handle returned for every recorded variable; alarm marking is unsupported.</summary>
        private sealed class StubHandle : IVariableHandle
        {
            public string FullReference { get; } = "stub";

            public IAlarmConditionSink MarkAsAlarmCondition(AlarmConditionInfo info)
            {
                throw new NotImplementedException("S7 driver never calls this — no alarm surfacing");
            }
        }
    }

    /// <summary>Every configured tag shows up as a variable, carrying its security class and data type.</summary>
    [Fact]
    public async Task DiscoverAsync_projects_every_tag_into_the_address_space()
    {
        var options = new S7DriverOptions
        {
            Host = "192.0.2.1",
            Tags =
            [
                new("TempSetpoint", "DB1.DBW0", S7DataType.Int16, Writable: true),
                new("FaultBit", "M0.0", S7DataType.Bool, Writable: false),
                new("PIDOutput", "DB5.DBD12", S7DataType.Float32, Writable: true),
            ],
        };
        using var driver = new S7Driver(options, "s7-disco");
        var recorder = new RecordingAddressSpaceBuilder();

        await driver.DiscoverAsync(recorder, TestContext.Current.CancellationToken);

        recorder.Folders.ShouldContain("S7");
        recorder.Variables.Count.ShouldBe(3);
        recorder.Variables[0].Name.ShouldBe("TempSetpoint");
        recorder.Variables[0].Attr.SecurityClass.ShouldBe(SecurityClassification.Operate, "writable tags get Operate security class");
        recorder.Variables[1].Attr.SecurityClass.ShouldBe(SecurityClassification.ViewOnly, "read-only tags get ViewOnly");
        recorder.Variables[2].Attr.DriverDataType.ShouldBe(DriverDataType.Float32);
    }

    /// <summary>The tag's <c>WriteIdempotent</c> flag is carried onto the discovered attribute info.</summary>
    [Fact]
    public async Task DiscoverAsync_propagates_WriteIdempotent_from_tag_to_attribute_info()
    {
        var options = new S7DriverOptions
        {
            Host = "192.0.2.1",
            Tags =
            [
                new("SetPoint", "DB1.DBW0", S7DataType.Int16, WriteIdempotent: true),
                new("StartBit", "M0.0", S7DataType.Bool),
            ],
        };
        using var driver = new S7Driver(options, "s7-idem");
        var recorder = new RecordingAddressSpaceBuilder();

        await driver.DiscoverAsync(recorder, TestContext.Current.CancellationToken);

        var setPoint = recorder.Variables.Single(entry => entry.Name == "SetPoint");
        setPoint.Attr.WriteIdempotent.ShouldBeTrue();

        var startBit = recorder.Variables.Single(entry => entry.Name == "StartBit");
        startBit.Attr.WriteIdempotent.ShouldBeFalse("default is opt-in per decision #44");
    }

    /// <summary>Before init there is one host row, identified as <c>host:port</c>, in state Unknown.</summary>
    [Fact]
    public void GetHostStatuses_returns_one_row_with_host_port_identity_pre_init()
    {
        var options = new S7DriverOptions { Host = "plc1.internal", Port = 102 };
        using var driver = new S7Driver(options, "s7-host");

        var statuses = driver.GetHostStatuses();

        statuses.Count.ShouldBe(1);
        statuses[0].HostName.ShouldBe("plc1.internal:102");
        statuses[0].State.ShouldBe(HostState.Unknown, "pre-init / pre-probe state is Unknown");
    }

    /// <summary>Two subscriptions get distinct diagnostic ids, and unsubscribing tolerates repeats.</summary>
    [Fact]
    public async Task SubscribeAsync_returns_unique_handles_and_UnsubscribeAsync_accepts_them()
    {
        var options = new S7DriverOptions { Host = "192.0.2.1" };
        using var driver = new S7Driver(options, "s7-sub");
        var pollEvery = TimeSpan.FromMilliseconds(200);

        // Subscribing works on an uninitialized driver because SubscribeAsync itself never
        // calls ReadAsync; only the poll task does. That task catches the resulting
        // InvalidOperationException and keeps looping, the same way the Modbus driver's poll
        // loop rides out transient transport failures.
        var first = await driver.SubscribeAsync(["T1"], pollEvery, TestContext.Current.CancellationToken);
        var second = await driver.SubscribeAsync(["T2"], pollEvery, TestContext.Current.CancellationToken);

        first.DiagnosticId.ShouldStartWith("s7-sub-");
        second.DiagnosticId.ShouldStartWith("s7-sub-");
        first.DiagnosticId.ShouldNotBe(second.DiagnosticId);

        await driver.UnsubscribeAsync(first, TestContext.Current.CancellationToken);
        await driver.UnsubscribeAsync(second, TestContext.Current.CancellationToken);

        // A handle the driver no longer knows about must be ignored rather than throw.
        await driver.UnsubscribeAsync(first, TestContext.Current.CancellationToken);
    }

    /// <summary>A 50 ms publishing interval is accepted rather than rejected.</summary>
    [Fact]
    public async Task Subscribe_publishing_interval_is_floored_at_100ms()
    {
        var options = new S7DriverOptions
        {
            Host = "192.0.2.1",
            Probe = new S7ProbeOptions { Enabled = false },
        };
        using var driver = new S7Driver(options, "s7-floor");

        // 50 ms is requested. The floor exists to shield the S7 CPU from sub-scan polling
        // that would only queue up on the wire. The floor is applied internally and its
        // value is not exposed, so all this test really proves is that a small interval
        // is not rejected.
        var handle = await driver.SubscribeAsync(["T"], TimeSpan.FromMilliseconds(50), TestContext.Current.CancellationToken);

        handle.ShouldNotBeNull();
        await driver.UnsubscribeAsync(handle, TestContext.Current.CancellationToken);
    }
}
|
||||
@@ -0,0 +1,54 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7.Tests;
|
||||
|
||||
/// <summary>
/// Unit tests for the <c>IReadable</c> / <c>IWritable</c> surface of <see cref="S7Driver"/>
/// that run without a live PLC. They target error paths (not-initialized, unknown tag,
/// read-only write rejection, unsupported data types). Wire-level round-trips against a
/// real S7 or a mock server are left to a later PR, because S7.Net ships no in-process
/// fake and a usable mock is non-trivial.
/// </summary>
[Trait("Category", "Unit")]
public sealed class S7DriverReadWriteTests
{
    /// <summary>Initialization with a malformed tag address must throw rather than succeed.</summary>
    [Fact]
    public async Task Initialize_rejects_invalid_tag_address_and_fails_fast()
    {
        // A bad address has to surface at init time. Deferring the parse to the first read
        // would turn the config bug into BadInternalError on every later Read, which an
        // operator cannot diagnose from the OPC UA client side.
        var options = new S7DriverOptions
        {
            Host = "192.0.2.1", // reserved — will never complete TCP handshake
            Timeout = TimeSpan.FromMilliseconds(250),
            Tags = [new S7TagDefinition("BadTag", "NOT-AN-S7-ADDRESS", S7DataType.Int16)],
        };
        using var driver = new S7Driver(options, "s7-bad-tag");

        Func<Task> initialize = async () =>
            await driver.InitializeAsync("{}", TestContext.Current.CancellationToken);

        // Whichever fails first is fine: the TCP connect (Exception) or the address parser
        // (FormatException). Both are init-time failures; the point is that no "healthy"
        // driver is handed back with a latent bad tag.
        await Should.ThrowAsync<Exception>(initialize);
    }

    /// <summary>Reading through a driver that was never initialized must throw.</summary>
    [Fact]
    public async Task ReadAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new S7Driver(new S7DriverOptions { Host = "192.0.2.1" }, "s7-uninit");

        Func<Task> read = async () =>
            await driver.ReadAsync(["Any"], TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(read);
    }

    /// <summary>Writing through a driver that was never initialized must throw.</summary>
    [Fact]
    public async Task WriteAsync_without_initialize_throws_InvalidOperationException()
    {
        using var driver = new S7Driver(new S7DriverOptions { Host = "192.0.2.1" }, "s7-uninit");

        Func<Task> write = async () =>
            await driver.WriteAsync(
                [new(FullReference: "Any", Value: (short)0)],
                TestContext.Current.CancellationToken);

        await Should.ThrowAsync<InvalidOperationException>(write);
    }
}
|
||||
@@ -0,0 +1,66 @@
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.S7.Tests;
|
||||
|
||||
/// <summary>
/// Scaffold-level tests that run without a live S7 PLC: driver lifecycle shape, default
/// option values, and failure-mode transitions. PR 64 adds IReadable/IWritable tests
/// against a mock server; PR 65 adds discovery and subscribe.
/// </summary>
[Trait("Category", "Unit")]
public sealed class S7DriverScaffoldTests
{
    /// <summary>Default options use port 102, CPU type S7-1500, rack 0 and slot 0.</summary>
    [Fact]
    public void Default_options_target_S7_1500_slot_0_on_port_102()
    {
        var defaults = new S7DriverOptions();

        defaults.Port.ShouldBe(102, "ISO-on-TCP is always 102 for S7; documented in driver-specs.md §5");
        defaults.CpuType.ShouldBe(global::S7.Net.CpuType.S71500);
        defaults.Rack.ShouldBe((short)0);
        defaults.Slot.ShouldBe((short)0, "S7-1200/1500 onboard PN ports are slot 0 by convention");
    }

    /// <summary>The probe interval defaults to five seconds.</summary>
    [Fact]
    public void Default_probe_interval_is_reasonable_for_S7_scan_cycle()
    {
        var probeDefaults = new S7ProbeOptions();

        probeDefaults.Interval.ShouldBe(TimeSpan.FromSeconds(5));
    }

    /// <summary>A tag built with only name, address and type is writable with string length 254.</summary>
    [Fact]
    public void Tag_definition_defaults_to_writable_with_S7_max_string_length()
    {
        var definition = new S7TagDefinition("T", "DB1.DBW0", S7DataType.Int16);

        definition.Writable.ShouldBeTrue();
        definition.StringLength.ShouldBe(254, "S7 STRING type max length is 254 chars");
    }

    /// <summary>Type, instance id and an Unknown health state are available before any connect.</summary>
    [Fact]
    public void Driver_instance_reports_type_and_id_before_connect()
    {
        var options = new S7DriverOptions { Host = "127.0.0.1" };
        using var driver = new S7Driver(options, "s7-test");

        driver.DriverType.ShouldBe("S7");
        driver.DriverInstanceId.ShouldBe("s7-test");
        driver.GetHealth().State.ShouldBe(DriverState.Unknown, "health starts Unknown until InitializeAsync runs");
    }

    /// <summary>A failed initialization throws and leaves the driver Faulted with an error recorded.</summary>
    [Fact]
    public async Task Initialize_against_unreachable_host_transitions_to_Faulted_and_throws()
    {
        // 192.0.2.1 lies in the RFC 5737 documentation range, so there is no DNS lookup and
        // no risk of traffic reaching a real PLC. The short 250 ms Timeout is presumably
        // what keeps the failed connect attempt brief.
        var options = new S7DriverOptions
        {
            Host = "192.0.2.1",
            Timeout = TimeSpan.FromMilliseconds(250),
        };
        using var driver = new S7Driver(options, "s7-unreach");

        Func<Task> initialize = async () =>
            await driver.InitializeAsync("{}", TestContext.Current.CancellationToken);

        await Should.ThrowAsync<Exception>(initialize);

        var afterFailure = driver.GetHealth();
        afterFailure.State.ShouldBe(DriverState.Faulted, "unreachable host must flip the driver to Faulted so operators see it");
        afterFailure.LastError.ShouldNotBeNull();
    }
}
|
||||
@@ -0,0 +1,31 @@
|
||||
<!-- Test project for the S7 driver; it references ZB.MOM.WW.OtOpcUa.Driver.S7 directly. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.S7.Tests</RootNamespace>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <!-- Marked as a test project and excluded from packing. -->
    <IsTestProject>true</IsTestProject>
    <IsPackable>false</IsPackable>
  </PropertyGroup>

  <!-- Test framework (xUnit v3), assertion library (Shouldly), test SDK, and the Visual Studio runner. -->
  <ItemGroup>
    <PackageReference Include="xunit.v3" Version="1.1.0" />
    <PackageReference Include="Shouldly" Version="4.3.0" />
    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
    <PackageReference Include="xunit.runner.visualstudio" Version="3.0.2"
                      PrivateAssets="all"
                      IncludeAssets="runtime; build; native; contentfiles; analyzers; buildtransitive" />
  </ItemGroup>

  <!-- System under test. -->
  <ItemGroup>
    <ProjectReference Include="..\..\src\ZB.MOM.WW.OtOpcUa.Driver.S7\ZB.MOM.WW.OtOpcUa.Driver.S7.csproj" />
  </ItemGroup>

  <!--
    Silences NuGet audit warnings for the two advisories listed below.
    NOTE(review): the reason for suppressing them is not recorded in this file; confirm they are still needed.
  -->
  <ItemGroup>
    <NuGetAuditSuppress Include="https://github.com/advisories/GHSA-37gx-xxp4-5rgx" />
    <NuGetAuditSuppress Include="https://github.com/advisories/GHSA-w3x6-4m5h-cxqf" />
  </ItemGroup>

</Project>
|
||||
177
tests/ZB.MOM.WW.OtOpcUa.Server.Tests/HealthEndpointsHostTests.cs
Normal file
177
tests/ZB.MOM.WW.OtOpcUa.Server.Tests/HealthEndpointsHostTests.cs
Normal file
@@ -0,0 +1,177 @@
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
|
||||
|
||||
/// <summary>
/// Integration tests that start a real <see cref="HealthEndpointsHost"/> on a localhost HTTP prefix and
/// exercise the <c>/healthz</c> and <c>/readyz</c> endpoints through an <see cref="HttpClient"/>.
/// </summary>
[Trait("Category", "Integration")]
public sealed class HealthEndpointsHostTests : IAsyncLifetime
{
    // A randomised base plus an atomic increment hands every test-class instance its own port.
    private static int _portCounter = 48500 + Random.Shared.Next(0, 99);

    private readonly int _port = Interlocked.Increment(ref _portCounter);
    private readonly DriverHost _driverHost = new();

    // Stays null until a test calls Start(); DisposeAsync tolerates that.
    private HealthEndpointsHost? _host;
    private HttpClient _client = null!;

    private string Prefix => $"http://localhost:{_port}/";

    /// <summary>Creates the HTTP client bound to this instance's prefix. The host is started per test.</summary>
    public ValueTask InitializeAsync()
    {
        _client = new HttpClient { BaseAddress = new Uri(Prefix) };
        return ValueTask.CompletedTask;
    }

    /// <summary>Disposes the HTTP client and, when a test started one, the endpoint host.</summary>
    public async ValueTask DisposeAsync()
    {
        _client.Dispose();

        if (_host is not null)
        {
            await _host.DisposeAsync();
        }
    }

    /// <summary>
    /// Builds and starts the host under test on <see cref="Prefix"/>.
    /// </summary>
    /// <param name="configDbHealthy">Optional probe passed through to the host; <c>null</c> leaves the host's default.</param>
    /// <param name="usingStaleConfig">Optional probe passed through to the host; <c>null</c> leaves the host's default.</param>
    /// <returns>The started host (also remembered for disposal).</returns>
    private HealthEndpointsHost Start(Func<bool>? configDbHealthy = null, Func<bool>? usingStaleConfig = null)
    {
        var host = new HealthEndpointsHost(
            _driverHost,
            NullLogger<HealthEndpointsHost>.Instance,
            configDbHealthy,
            usingStaleConfig,
            prefix: Prefix);

        // Remember the host before starting it so DisposeAsync still cleans up if Start() throws.
        _host = host;
        host.Start();
        return host;
    }

    /// <summary>Issues a GET against the host, honouring the current test's cancellation token.</summary>
    private Task<HttpResponseMessage> GetAsync(string path) =>
        _client.GetAsync(path, TestContext.Current.CancellationToken);

    /// <summary>
    /// Parses the response body as JSON and returns its root element.
    /// The element is cloned so it remains valid after the backing <see cref="JsonDocument"/> is disposed.
    /// </summary>
    private static async Task<JsonElement> ReadJsonAsync(HttpResponseMessage response)
    {
        var json = await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken);
        using var document = JsonDocument.Parse(json);
        return document.RootElement.Clone();
    }

    /// <summary>Registers a stub driver that reports the given fixed state.</summary>
    private Task RegisterStubAsync(string id, DriverState state) =>
        _driverHost.RegisterAsync(new StubDriver(id, state), "{}", TestContext.Current.CancellationToken);

    [Fact]
    public async Task Healthz_ReturnsHealthy_EmptyFleet()
    {
        Start();

        using var response = await GetAsync("/healthz");

        response.IsSuccessStatusCode.ShouldBeTrue();
        var body = await ReadJsonAsync(response);
        body.GetProperty("status").GetString().ShouldBe("healthy");
        body.GetProperty("configDbReachable").GetBoolean().ShouldBeTrue();
        body.GetProperty("usingStaleConfig").GetBoolean().ShouldBeFalse();
    }

    [Fact]
    public async Task Healthz_StaleConfig_Returns200_WithFlag()
    {
        Start(configDbHealthy: () => false, usingStaleConfig: () => true);

        using var response = await GetAsync("/healthz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.OK);
        var body = await ReadJsonAsync(response);
        body.GetProperty("configDbReachable").GetBoolean().ShouldBeFalse();
        body.GetProperty("usingStaleConfig").GetBoolean().ShouldBeTrue();
    }

    [Fact]
    public async Task Healthz_UnreachableConfig_And_NoCache_Returns503()
    {
        Start(configDbHealthy: () => false, usingStaleConfig: () => false);

        using var response = await GetAsync("/healthz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.ServiceUnavailable);
    }

    [Fact]
    public async Task Readyz_EmptyFleet_Is200_Healthy()
    {
        Start();

        using var response = await GetAsync("/readyz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.OK);
        var body = await ReadJsonAsync(response);
        body.GetProperty("verdict").GetString().ShouldBe("Healthy");
    }

    [Fact]
    public async Task Readyz_WithHealthyDriver_Is200()
    {
        await RegisterStubAsync("drv-1", DriverState.Healthy);
        Start();

        using var response = await GetAsync("/readyz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.OK);
        var body = await ReadJsonAsync(response);
        body.GetProperty("verdict").GetString().ShouldBe("Healthy");
        body.GetProperty("drivers").GetArrayLength().ShouldBe(1);
    }

    [Fact]
    public async Task Readyz_WithFaultedDriver_Is503()
    {
        await RegisterStubAsync("dead", DriverState.Faulted);
        await RegisterStubAsync("alive", DriverState.Healthy);
        Start();

        using var response = await GetAsync("/readyz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.ServiceUnavailable);
        var body = await ReadJsonAsync(response);
        body.GetProperty("verdict").GetString().ShouldBe("Faulted");
    }

    [Fact]
    public async Task Readyz_WithDegradedDriver_Is200_WithDegradedList()
    {
        await RegisterStubAsync("drv-ok", DriverState.Healthy);
        await RegisterStubAsync("drv-deg", DriverState.Degraded);
        Start();

        using var response = await GetAsync("/readyz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.OK);
        var body = await ReadJsonAsync(response);
        body.GetProperty("verdict").GetString().ShouldBe("Degraded");
        body.GetProperty("degradedDrivers").GetArrayLength().ShouldBe(1);
        body.GetProperty("degradedDrivers")[0].GetString().ShouldBe("drv-deg");
    }

    [Fact]
    public async Task Readyz_WithInitializingDriver_Is503()
    {
        await RegisterStubAsync("init", DriverState.Initializing);
        Start();

        using var response = await GetAsync("/readyz");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.ServiceUnavailable);
    }

    [Fact]
    public async Task Unknown_Path_Returns404()
    {
        Start();

        using var response = await GetAsync("/foo");

        response.StatusCode.ShouldBe(System.Net.HttpStatusCode.NotFound);
    }

    /// <summary>
    /// Minimal <see cref="IDriver"/> whose lifecycle methods complete immediately and whose health
    /// always reports the state supplied at construction.
    /// </summary>
    private sealed class StubDriver : IDriver
    {
        private readonly DriverState _state;

        public StubDriver(string id, DriverState state)
        {
            DriverInstanceId = id;
            _state = state;
        }

        public string DriverInstanceId { get; }

        public string DriverType => "Stub";

        public Task InitializeAsync(string _, CancellationToken ct) => Task.CompletedTask;

        public Task ReinitializeAsync(string _, CancellationToken ct) => Task.CompletedTask;

        public Task ShutdownAsync(CancellationToken ct) => Task.CompletedTask;

        public DriverHealth GetHealth() => new(_state, null, null);

        public long GetMemoryFootprint() => 0;

        public Task FlushOptionalCachesAsync(CancellationToken ct) => Task.CompletedTask;
    }
}
|
||||
@@ -46,7 +46,7 @@ public sealed class HistoryReadIntegrationTests : IAsyncLifetime
|
||||
ApplicationName = "OtOpcUaHistoryTest",
|
||||
ApplicationUri = "urn:OtOpcUa:Server:HistoryTest",
|
||||
PkiStoreRoot = _pkiRoot,
|
||||
AutoAcceptUntrustedClientCertificates = true,
|
||||
AutoAcceptUntrustedClientCertificates = true, HealthEndpointsEnabled = false,
|
||||
};
|
||||
|
||||
_server = new OpcUaApplicationHost(options, _driverHost, new DenyAllUserAuthenticator(),
|
||||
|
||||
@@ -49,7 +49,7 @@ public sealed class MultipleDriverInstancesIntegrationTests : IAsyncLifetime
|
||||
ApplicationName = "OtOpcUaMultiDriverTest",
|
||||
ApplicationUri = "urn:OtOpcUa:Server:MultiDriverTest",
|
||||
PkiStoreRoot = _pkiRoot,
|
||||
AutoAcceptUntrustedClientCertificates = true,
|
||||
AutoAcceptUntrustedClientCertificates = true, HealthEndpointsEnabled = false,
|
||||
};
|
||||
|
||||
_server = new OpcUaApplicationHost(options, _driverHost, new DenyAllUserAuthenticator(),
|
||||
|
||||
@@ -36,7 +36,7 @@ public sealed class OpcUaServerIntegrationTests : IAsyncLifetime
|
||||
ApplicationName = "OtOpcUaTest",
|
||||
ApplicationUri = "urn:OtOpcUa:Server:Test",
|
||||
PkiStoreRoot = _pkiRoot,
|
||||
AutoAcceptUntrustedClientCertificates = true,
|
||||
AutoAcceptUntrustedClientCertificates = true, HealthEndpointsEnabled = false,
|
||||
};
|
||||
|
||||
_server = new OpcUaApplicationHost(options, _driverHost, new DenyAllUserAuthenticator(),
|
||||
|
||||
Reference in New Issue
Block a user